Kernels:

kernels-community
/

deep-gemm

Trusted publisher

Kernel card Files Files and versions

xet

Community

kernels-bot committed 5 days ago

Commit

92f2707

verified ·

1 Parent(s): 96ee59b

Uploaded using `kernel-builder` (batch 11/32).

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h +0 -2119
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h +0 -55
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h +0 -283
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/kernel_backward.h +0 -0
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/kernel_forward.h +0 -1322
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/piped_subprocess.py +0 -144
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h +0 -90
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h +0 -154
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h +0 -113
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h +0 -213
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h +0 -311
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h +0 -189
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h +0 -427
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py +0 -129
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py +0 -131
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py +0 -120
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py +0 -469
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py +0 -249
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py +0 -476
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py +0 -232
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py +0 -1013
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py +0 -456
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py +0 -92
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py +0 -135
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py +0 -67
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h +0 -292
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/utils.h +0 -94
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/device/dual_gemm.h +0 -499
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/dual_gemm_common.h +0 -52
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/dual_gemm_run.h +0 -938
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/kernel/dual_gemm.h +0 -545
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/test_run.h +0 -95
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/thread/left_silu_and_mul.h +0 -150
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/threadblock/dual_epilogue.h +0 -424
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/threadblock/dual_mma_base.h +0 -232
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/threadblock/dual_mma_multistage.h +0 -775
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/51_hopper_gett/gett_kernel.cuh +0 -139
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp +0 -421
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh +0 -136
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp +0 -222
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/53_hopper_gemm_permute/permute_kernel.cuh +0 -92
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/53_hopper_gemm_permute/permute_traits.hpp +0 -274
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp +0 -129
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp +0 -246
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h +0 -320
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp +0 -242
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp +0 -61
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp +0 -871
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp +0 -117
build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp +0 -561

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h DELETED Viewed

@@ -1,2119 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Templates implementing loading of tiles from pitch-linear rank=2
-   tensors.
-    This iterator uses masks to guard out-of-bounds accesses. The first tile
-   this iterator visits may be partial, then the remaining tiles are complete.
-   So, we only need to compute the predicates twice, once before the first tile
-   and once for the remaining full tiles which can share the same predicates.
-    A precomputed "Params" object minimizes the amount of state that must be
-   stored in registers, and integer addition is used to advance the pointer
-   through memory.
-*/
-#pragma once
-#include "cutlass/arch/memory.h"
-#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace transform {
-namespace threadblock {
-////////////////////////////////////////////////////////////////////////////////
-/// PredicatedTileIteratorResidualLast
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-/// Regular tile iterator using a precomputed control structure to minimize
-/// register liveness and integer arithmetic.
-///
-/// Layout is assumed to be invariant at the time the precomputed "Params"
-/// object is constructed.
-///
-/// Base pointer and tensor extents may be specified at the time the iterator is
-/// constructed. Subsequently, they are assumed to be immutable.
-///
-/// Adding a logical coordinate offset may be performed at the time the iterator
-/// is constructed. Subsequent additions to logical coordinate offset may be
-/// performed but are relatively expensive.
-///
-/// Visitation order is intended to first visit a "residual" tile that may be
-/// partially full in both the advance dimension and the steady-state dimension.
-/// This is assumed to be the last tile in the iteration sequence. Advancing an
-/// iterator that has just been constructed moves to the first tile that is full
-/// in the advance dimension and recomputes predicates. Subsequent accesses may
-/// be performed without updating internal predicates and are efficient in terms
-/// of live register state and pointer arithmetic instructions.
-///
-/// To be efficient, this assumes the iterator will be dereferenced and advanced
-/// at least once outside any looping structure to minimize integer arithmetic.
-///
-/// Accesses out of bounds are safe so long as `clear_mask()` is called prior to
-/// dereferencing the iterator.
-///
-///
-/// Example:
-///
-/// An efficient pipeline structure may be constructed as follows:
-///
-// template <typename Iterator>
-// __global__ void kernel(
-//   typename Iterator::Params params,
-//   typename Iterator::Element *ptr,
-//   TensorCoord extent) {
-//
-//   typename Iterator::Fragment fragment;
-//
-//   TensorCoord threadblock_offset(0, 0);
-//
-//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offset);
-//
-//
-//   fragment = *iter;        // load "residue" tile first
-//   ++iter;                  // advance to first "steady state" tile and update
-//                            // internal masks
-//
-//
-//   #pragma unroll
-//   for (int i = Remaining - 1; i >= 0; --i) {
-//
-//     f(fragment);
-//
-//     if (!i) {
-//       iter.clear_mask();   // light-weight operation to clear masks -
-//                            // subsequent loads become NO-OPs.
-//     }
-//
-//     fragment = *iter;      // load tile during "steady state" phase
-//     ++iter;                // advance to next tile - lightweight due to
-//                            // steady-state masks
-//   }
-// }
-//
-// void host(TensorView<Element, 2, layout::PitchLinear> view) {
-//
-//   using Iterator =
-//   transform::threadblock::PredicatedTileIteratorResidualLast;
-//
-//   typename Iterator::Params params(view.layout());
-//
-//   kernel<Iterator>(params, view.data());
-// }
-///
-///
-template <
-    typename Shape, ///< Tile shape type, passed through to the specializations
-    typename Element, ///< Element data type
-    typename Layout, ///< Layout tag; selects which partial specialization is used
-    int AdvanceRank, ///< Dimension advanced by operator++; the specializations static_assert it is 0 or 1
-    typename ThreadMap, ///< Thread map type; also supplies the default AccessSize
-    int AccessSize = ThreadMap::kElementsPerAccess, ///< Number of Elements per memory access
-    bool Gather = false> ///< Forwarded to the underlying access iterator by the specializations
-class PredicatedTileIteratorResidualLast;
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-    typename Shape_,
-    typename Element_,
-    int AdvanceRank,
-    typename ThreadMap_,
-    int AccessSize,
-    bool Gather>
-class PredicatedTileIteratorResidualLast<
-    Shape_,
-    Element_,
-    layout::PitchLinear,
-    AdvanceRank,
-    ThreadMap_,
-    AccessSize,
-    Gather> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::PitchLinear;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Pointer = Element*;
-  using NonConstPointer = typename platform::remove_const<Element>::type*;
-  /// Type used for internal memory accesses: AccessSize Elements per access; the third argument is the access size in bytes
-  using AccessType = AlignedArray<
-      Element,
-      AccessSize,
-      (AccessSize * sizeof_bits<Element>::value / 8)>;
-  /// Underlying iterator that supplies the address (get()) and validity (valid()) of each access; every method below forwards to it
-  using TileAccessIterator = PredicatedTileAccessIteratorResidualLast<
-      Shape,
-      Element,
-      Layout,
-      kAdvanceRank,
-      ThreadMap,
-      AccessType,
-      Gather>;
-  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
-  /// Fragment object to be loaded or stored (Iterations::kCount * kElementsPerAccess Elements)
-  using Fragment = cutlass::Array<
-      Element,
-      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename TileAccessIterator::Mask;
-  /// Parameters object is precomputed state and is host-constructible; it only wraps the access iterator's Params
-  class Params {
-   public:
-    using Base = typename TileAccessIterator::Params::Base;
-    friend PredicatedTileIteratorResidualLast;
-   private:
-    /// Parameters object of the underlying access iterator
-    typename TileAccessIterator::Params params_;
-   public:
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const& layout) : params_(layout) {}
-    CUTLASS_HOST_DEVICE
-    Params() {}
-    CUTLASS_HOST_DEVICE
-    Params(Base const& base) : params_(base) {}
-  };
- private:
-  /// Internal pointer type permits fast address arithmetic
-  using BytePointer = char*; // NOTE(review): not referenced anywhere else in this specialization
- private:
-  //
-  // Data members
-  //
-  /// The tile access iterator; the only data member of this class
-  TileAccessIterator address_iterator_;
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID; all arguments are handed unchanged to the access iterator
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      /// Precomputed parameters object
-      Params const& params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const& threadblock_offset,
-      /// Gather indices
-      int const* indices = nullptr)
-      : address_iterator_(
-            params.params_,
-            pointer,
-            extent,
-            thread_id,
-            threadblock_offset,
-            indices) {}
-  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
-  /// offset (delegates to the constructor above with make_Coord(0, 0))
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorResidualLast(
-            params,
-            pointer,
-            extent,
-            thread_id,
-            make_Coord(0, 0)) {}
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-  /// Advances to the next tile in memory.
-  ///
-  /// Moves the access iterator by one tile: {0, 1} (strided) when kAdvanceRank
-  /// is nonzero, otherwise {1, 0} (contiguous). Any predicate or residual-tile
-  /// handling happens inside the access iterator's add_tile_offset() -
-  /// NOTE(review): that code is not part of this class; confirm there.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast& operator++() {
-    if (kAdvanceRank)
-      address_iterator_.add_tile_offset({0, 1});
-    else
-      address_iterator_.add_tile_offset({1, 0});
-    return *this;
-  }
-  /// Advances to the next tile in memory (post-increment form).
-  ///
-  /// Copies the iterator, advances *this with the prefix operator++, and
-  /// returns the copy taken before the advance. Prefer the prefix form when
-  /// the previous value is not needed, since this form copies the whole
-  /// iterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast operator++(int) {
-    PredicatedTileIteratorResidualLast self(*this);
-    operator++();
-    return self;
-  }
-  /// Clears the predicate set efficiently; `enable` is forwarded to the access iterator
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    address_iterator_.clear_mask(enable);
-  }
-  CUTLASS_HOST_DEVICE
-  void set_residual_tile(bool enable) { // forwards to the access iterator, which defines the residual-tile behavior
-    address_iterator_.set_residual_tile(enable);
-  }
-  /// Enables the predicate set by forwarding to the access iterator's enable_mask()
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    address_iterator_.enable_mask();
-  }
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const& mask) {
-    address_iterator_.set_mask(mask);
-  }
-  /// Gets the mask through the `mask` reference argument
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask& mask) {
-    address_iterator_.get_mask(mask);
-  }
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { // pointer_offset is in Elements; converted to bytes below
-    load_with_byte_offset(
-        frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { // loads the whole fragment, one AccessType at a time
-    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag); // view the fragment as an array of AccessType
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-          int idx = v + // linear access index; v varies fastest, then c, then s
-              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          address_iterator_.set_iteration_index(idx); // the load path positions the access iterator explicitly on every access
-          char const* byte_ptr =
-              reinterpret_cast<char const*>(address_iterator_.get()) +
-              byte_offset;
-          AccessType const* access_ptr =
-              reinterpret_cast<AccessType const*>(byte_ptr);
-          cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
-              frag_ptr[idx], access_ptr, address_iterator_.valid()); // valid() is passed as the load's guard predicate
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-  /// Loads a fragment from memory with a byte offset of 0
-  CUTLASS_DEVICE
-  void load(Fragment& frag) {
-    load_with_byte_offset(frag, 0);
-  }
-  /// Store a fragment to memory; pointer_offset is in Elements and is converted to bytes
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
-    store_with_byte_offset(
-        frag, pointer_offset * sizeof_bits<Element>::value / 8);
-  }
-  /// Store a fragment to memory, adding byte_offset to every access address
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
-    address_iterator_.set_iteration_index(0); // unlike the load path, the index is set once here and the iterator is stepped only by ++ below
-    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-          int idx = v + // index into the fragment only; same ordering as in the load path
-              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          char* byte_ptr =
-              reinterpret_cast<char*>(address_iterator_.get()) + byte_offset;
-          AccessType* access_ptr = reinterpret_cast<AccessType*>(byte_ptr);
-          if (address_iterator_.valid()) { // skip the write when the access iterator reports this access as invalid
-            *access_ptr = frag_ptr[idx];
-          }
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-  /// Store a fragment to memory with a byte offset of 0
-  CUTLASS_DEVICE
-  void store(Fragment const& frag) {
-    store_with_byte_offset(frag, 0);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization of PredicatedTileIteratorResidualLast for column-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-    typename Shape_,
-    typename Element_,
-    int AdvanceRank,
-    typename ThreadMap_,
-    int AccessSize,
-    bool Gather>
-class PredicatedTileIteratorResidualLast<
-    Shape_,
-    Element_,
-    layout::ColumnMajor,
-    AdvanceRank,
-    ThreadMap_,
-    AccessSize,
-    Gather> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension."); // NOTE(review): message says "pitch-linear" and "may along advance along" although Layout here is ColumnMajor
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Pointer = Element*;
-  using NonConstPointer = typename platform::remove_const<Element>::type*;
-  using UnderlyingIterator = PredicatedTileIteratorResidualLast< // pitch-linear iterator that every method of this class forwards to
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, // rows become the first (contiguous) extent, columns the second (strided)
-      Element,
-      layout::PitchLinear,
-      (kAdvanceRank == 0 ? 0 : 1), // advance rank is passed through unchanged
-      ThreadMap,
-      AccessSize,
-      Gather>;
-  using AccessType = typename UnderlyingIterator::AccessType;
-  /// Fragment object to be loaded or stored (Iterations::kCount * kElementsPerAccess Elements)
-  using Fragment = cutlass::Array<
-      Element,
-      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-  /// Parameters object is precomputed state and is host-constructible; it only wraps the underlying iterator's Params
-  class Params {
-   private:
-    friend PredicatedTileIteratorResidualLast;
-    /// Parameters object of the underlying pitch-linear iterator
-    typename UnderlyingIterator::Params params_;
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-    /// Construct the Params object given a column-major tensor's layout; its stride(0) is used to build the pitch-linear layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const& layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const& base)
-        : params_(base) {}
-  };
- private:
-  //
-  // Data members
-  //
-  /// Underlying pitch-linear tile iterator; the only data member of this class
-  UnderlyingIterator iterator_;
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID; (row, column) coordinates are repacked as PitchLinearCoord(row, column)
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id, ///< ID of each participating thread
-      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
-      int const* indices =
-          nullptr ///< gather/scatter indices, note no support for
-                  ///< gather/scatter at this specialization
-      )
-      : iterator_(
-            params.params_,
-            pointer,
-            layout::PitchLinearCoord(extent.row(), extent.column()),
-            thread_id,
-            layout::PitchLinearCoord(
-                threadblock_offset.row(),
-                threadblock_offset.column()),
-            indices) {}
-  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
-  /// offset (delegates to the constructor above with make_Coord(0, 0))
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorResidualLast(
-            params,
-            pointer,
-            extent,
-            thread_id,
-            make_Coord(0, 0)) {}
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-  /// Advances to the next tile in memory.
-  ///
-  /// Implemented as a prefix increment of the underlying pitch-linear
-  /// iterator, so the advance direction and any predicate or residual-tile
-  /// handling are whatever that iterator does for the advance rank chosen
-  /// in UnderlyingIterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast& operator++() {
-    ++iterator_;
-    return *this;
-  }
-  /// Advances to the next tile in memory (post-increment form).
-  ///
-  /// Copies the iterator, advances *this with the prefix operator++, and
-  /// returns the copy taken before the advance. Prefer the prefix form when
-  /// the previous value is not needed, since this form copies the whole
-  /// iterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast operator++(int) {
-    PredicatedTileIteratorResidualLast self(*this);
-    operator++();
-    return self;
-  }
-  /// Clears the predicate set efficiently; `enable` is forwarded to the underlying iterator
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-  CUTLASS_HOST_DEVICE
-  void set_residual_tile(bool enable) { // forwards to the underlying iterator, which defines the residual-tile behavior
-    iterator_.set_residual_tile(enable);
-  }
-  /// Enables the predicate set by forwarding to the underlying iterator's enable_mask()
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const& mask) {
-    iterator_.set_mask(mask);
-  }
-  /// Gets the mask through the `mask` reference argument
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask& mask) {
-    iterator_.get_mask(mask);
-  }
-  /// Loads a fragment from memory; pointer_offset is forwarded unchanged to the underlying iterator
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Loads a fragment from memory; byte_offset is forwarded unchanged to the underlying iterator
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-  /// Loads a fragment from memory with a pointer offset of 0
-  CUTLASS_DEVICE
-  void load(Fragment& frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-  /// Store a fragment to memory; pointer_offset is forwarded unchanged to the underlying iterator
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Store a fragment to memory; byte_offset is forwarded unchanged to the underlying iterator
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-  /// Store a fragment to memory with a pointer offset of 0
-  CUTLASS_DEVICE
-  void store(Fragment const& frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization of PredicatedTileIteratorResidualLast for row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-    typename Shape_,
-    typename Element_,
-    int AdvanceRank,
-    typename ThreadMap_,
-    int AccessSize,
-    bool Gather>
-class PredicatedTileIteratorResidualLast<
-    Shape_,
-    Element_,
-    layout::RowMajor,
-    AdvanceRank,
-    ThreadMap_,
-    AccessSize,
-    Gather> {
-  // Row-major adapter: every operation is forwarded to a pitch-linear
-  // iterator whose contiguous dimension is the matrix column and whose
-  // strided dimension is the matrix row.
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Pointer = Element*;
-  using NonConstPointer = typename platform::remove_const<Element>::type*;
-  /// Pitch-linear iterator that does the actual work. The shape is given as
-  /// (column, row) and the advance rank is flipped (0 <-> 1) to match.
-  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-      Element,
-      layout::PitchLinear,
-      (kAdvanceRank == 0 ? 1 : 0),
-      ThreadMap,
-      AccessSize,
-      Gather>;
-  using AccessType = typename UnderlyingIterator::AccessType;
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-      Element,
-      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-  /// Parameters object is precomputed state and is host-constructible.
-  /// It only wraps the underlying iterator's Params.
-  class Params {
-   private:
-    friend PredicatedTileIteratorResidualLast;
-    /// Parameters object of the underlying pitch-linear iterator
-    typename UnderlyingIterator::Params params_;
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-    /// Construct the Params object from a row-major layout; its stride(0) is
-    /// used to build the equivalent pitch-linear layout.
-    CUTLASS_HOST_DEVICE
-    Params(Layout const& layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-    /// Construct the Params object from the underlying iterator's Params
-    /// base object.
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const& base)
-        : params_(base) {}
-  };
- private:
-  //
-  // Data members
-  //
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID. Extent and offset are converted from (row, column) to
-  /// pitch-linear (contiguous = column, strided = row) coordinates.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id, ///< ID of each participating thread
-      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
-      int const* indices = nullptr ///< Gather indices
-      )
-      : iterator_(
-            params.params_,
-            pointer,
-            layout::PitchLinearCoord(extent.column(), extent.row()),
-            thread_id,
-            layout::PitchLinearCoord(
-                threadblock_offset.column(),
-                threadblock_offset.row()),
-            indices) {}
-  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
-  /// offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorResidualLast(
-            params,
-            pointer,
-            extent,
-            thread_id,
-            make_Coord(0, 0)) {}
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-  /// Advances to the next tile in memory by pre-incrementing the underlying
-  /// pitch-linear iterator; all predicate handling is done there.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast& operator++() {
-    ++iterator_;
-    return *this;
-  }
-  /// Post-increment: advances to the next tile and returns a copy of the
-  /// iterator as it was before the advance.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast operator++(int) {
-    PredicatedTileIteratorResidualLast self(*this);
-    operator++();
-    return self;
-  }
-  /// Clears the predicate set efficiently (forwarded to the underlying
-  /// iterator together with the `enable` flag)
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-  /// Forwards the residual-tile flag to the underlying iterator.
-  /// NOTE(review): presumably selects the predicates of the partial (last)
-  /// tile - confirm against the access iterator.
-  CUTLASS_HOST_DEVICE
-  void set_residual_tile(bool enable) {
-    iterator_.set_residual_tile(enable);
-  }
-  /// Forwards to the underlying iterator's enable_mask().
-  /// NOTE(review): presumably re-enables the predicate set (the counterpart
-  /// of clear_mask) - confirm against the access iterator.
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const& mask) {
-    iterator_.set_mask(mask);
-  }
-  /// Gets the mask (written into `mask`)
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask& mask) {
-    iterator_.get_mask(mask);
-  }
-  /// Loads a fragment from memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Loads a fragment from memory with an additional offset in bytes
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-  /// Loads a fragment from memory (zero pointer offset)
-  CUTLASS_DEVICE
-  void load(Fragment& frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-  /// Store a fragment to memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Store a fragment to memory with an additional offset in bytes
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-  /// Store a fragment to memory (zero pointer offset)
-  CUTLASS_DEVICE
-  void store(Fragment const& frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization of PredicatedTileIteratorResidualLast for affine rank-2 data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-    typename Shape_,
-    typename Element_,
-    int AdvanceRank,
-    typename ThreadMap_,
-    int AccessSize>
-class PredicatedTileIteratorResidualLast<
-    Shape_,
-    Element_,
-    layout::AffineRankN<2>,
-    AdvanceRank,
-    ThreadMap_,
-    AccessSize,
-    false> {
-  // Unlike the adapter specializations, this one performs the memory
-  // accesses itself, using a PredicatedTileAccessIteratorResidualLast to
-  // produce the address and validity of each access.
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRankN<2>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Pointer = Element*;
-  using NonConstPointer = typename platform::remove_const<Element>::type*;
-  /// Type used for internal memory accesses: AccessSize elements, aligned to
-  /// the byte size of the whole access
-  using AccessType = AlignedArray<
-      Element,
-      AccessSize,
-      (AccessSize * sizeof_bits<Element>::value / 8)>;
-  /// Underlying iterator to compute the addresses
-  using TileAccessIterator = PredicatedTileAccessIteratorResidualLast<
-      Shape,
-      Element,
-      Layout,
-      kAdvanceRank,
-      ThreadMap,
-      AccessType>;
-  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-      Element,
-      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename TileAccessIterator::Mask;
-  /// Parameters object is precomputed state and is host-constructible.
-  /// It only wraps the access iterator's Params.
-  class Params {
-   private:
-    friend PredicatedTileIteratorResidualLast;
-    /// Parameters object of the tile access iterator
-    typename TileAccessIterator::Params params_;
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-    /// Construct the Params object given an AffineRankN<2> tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const& layout) : params_(layout) {}
-  };
- private:
-  //
-  // Data members
-  //
-  /// Data member to the tile access iterator
-  TileAccessIterator address_iterator_;
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID. `indices` is accepted for signature compatibility with
-  /// the other specializations but is ignored here.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      /// Precomputed parameters object
-      Params const& params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const& threadblock_offset,
-      int const* indices =
-          nullptr ///< gather/scatter indices, note no support for
-                  ///< gather/scatter at this specialization
-      )
-      : address_iterator_(
-            params.params_,
-            pointer,
-            extent,
-            thread_id,
-            threadblock_offset) {}
-  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
-  /// offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorResidualLast(
-            params,
-            pointer,
-            extent,
-            thread_id,
-            make_Coord(0, 0)) {}
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    address_iterator_.add_pointer_offset(pointer_offset);
-  }
-  /// Advances to the next tile in memory: one tile along coordinate 1 when
-  /// kAdvanceRank is non-zero, otherwise one tile along coordinate 0.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast& operator++() {
-    if (kAdvanceRank)
-      address_iterator_.add_tile_offset(make_Coord(0, 1));
-    else
-      address_iterator_.add_tile_offset(make_Coord(1, 0));
-    return *this;
-  }
-  /// Post-increment: advances to the next tile and returns a copy of the
-  /// iterator as it was before the advance.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast operator++(int) {
-    PredicatedTileIteratorResidualLast self(*this);
-    operator++();
-    return self;
-  }
-  /// Clears the predicate set efficiently (forwarded to the access iterator
-  /// together with the `enable` flag)
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    address_iterator_.clear_mask(enable);
-  }
-  /// Forwards the residual-tile flag to the access iterator.
-  /// NOTE(review): presumably selects the predicates of the partial (last)
-  /// tile - confirm against the access iterator.
-  CUTLASS_HOST_DEVICE
-  void set_residual_tile(bool enable) {
-    address_iterator_.set_residual_tile(enable);
-  }
-  /// Forwards to the access iterator's enable_mask().
-  /// NOTE(review): presumably re-enables the predicate set (the counterpart
-  /// of clear_mask) - confirm against the access iterator.
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    address_iterator_.enable_mask();
-  }
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const& mask) {
-    address_iterator_.set_mask(mask);
-  }
-  /// Gets the mask (written into `mask`)
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask& mask) {
-    address_iterator_.get_mask(mask);
-  }
-  /// Loads a fragment from memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
-    // Widen to LongIndex before scaling to bytes so the multiplication is
-    // not carried out (and possibly overflowed) at Index width.
-    load_with_byte_offset(
-        frag,
-        static_cast<LongIndex>(pointer_offset) * sizeof_bits<Element>::value /
-            8);
-  }
-  /// Loads a fragment from memory with an additional offset in bytes.
-  /// Issues one predicated global load per (strided, contiguous, vector)
-  /// access of the thread map.
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
-    AccessType* frag_ptr = reinterpret_cast<AccessType*>(&frag);
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-          // Linear index of this access; also the slot in the fragment.
-          int idx = v +
-              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          address_iterator_.set_iteration_index(idx);
-          char const* byte_ptr =
-              reinterpret_cast<char const*>(address_iterator_.get()) +
-              byte_offset;
-          AccessType const* access_ptr =
-              reinterpret_cast<AccessType const*>(byte_ptr);
-          // The access iterator's valid() is passed as the load predicate.
-          cutlass::arch::global_load<AccessType, sizeof(AccessType)>(
-              frag_ptr[idx], access_ptr, address_iterator_.valid());
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-  /// Loads a fragment from memory
-  CUTLASS_DEVICE
-  void load(Fragment& frag) {
-    load_with_byte_offset(frag, 0);
-  }
-  /// Store a fragment to memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
-    // Widen to LongIndex before scaling to bytes so the multiplication is
-    // not carried out (and possibly overflowed) at Index width.
-    store_with_byte_offset(
-        frag,
-        static_cast<LongIndex>(pointer_offset) * sizeof_bits<Element>::value /
-            8);
-  }
-  /// Store a fragment to memory with an additional offset in bytes.
-  /// Accesses whose predicate is false are skipped.
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
-    // Start from the first access; the iterator is then stepped with ++
-    // once per access, in the same order in which idx is enumerated.
-    address_iterator_.set_iteration_index(0);
-    AccessType const* frag_ptr = reinterpret_cast<AccessType const*>(&frag);
-    CUTLASS_PRAGMA_UNROLL
-    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < kAccessesPerVector; ++v) {
-          int idx = v +
-              kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
-          char* byte_ptr =
-              reinterpret_cast<char*>(address_iterator_.get()) + byte_offset;
-          AccessType* access_ptr = reinterpret_cast<AccessType*>(byte_ptr);
-          if (address_iterator_.valid()) {
-            *access_ptr = frag_ptr[idx];
-          }
-          ++address_iterator_;
-        }
-      }
-    }
-  }
-  /// Store a fragment to memory
-  CUTLASS_DEVICE
-  void store(Fragment const& frag) {
-    store_with_byte_offset(frag, 0);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2
-/// column-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-    typename Shape_,
-    typename Element_,
-    int AdvanceRank,
-    typename ThreadMap_,
-    int AccessSize>
-class PredicatedTileIteratorResidualLast<
-    Shape_,
-    Element_,
-    layout::AffineRank2ColumnMajor,
-    AdvanceRank,
-    ThreadMap_,
-    AccessSize,
-    false> {
-  // Column-major adapter: every operation is forwarded to the
-  // AffineRankN<2> specialization, with (row, column) passed through in the
-  // same order and the advance rank unchanged.
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRank2ColumnMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Pointer = Element*;
-  using NonConstPointer = typename platform::remove_const<Element>::type*;
-  // Map to the underlying AffineRankN<2> layout
-  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
-      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
-      Element,
-      layout::AffineRankN<2>,
-      (kAdvanceRank == 0 ? 0 : 1),
-      ThreadMap,
-      AccessSize>;
-  using AccessType = typename UnderlyingIterator::AccessType;
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-      Element,
-      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-  /// Parameters object is precomputed state and is host-constructible.
-  /// It only wraps the underlying iterator's Params.
-  class Params {
-   private:
-    friend PredicatedTileIteratorResidualLast;
-    /// Parameters object of the underlying AffineRankN<2> iterator
-    typename UnderlyingIterator::Params params_;
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-    /// Construct the Params object given an AffineRankN<2> tensor's layout;
-    /// the two strides are passed through in their original order.
-    CUTLASS_HOST_DEVICE
-    Params(Layout const& layout)
-        : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {}
-  };
- private:
-  //
-  // Data members
-  //
-  /// Underlying AffineRankN<2> tile iterator
-  UnderlyingIterator iterator_;
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID. `indices` is accepted for signature compatibility but is
-  /// not forwarded to the underlying iterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id, ///< ID of each participating thread
-      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
-      int const* indices =
-          nullptr ///< gather/scatter indices, note no support for
-                  ///< gather/scatter at this specialization
-      )
-      : iterator_(
-            params.params_,
-            pointer,
-            layout::PitchLinearCoord(extent.row(), extent.column()),
-            thread_id,
-            layout::PitchLinearCoord(
-                threadblock_offset.row(),
-                threadblock_offset.column())) {}
-  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
-  /// offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorResidualLast(
-            params,
-            pointer,
-            extent,
-            thread_id,
-            make_Coord(0, 0)) {}
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-  /// Advances to the next tile in memory by pre-incrementing the underlying
-  /// AffineRankN<2> iterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast& operator++() {
-    ++iterator_;
-    return *this;
-  }
-  /// Post-increment: advances to the next tile and returns a copy of the
-  /// iterator as it was before the advance.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast operator++(int) {
-    PredicatedTileIteratorResidualLast self(*this);
-    operator++();
-    return self;
-  }
-  /// Clears the predicate set efficiently (forwarded to the underlying
-  /// iterator together with the `enable` flag)
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-  /// Forwards the residual-tile flag to the underlying iterator.
-  /// NOTE(review): presumably selects the predicates of the partial (last)
-  /// tile - confirm against the access iterator.
-  CUTLASS_HOST_DEVICE
-  void set_residual_tile(bool enable) {
-    iterator_.set_residual_tile(enable);
-  }
-  /// Forwards to the underlying iterator's enable_mask().
-  /// NOTE(review): presumably re-enables the predicate set (the counterpart
-  /// of clear_mask) - confirm against the access iterator.
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const& mask) {
-    iterator_.set_mask(mask);
-  }
-  /// Gets the mask (written into `mask`)
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask& mask) {
-    iterator_.get_mask(mask);
-  }
-  /// Loads a fragment from memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Loads a fragment from memory with an additional offset in bytes
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-  /// Loads a fragment from memory (zero pointer offset)
-  CUTLASS_DEVICE
-  void load(Fragment& frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-  /// Store a fragment to memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Store a fragment to memory with an additional offset in bytes
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-  /// Store a fragment to memory (zero pointer offset)
-  CUTLASS_DEVICE
-  void store(Fragment const& frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2
-/// row-major data.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-    typename Shape_,
-    typename Element_,
-    int AdvanceRank,
-    typename ThreadMap_,
-    int AccessSize>
-class PredicatedTileIteratorResidualLast<
-    Shape_,
-    Element_,
-    layout::AffineRank2RowMajor,
-    AdvanceRank,
-    ThreadMap_,
-    AccessSize,
-    false> {
-  // Row-major adapter: every operation is forwarded to the AffineRankN<2>
-  // specialization with row and column swapped (shape, extent, offset and
-  // strides) and the advance rank flipped.
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-  using Shape = Shape_;
-  using Element = Element_;
-  using Layout = layout::AffineRank2RowMajor;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Pointer = Element*;
-  using NonConstPointer = typename platform::remove_const<Element>::type*;
-  // Map to the underlying AffineRankN<2> layout
-  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
-      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
-      Element,
-      layout::AffineRankN<2>,
-      (kAdvanceRank == 0 ? 1 : 0),
-      ThreadMap,
-      AccessSize>;
-  using AccessType = typename UnderlyingIterator::AccessType;
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-      Element,
-      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-  /// Parameters object is precomputed state and is host-constructible.
-  /// It only wraps the underlying iterator's Params.
-  class Params {
-   private:
-    friend PredicatedTileIteratorResidualLast;
-    /// Parameters object of the underlying AffineRankN<2> iterator
-    typename UnderlyingIterator::Params params_;
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-    /// Construct the Params object given an AffineRankN<2> tensor's layout;
-    /// the two strides are swapped to match the swapped coordinates.
-    CUTLASS_HOST_DEVICE
-    Params(Layout const& layout)
-        : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {}
-  };
- private:
-  //
-  // Data members
-  //
-  /// Underlying AffineRankN<2> tile iterator
-  UnderlyingIterator iterator_;
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID. Extent and offset are passed on as (column, row).
-  /// `indices` is accepted for signature compatibility but is not forwarded
-  /// to the underlying iterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id, ///< ID of each participating thread
-      TensorCoord const& threadblock_offset, ///< Initial offset of threadblock
-      int const* indices =
-          nullptr ///< gather/scatter indices, note no support for
-                  ///< gather/scatter at this specialization
-      )
-      : iterator_(
-            params.params_,
-            pointer,
-            layout::PitchLinearCoord(extent.column(), extent.row()),
-            thread_id,
-            layout::PitchLinearCoord(
-                threadblock_offset.column(),
-                threadblock_offset.row())) {}
-  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
-  /// offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorResidualLast(
-            params,
-            pointer,
-            extent,
-            thread_id,
-            make_Coord(0, 0)) {}
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-  /// Advances to the next tile in memory by pre-incrementing the underlying
-  /// AffineRankN<2> iterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast& operator++() {
-    ++iterator_;
-    return *this;
-  }
-  /// Post-increment: advances to the next tile and returns a copy of the
-  /// iterator as it was before the advance.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast operator++(int) {
-    PredicatedTileIteratorResidualLast self(*this);
-    operator++();
-    return self;
-  }
-  /// Clears the predicate set efficiently (forwarded to the underlying
-  /// iterator together with the `enable` flag)
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-  /// Forwards the residual-tile flag to the underlying iterator.
-  /// NOTE(review): presumably selects the predicates of the partial (last)
-  /// tile - confirm against the access iterator.
-  CUTLASS_HOST_DEVICE
-  void set_residual_tile(bool enable) {
-    iterator_.set_residual_tile(enable);
-  }
-  /// Forwards to the underlying iterator's enable_mask().
-  /// NOTE(review): presumably re-enables the predicate set (the counterpart
-  /// of clear_mask) - confirm against the access iterator.
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const& mask) {
-    iterator_.set_mask(mask);
-  }
-  /// Gets the mask (written into `mask`)
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask& mask) {
-    iterator_.get_mask(mask);
-  }
-  /// Loads a fragment from memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Loads a fragment from memory with an additional offset in bytes
-  CUTLASS_DEVICE
-  void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) {
-    iterator_.load_with_byte_offset(frag, byte_offset);
-  }
-  /// Loads a fragment from memory (zero pointer offset)
-  CUTLASS_DEVICE
-  void load(Fragment& frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-  /// Store a fragment to memory with an additional offset in units of
-  /// Element
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Store a fragment to memory with an additional offset in bytes
-  CUTLASS_DEVICE
-  void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) {
-    iterator_.store_with_byte_offset(frag, byte_offset);
-  }
-  /// Store a fragment to memory (zero pointer offset)
-  CUTLASS_DEVICE
-  void store(Fragment const& frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization of PredicatedTileIteratorResidualLast for interleaved data.
-/// It is mapped to the congruous layout.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-    typename Shape_,
-    typename Element_,
-    int AdvanceRank,
-    typename ThreadMap_,
-    int AccessSize,
-    int InterleavedK>
-class PredicatedTileIteratorResidualLast<
-    Shape_,
-    Element_,
-    layout::ColumnMajorInterleaved<InterleavedK>,
-    AdvanceRank,
-    ThreadMap_,
-    AccessSize,
-    false> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may along advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::ColumnMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Pointer = Element*;
-  using NonConstPointer = typename platform::remove_const<Element>::type*;
-  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
-      layout::PitchLinearShape<
-          Shape::kRow * kInterleavedK,
-          Shape::kColumn / kInterleavedK>,
-      Element,
-      layout::PitchLinear,
-      (kAdvanceRank == 0 ? 0 : 1),
-      ThreadMap,
-      AccessSize>;
-  using AccessType = typename UnderlyingIterator::AccessType;
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-      Element,
-      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-  /// Parameters object is precomputed state and is host-constructible
-  class Params {
-   private:
-    friend PredicatedTileIteratorResidualLast;
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-   public:
-    CUTLASS_HOST_DEVICE
-    Params() {}
-    /// Construct the Params object given a pitch-linear tensor's layout
-    CUTLASS_HOST_DEVICE
-    Params(Layout const& layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const& base)
-        : params_(base) {}
-  };
- private:
-  //
-  // Data members
-  //
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      /// Precomputed parameters object
-      Params const& params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const& threadblock_offset,
-      int const* indices =
-          nullptr ///< gather/scatter indices, note no support for
-                  ///< gather/scatter at this specialization
-      )
-      : iterator_(
-            params.params_,
-            pointer,
-            layout::PitchLinearCoord(
-                extent.row() * kInterleavedK,
-                extent.column() / kInterleavedK),
-            thread_id,
-            layout::PitchLinearCoord(
-                threadblock_offset.row() * kInterleavedK,
-                threadblock_offset.column() / kInterleavedK)) {}
-  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
-  /// offset by delegating to the constructor that takes an explicit offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorResidualLast(
-            params,
-            pointer,
-            extent,
-            thread_id,
-            make_Coord(0, 0)) {} // (0, 0) is the zero threadblock offset
-  /// Adds a pointer offset in units of Element (forwarded to the wrapped iterator)
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-  /// Pre-increment: advances to the next tile in memory and returns *this.
-  ///
-  /// All work is done by pre-incrementing the wrapped iterator. The first-call
-  /// behaviour comes from that iterator: predicates are updated and its
-  /// internal pointer is reverted to the first "steady state" tile. Subsequent
-  /// calls are lightweight and must only update the internal pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast& operator++() {
-    ++iterator_;
-    return *this;
-  }
-  /// Post-increment: advances to the next tile in memory.
-  ///
-  /// Copies the iterator, advances *this via the pre-increment operator, and
-  /// returns the copy, so the returned iterator holds the state from before
-  /// the advance. Prefer pre-increment when the previous state is not needed,
-  /// since this form copies the whole iterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast operator++(int) {
-    PredicatedTileIteratorResidualLast self(*this);
-    operator++();
-    return self;
-  }
-  /// Clears the predicate set efficiently; `enable` (default true) is passed through unchanged
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable); // the wrapped iterator decides how `enable` is interpreted
-  }
-  CUTLASS_HOST_DEVICE // Passes the residual-tile flag through to the wrapped iterator.
-  void set_residual_tile(bool enable) {
-    iterator_.set_residual_tile(enable);
-  }
-  /// Calls enable_mask() on the wrapped iterator (presumably re-enabling its predicates; confirm)
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-  /// Sets the predicate mask, overriding the value stored in the wrapped iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const& mask) {
-    iterator_.set_mask(mask);
-  }
-  /// Gets the mask through the `mask` reference, which is handed to the wrapped iterator's get_mask()
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask& mask) {
-    iterator_.get_mask(mask);
-  }
-  /// Loads a fragment from memory; `frag` and `pointer_offset` are forwarded to the wrapped iterator
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Loads a fragment from memory with a pointer offset of zero
-  CUTLASS_DEVICE
-  void load(Fragment& frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-  /// Stores a fragment to memory; `frag` and `pointer_offset` are forwarded to the wrapped iterator
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Stores a fragment to memory with a pointer offset of zero
-  CUTLASS_DEVICE
-  void store(Fragment const& frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-/// Specialization of PredicatedTileIteratorResidualLast for interleaved-32
-/// data.  It is mapped to the congruous layout.
-///
-/// This specialization handles layout::RowMajorInterleaved<InterleavedK>. It
-/// owns a pitch-linear PredicatedTileIteratorResidualLast and forwards every
-/// operation to it after remapping (row, column) coordinates to pitch-linear
-/// coordinates: the first coordinate is `column * InterleavedK`, the second
-/// is `row / InterleavedK`.
-///
-/// Satisfies: ForwardTileIteratorConcept |
-///            ReadableContiguousTileIteratorConcept |
-///            WriteableContiguousTileIteratorConcept |
-///            MaskedTileIteratorConcept
-///
-template <
-    typename Shape_,
-    typename Element_,
-    int AdvanceRank,
-    typename ThreadMap_,
-    int AccessSize,
-    int InterleavedK>
-class PredicatedTileIteratorResidualLast<
-    Shape_,
-    Element_,
-    layout::RowMajorInterleaved<InterleavedK>,
-    AdvanceRank,
-    ThreadMap_,
-    AccessSize,
-    false> {
- public:
-  static_assert(
-      AdvanceRank == 0 || AdvanceRank == 1,
-      "Specialization for pitch-linear iterator may only advance along the "
-      "contiguous(rank=0) or strided(rank=1) dimension.");
-  using Shape = Shape_;
-  using Element = Element_;
-  static int const kInterleavedK = InterleavedK;
-  using Layout = layout::RowMajorInterleaved<kInterleavedK>;
-  static int const kAdvanceRank = AdvanceRank;
-  using ThreadMap = ThreadMap_;
-  using Index = typename Layout::Index;
-  using LongIndex = typename Layout::LongIndex;
-  using TensorRef = TensorRef<Element, Layout>;
-  using TensorView = TensorView<Element, Layout>;
-  using TensorCoord = typename Layout::TensorCoord;
-  using Pointer = Element*;
-  using NonConstPointer = typename platform::remove_const<Element>::type*;
-  /// Pitch-linear iterator that does the actual work. Its tile shape is the
-  /// remapped one (kColumn * kInterleavedK by kRow / kInterleavedK) and its
-  /// advance rank is the opposite of this iterator's advance rank.
-  using UnderlyingIterator = PredicatedTileIteratorResidualLast<
-      layout::PitchLinearShape<
-          Shape::kColumn * kInterleavedK,
-          Shape::kRow / kInterleavedK>,
-      Element,
-      layout::PitchLinear,
-      (kAdvanceRank == 0 ? 1 : 0),
-      ThreadMap,
-      AccessSize>;
-  using AccessType = typename UnderlyingIterator::AccessType;
-  /// Fragment object to be loaded or stored
-  using Fragment = cutlass::Array<
-      Element,
-      ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
-  /// Predicate vector stores mask to guard accesses
-  using Mask = typename UnderlyingIterator::Mask;
-  /// Parameters object is precomputed state and is host-constructible.
-  /// It only wraps the underlying iterator's Params.
-  class Params {
-   private:
-    friend PredicatedTileIteratorResidualLast;
-    /// Parameters object
-    typename UnderlyingIterator::Params params_;
-   public:
-    /// Default-constructs the wrapped parameters
-    CUTLASS_HOST_DEVICE
-    Params() {}
-    /// Construct the Params object given the interleaved tensor's layout; a
-    /// pitch-linear layout is built from `layout.stride(0)`
-    CUTLASS_HOST_DEVICE
-    Params(Layout const& layout)
-        : params_(layout::PitchLinear(layout.stride(0))) {}
-    /// Construct the Params object from the underlying iterator's Params base
-    CUTLASS_HOST_DEVICE
-    Params(typename UnderlyingIterator::Params::Base const& base)
-        : params_(base) {}
-  };
- private:
-  //
-  // Data members
-  //
-  /// Underlying pitch-linear tile iterator
-  UnderlyingIterator iterator_;
- public:
-  /// Constructs a TileIterator from its precomputed state, threadblock offset,
-  /// and thread ID. Extent and offset are remapped to pitch-linear
-  /// coordinates before being handed to the underlying iterator.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      /// Precomputed parameters object
-      Params const& params,
-      /// Pointer to start of tensor
-      Pointer pointer,
-      /// Extent of tensor
-      TensorCoord extent,
-      /// ID of each participating thread
-      int thread_id,
-      /// Initial offset of threadblock
-      TensorCoord const& threadblock_offset,
-      int const* indices =
-          nullptr ///< gather/scatter indices; accepted but never read here,
-                  ///< since this specialization has no gather/scatter support
-      )
-      : iterator_(
-            params.params_,
-            pointer,
-            // Integer division: `extent.row()` is truncated unless it is a
-            // multiple of kInterleavedK.
-            layout::PitchLinearCoord(
-                extent.column() * kInterleavedK,
-                extent.row() / kInterleavedK),
-            thread_id,
-            layout::PitchLinearCoord(
-                threadblock_offset.column() * kInterleavedK,
-                threadblock_offset.row() / kInterleavedK)) {}
-  /// Construct a PredicatedTileIteratorResidualLast with zero threadblock
-  /// offset by delegating to the constructor that takes an explicit offset
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast(
-      Params const& params, ///< Precomputed parameters object
-      Pointer pointer, ///< Pointer to start of tensor
-      TensorCoord extent, ///< Extent of tensor
-      int thread_id ///< ID of each participating thread
-      )
-      : PredicatedTileIteratorResidualLast(
-            params,
-            pointer,
-            extent,
-            thread_id,
-            make_Coord(0, 0)) {}
-  /// Adds a pointer offset in units of Element
-  CUTLASS_HOST_DEVICE
-  void add_pointer_offset(LongIndex pointer_offset) {
-    iterator_.add_pointer_offset(pointer_offset);
-  }
-  /// Pre-increment: advances to the next tile in memory and returns *this.
-  ///
-  /// The first time this method is called, predicates are updated, and the
-  /// iterator's internal pointer is reverted to the first "steady state" tile.
-  /// Subsequent calls are lightweight and must only update the internal
-  /// pointer.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast& operator++() {
-    ++iterator_;
-    return *this;
-  }
-  /// Post-increment: advances to the next tile in memory.
-  ///
-  /// Copies the iterator, advances *this via the pre-increment operator, and
-  /// returns the copy, so the returned iterator holds the state from before
-  /// the advance.
-  CUTLASS_HOST_DEVICE
-  PredicatedTileIteratorResidualLast operator++(int) {
-    PredicatedTileIteratorResidualLast self(*this);
-    operator++();
-    return self;
-  }
-  /// Clears the predicate set efficiently; `enable` is passed through to the
-  /// underlying iterator unchanged
-  CUTLASS_HOST_DEVICE
-  void clear_mask(bool enable = true) {
-    iterator_.clear_mask(enable);
-  }
-  /// Passes the residual-tile flag through to the underlying iterator
-  CUTLASS_HOST_DEVICE
-  void set_residual_tile(bool enable) {
-    iterator_.set_residual_tile(enable);
-  }
-  /// Calls enable_mask() on the underlying iterator
-  CUTLASS_HOST_DEVICE
-  void enable_mask() {
-    iterator_.enable_mask();
-  }
-  /// Sets the predicate mask, overriding value stored in predicate iterator
-  CUTLASS_HOST_DEVICE
-  void set_mask(Mask const& mask) {
-    iterator_.set_mask(mask);
-  }
-  /// Gets the mask through the `mask` reference
-  CUTLASS_HOST_DEVICE
-  void get_mask(Mask& mask) {
-    iterator_.get_mask(mask);
-  }
-  /// Loads a fragment from memory, forwarding `pointer_offset` to the
-  /// underlying iterator
-  CUTLASS_DEVICE
-  void load_with_pointer_offset(Fragment& frag, Index pointer_offset) {
-    iterator_.load_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Loads a fragment from memory with a pointer offset of zero
-  CUTLASS_DEVICE
-  void load(Fragment& frag) {
-    load_with_pointer_offset(frag, 0);
-  }
-  /// Store a fragment to memory, forwarding `pointer_offset` to the
-  /// underlying iterator
-  CUTLASS_DEVICE
-  void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) {
-    iterator_.store_with_pointer_offset(frag, pointer_offset);
-  }
-  /// Store a fragment to memory with a pointer offset of zero
-  CUTLASS_DEVICE
-  void store(Fragment const& frag) {
-    store_with_pointer_offset(frag, 0);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-} // namespace threadblock
-} // namespace transform
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h DELETED Viewed

@@ -1,55 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "warp_iterator_from_smem.h"
-/// Maps a warp iterator type to the iterator that reads the transposed
-/// operand. This primary template is selected for iterator types without a
-/// dedicated specialization and reports transposition as unsupported.
-template <typename WarpIterator>
-struct TransposeWarpIterator {
-  /// No transposed iterator exists for this iterator type.
-  static constexpr bool kSupportsTranspose = false;
-  /// Placeholder so that `Iterator` still names a type.
-  using Iterator = char;
-};
-/// Specialization for cutlass::gemm::warp::WarpIteratorFromSmem: the
-/// transposed iterator is the same iterator type with `kTranspose` negated.
-template <
-    cutlass::gemm::Operand Operand, ///< Operand identity
-    typename Element, ///< Data type of A elements
-    typename InstructionShape,
-    bool kTranspose>
-struct TransposeWarpIterator<cutlass::gemm::warp::WarpIteratorFromSmem<
-    Operand,
-    Element,
-    InstructionShape,
-    kTranspose>> {
-  /// Transposition is available for WarpIteratorFromSmem.
-  static constexpr bool kSupportsTranspose = true;
-  /// Same iterator with the transpose flag flipped.
-  using Iterator = cutlass::gemm::warp::
-      WarpIteratorFromSmem<Operand, Element, InstructionShape, !kTranspose>;
-};

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h DELETED Viewed

@@ -1,283 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Inspired by
-   "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h". Loads tiles of GEMM
-   operands from a RowMajor shared-memory layout into registers for use by A100
-   TensorCores.
-    The differences from "mma_tensor_op_tile_access_iterator.h" are:
-    (1) We use "ldmatrix" to load tiles, rather than manual loads (slightly
-   faster) (2) We support transposing the operand (e.g. reading `A.transpose()`
-   when the shared memory holds `A`)
-    This is only implemented for specific shapes.
-*/
-#pragma once
-#include <cutlass/gemm/gemm.h>
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace gemm {
-namespace warp {
-/// Warp-level iterator that loads a 32x32 tile of a GEMM operand from a
-/// RowMajor tensor into a register fragment with `ldsm`, optionally reading
-/// the operand transposed (`kTranspose`). The static_asserts below restrict
-/// it to operand A, 16-bit elements and 16x8 / 16x16 instruction shapes.
-template <
-    /// Operand identity
-    Operand Operand_,
-    /// Data type of A elements
-    typename Element_,
-    typename InstructionShape_,
-    bool kTranspose = false>
-class WarpIteratorFromSmem {
- public:
-  /// Shape of tile to load (concept: MatrixShape)
-  using Shape = cutlass::MatrixShape<32, 32>;
-  /// Operand tag
-  static Operand const kOperand = Operand_;
-  static_assert(
-      kOperand == Operand::kA,
-      "No support for OperandB at the moment");
-  /// Basic check (already implied by the operand-A assertion above)
-  static_assert(
-      kOperand == Operand::kA || kOperand == Operand::kB,
-      "WarpIteratorFromSmem may only be instantiated for A or B operands to warp-level Mma.");
-  /// Element type
-  using Element = Element_;
-  static_assert(sizeof_bits<Element>::value == 16, "Only supported for half");
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-  static_assert(InstructionShape::kRow == 16, "Only supports 16x8x8 / 16x8x16");
-  static_assert(
-      InstructionShape::kColumn == 8 || InstructionShape::kColumn == 16,
-      "Only supports 16x8x8 / 16x8x16");
-  /// Delta between *MMA operations (in units of *MMA operations, concept:
-  /// MatrixShape)
-  static int const kOpDelta = 1;
-  /// Number of participating threads
-  static int const kThreads = 32;
-  /// TensorRef type for loading element from a tensor
-  using TensorRef = TensorRef<Element, Layout>;
-  /// Index type
-  using Index = typename TensorRef::Index;
-  /// Long Index type
-  using LongIndex = typename TensorRef::LongIndex;
-  /// Coordinate for an element in the tensor
-  using TensorCoord = typename TensorRef::TensorCoord;
-  /// Number of elements accessed per Shared Memory load (elements per 32
-  /// bits; 1 for elements of 32 bits or more)
-  static int const kElementsPerAccess =
-      (sizeof_bits<Element>::value >= 32 ? 1
-                                         : 32 / sizeof_bits<Element>::value);
-  /// Number of instruction-shaped sub-tiles along each dimension of Shape
-  using InstructionCount = MatrixShape<
-      Shape::kRow / InstructionShape::kRow,
-      Shape::kColumn / InstructionShape::kColumn>;
-  /// Number of operator++ steps before the iterator moves to the next tile
-  static int const kIterations = (kOperand == Operand::kA)
-      ? InstructionCount::kColumn
-      : InstructionCount::kRow;
- public:
-  //
-  // Derived quantities
-  //
-  /// Fragment object holding a thread's part of a tile
-  using Fragment = Array<
-      Element,
-      (kOperand == Operand::kA)
-          ? (Shape::kRow* InstructionShape::kColumn / kThreads)
-          : (Shape::kColumn* InstructionShape::kRow / kThreads)>;
-  /// Memory access type
-  // using AccessType = AlignedArray<Element, kElementsPerAccess>;
-  using AccessType = Array<unsigned, 4>;
-  /// Extent of one instruction along the inner (reduction) dimension
-  static int constexpr kWarpShapeDivisibleInner =
-      (kOperand == Operand::kA ? InstructionShape::kColumn
-                               : InstructionShape::kRow);
-  /// 16x8x8: kAccessesInner = 1, 16x8x16: kAccessesInner = 2
-  static int constexpr kAccessesInner =
-      (kWarpShapeDivisibleInner / kElementsPerAccess) / 4;
-  // Number of 32bits tiles to load per `ldmatrix`
-  static int const kTilesPerInstruction = InstructionShape::kRow / 8;
-  static_assert(kTilesPerInstruction == 2, "Only supports 16x8x16 and 16x8x8");
- private:
-  /// Underlying tensor reference
-  TensorRef ref_;
-  /// Origin
-  MatrixCoord origin_;
-  /// Iterations in a tile
-  int iterations_;
- public:
-  /// Constructor from TensorRef; uses the full tile shape as the extent
-  CUTLASS_HOST_DEVICE
-  WarpIteratorFromSmem(TensorRef const& ref, int lane_id)
-      : WarpIteratorFromSmem(ref, {Shape::kRow, Shape::kColumn}, lane_id) {}
-  /// Constructor from TensorRef, extent and lane ID. Computes this lane's
-  /// starting coordinate and offsets `ref_` by it. `extent` is accepted but
-  /// never read by this constructor.
-  CUTLASS_HOST_DEVICE
-  WarpIteratorFromSmem(TensorRef const& ref, TensorCoord extent, int lane_id)
-      : ref_(ref), iterations_(0) {
-    // See also:
-    // https://docs.nvidia.com/cuda/archive/11.7.1/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-1688
-    // 16x8x8: kAccessesInner = 1 (1 ldmatrix.x4)
-    // 16x8x16: kAccessesInner = 2 (2 ldmatrix.x4)
-    // Each group of 8 consecutive lanes shares one `ldsm_vec_num`.
-    int ldsm_vec_num = (lane_id >> 3);
-    if (kOperand == Operand::kA) {
-      origin_ = MatrixCoord(lane_id % 8, 0);
-      static_assert(
-          InstructionCount::kRow * kTilesPerInstruction == 4,
-          "can't use ldmatrix.x4");
-      // Decompose the lane-group index into (8-row tile within an
-      // instruction, inner access, instruction row).
-      int access_m_idx = ldsm_vec_num % kTilesPerInstruction;
-      int inner_idx = (ldsm_vec_num / kTilesPerInstruction) % kAccessesInner;
-      int inst_m_idx = ldsm_vec_num / (kTilesPerInstruction * kAccessesInner);
-      MatrixCoord offset(
-          access_m_idx * 8 + inst_m_idx * InstructionShape::kRow,
-          inner_idx * 4 * kElementsPerAccess);
-      if (kTranspose) {
-        offset = MatrixCoord(offset.column(), offset.row());
-      }
-      origin_ += offset;
-    } else {
-      // Note: This is not tested or used
-      origin_ = MatrixCoord(0, lane_id % 8);
-      static_assert(InstructionCount::kColumn * kAccessesInner == 4, "");
-      CUTLASS_PRAGMA_UNROLL
-      for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn;
-           ++inst_n_idx) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) {
-          int access_idx = inner_idx + kAccessesInner * inst_n_idx;
-          MatrixCoord offset(
-              inner_idx * 4 * kElementsPerAccess, inst_n_idx * 8);
-          if (access_idx == ldsm_vec_num) {
-            if (kTranspose) {
-              offset = MatrixCoord(offset.column(), offset.row());
-            }
-            origin_ += offset;
-          }
-        }
-      }
-    }
-    ref_.add_coord_offset(origin_);
-  }
-  /// Advances an iterator along logical dimensions of matrix in units of whole
-  /// tiles; the offset's row and column are swapped when kTranspose is set
-  CUTLASS_HOST_DEVICE
-  WarpIteratorFromSmem& add_tile_offset(TensorCoord const& tile_offset) {
-    TensorCoord coord_offset(
-        tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn);
-    if (kTranspose) {
-      coord_offset = TensorCoord{coord_offset.column(), coord_offset.row()};
-    }
-    origin_ += coord_offset;
-    ref_.add_coord_offset(coord_offset);
-    return *this;
-  }
-  /// Advances the iterator along the advance dimension by one tile and resets
-  /// the in-tile iteration counter.
-  ///
-  /// Host/device rather than device-only: operator++ is host/device and calls
-  /// this function, and nothing in the body requires device code.
-  CUTLASS_HOST_DEVICE
-  void advance() {
-    if (kOperand == Operand::kA) {
-      add_tile_offset({0, 1});
-    } else {
-      add_tile_offset({1, 0});
-    }
-    iterations_ = 0;
-  }
-  /// increase iterations in a tile; moves on to the next tile once
-  /// kIterations steps have been taken
-  CUTLASS_HOST_DEVICE
-  WarpIteratorFromSmem& operator++() {
-    iterations_++;
-    if (iterations_ >= kIterations)
-      advance();
-    return *this;
-  }
-  /// Loads a fragment from memory at the location pointed to by the iterator.
-  /// The fragment is filled through `ldsm` calls, each writing one AccessType.
-  CUTLASS_DEVICE
-  void load(Fragment& frag) const {
-    AccessType* access_ptr = reinterpret_cast<AccessType*>(&frag);
-    // A transposed read is expressed by asking `ldsm` for the other layout.
-    using LoadLayout = typename platform::
-        conditional<kTranspose, layout::ColumnMajor, layout::RowMajor>::type;
-    CUTLASS_PRAGMA_UNROLL
-    for (int access_m_idx = 0; access_m_idx <
-         (InstructionCount::kRow * kTilesPerInstruction * kAccessesInner) / 4;
-         ++access_m_idx) {
-      MatrixCoord offset;
-      if (kOperand == Operand::kA) {
-        offset = MatrixCoord(
-            access_m_idx * 16, iterations_ * InstructionShape::kColumn);
-      } else {
-        offset = MatrixCoord(iterations_ * InstructionShape::kRow, 0);
-      }
-      if (kTranspose) {
-        offset = MatrixCoord(offset.column(), offset.row());
-      }
-      cutlass::arch::ldsm<LoadLayout, 4>(
-          access_ptr[access_m_idx], ref_.data() + ref_.offset(offset));
-    }
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/kernel_backward.h DELETED Viewed

The diff for this file is too large to render. See raw diff

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/kernel_forward.h DELETED Viewed

@@ -1,1322 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#ifdef HAS_PYTORCH
-#include <ATen/cuda/CUDAGeneratorImpl.h>
-#include <ATen/cuda/CUDAGraphsUtils.cuh>
-#endif
-#include <curand_kernel.h>
-#include <cmath>
-#include <cinttypes>
-#include <vector>
-#include "cutlass/fast_math.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/matrix.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/kernel/default_gemm.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
-#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "debug_utils.h"
-#include "epilogue/epilogue_pipelined.h"
-#include "epilogue/epilogue_rescale_output.h"
-#include "gemm/custom_mma.h"
-#include "gemm/find_default_mma.h"
-#include "gemm/mma_from_smem.h"
-#include "gemm_kernel_utils.h"
-#include "transform/tile_smem_loader.h"
-using namespace gemm_kernel_utils;
-namespace {
-/// Number of warps per SM assumed for the forward kernel: 12 when the
-/// architecture's minimum compute capability is below 80 or `scalar_t` is
-/// `float`, otherwise 16.
-template <typename scalar_t, typename Arch>
-constexpr int getWarpsPerSmFw() {
-  return (Arch::kMinComputeCapability < 80 ||
-          cutlass::platform::is_same<scalar_t, float>::value)
-      ? 12
-      : 16;
-}
-/// Atomic maximum on a float, built from the integer atomics. Non-negative
-/// values go through a signed-int atomicMax on the float's bit pattern,
-/// negative values through an unsigned-int atomicMin. Returns the result of
-/// the atomic reinterpreted as a float.
-static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) {
-  // source: https://stackoverflow.com/a/51549250
-  if (value >= 0) {
-    int const previous_bits =
-        atomicMax(reinterpret_cast<int*>(addr), __float_as_int(value));
-    return __int_as_float(previous_bits);
-  }
-  unsigned int const previous_bits = atomicMin(
-      reinterpret_cast<unsigned int*>(addr), __float_as_uint(value));
-  return __uint_as_float(previous_bits);
-}
-} // namespace
-// Default batch hook: a no-op whose advance_to_batch always returns true.
-// If a different ToBatchHookType_ is supplied (which is never the case in
-// the xformers library), the user defines the logic each block uses to find
-// its data to work on, via an advance_to_batch function with the signature
-// below. It should return false if there is no work to do for this block.
-// In general this will not work with saving for backward due to fixed layout
-// for logsumexp and incompatible rngs for dropout, so is likely only useful for
-// custom inference.
-struct DefaultToBatchHook {
-  template <typename Params>
-  CUTLASS_DEVICE static bool advance_to_batch(
-      Params&, // unnamed: the default hook does not touch the kernel params
-      int64_t& /* q_start */,
-      int64_t& /* k_start */) {
-    return true; // always reports that there is work to do
-  }
-};
-template <
-    // The datatype of Q/K/V
-    typename scalar_t_,
-    // Architecture we are targeting (eg `cutlass::arch::Sm80`)
-    typename ArchTag,
-    // If Q/K/V are correctly aligned in memory and we can run a fast kernel
-    bool isAligned_,
-    int kQueriesPerBlock_,
-    int kKeysPerBlock_,
-    // upperbound on `max(value.shape[-1], query.shape[-1])`
-    int kMaxK_ = (int)cutlass::platform::numeric_limits<uint32_t>::max(),
-    // This is quite slower on V100 for some reason
-    // Set to false if you know at compile-time you will never need dropout
-    bool kSupportsDropout_ = true,
-    bool kSupportsBias_ = true,
-    typename ToBatchHookType_ = DefaultToBatchHook>
-struct AttentionKernel {
-  enum CustomMaskType {
-    NoCustomMask = 0,
-    CausalFromTopLeft = 1,
-    CausalFromBottomRight = 2,
-    NumCustomMaskTypes,
-  };
-  using scalar_t = scalar_t_;
-  using accum_t = float;
-  using lse_scalar_t = float;
-  using output_t = scalar_t;
-  // Accumulator between 2 iterations
-  // Using `accum_t` improves perf on f16 at the cost of
-  // numerical errors
-  using output_accum_t = accum_t;
-  static constexpr bool kSupportsDropout = kSupportsDropout_;
-  static constexpr bool kSupportsBias = kSupportsBias_;
-  static constexpr int kKeysPerBlock = kKeysPerBlock_;
-  static constexpr int kQueriesPerBlock = kQueriesPerBlock_;
-  static constexpr int kMaxK = kMaxK_;
-  static constexpr bool kIsAligned = isAligned_;
-  static constexpr bool kSingleValueIteration = kMaxK <= kKeysPerBlock;
-  static constexpr int32_t kAlignLSE = 32; // block size of backward
-  static constexpr bool kIsHalf = cutlass::sizeof_bits<scalar_t>::value == 16;
-  static constexpr bool kPreloadV =
-      ArchTag::kMinComputeCapability >= 80 && kIsHalf;
-  static constexpr bool kKeepOutputInRF = kSingleValueIteration;
-  static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF &&
-      !cutlass::platform::is_same<output_accum_t, output_t>::value;
-  static_assert(kQueriesPerBlock % 32 == 0, "");
-  static_assert(kKeysPerBlock % 32 == 0, "");
-  static constexpr int kNumWarpsPerBlock =
-      kQueriesPerBlock * kKeysPerBlock / (32 * 32);
-  static constexpr int kWarpSize = 32;
-  // Launch bounds
-  static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock;
-  static constexpr int kMinBlocksPerSm =
-      getWarpsPerSmFw<scalar_t, ArchTag>() / kNumWarpsPerBlock;
-  struct Params { // Kernel arguments; advance_to_block() rebases them for the calling thread block
-    // Input tensors
-    scalar_t* query_ptr = nullptr; // [num_queries, num_heads, head_dim]
-    scalar_t* key_ptr = nullptr; // [num_keys, num_heads, head_dim]
-    scalar_t* value_ptr = nullptr; // [num_keys, num_heads, head_dim_value]
-    scalar_t* attn_bias_ptr = nullptr; // [num_heads, num_queries, num_keys]
-    int32_t* seqstart_q_ptr = nullptr; // optional; entries [b] and [b + 1] bound the query rows of batch b
-    int32_t* seqstart_k_ptr = nullptr; // optional; entries [b] and [b + 1] bound the key rows of batch b
-    int32_t* seqlen_k_ptr = nullptr; // optional; when set, entry [b] is the key count used instead of seqstart_k_ptr[b + 1]
-    uint32_t causal_diagonal_offset = 0; // overwritten with num_keys - num_queries for CausalFromBottomRight
-    // Output tensors
-    output_t* output_ptr = nullptr; // [num_queries, num_heads, head_dim_value]
-    // [num_queries, num_heads, head_dim_value]
-    output_accum_t* output_accum_ptr = nullptr;
-    // [num_heads, num_queries] - can be null
-    lse_scalar_t* logsumexp_ptr = nullptr;
-    // Scale
-    accum_t scale = 0.0;
-    // Dimensions/strides
-    int32_t head_dim = 0;
-    int32_t head_dim_value = 0;
-    int32_t num_queries = 0; // rewritten by advance_to_block: sequence length, then rows remaining from query_start
-    int32_t num_keys = 0; // rewritten by advance_to_block: sequence length, then clamped for causal masks
-    int32_t num_keys_absolute = 0; // set by advance_to_block to num_keys before the causal clamp
-    uint8_t custom_mask_type = NoCustomMask;
-    int32_t q_strideM = 0;
-    int32_t k_strideM = 0;
-    int32_t v_strideM = 0;
-    int32_t bias_strideM = 0;
-    int32_t o_strideM = 0;
-    // Everything below is only used in `advance_to_block`
-    // and shouldn't use registers
-    int32_t q_strideH = 0;
-    int32_t k_strideH = 0;
-    int32_t v_strideH = 0;
-    int64_t bias_strideH = 0;
-    int64_t q_strideB = 0;
-    int64_t k_strideB = 0;
-    int64_t v_strideB = 0;
-    int64_t bias_strideB = 0;
-    int32_t num_batches = 0;
-    int32_t num_heads = 0;
-    // dropout
-    bool use_dropout = false;
-    unsigned long long dropout_batch_head_rng_offset = 0; // set by advance_to_block when kSupportsDropout
-    float dropout_prob = 0.0f;
-#ifdef HAS_PYTORCH
-    at::PhiloxCudaState rng_engine_inputs = at::PhiloxCudaState(0, 0);
-#endif
-    // Moves pointers to what we should process
-    // Returns "false" if there is no work to do
-    CUTLASS_DEVICE bool advance_to_block() {
-      auto batch_id = blockIdx.z; // grid layout mirrors getBlocksGrid(): x = query block, y = head, z = batch
-      auto head_id = blockIdx.y;
-      auto query_start = blockIdx.x * kQueriesPerBlock; // first query row handled by this block
-      auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE; // num_queries rounded up to a multiple of kAlignLSE
-      if (kSupportsDropout) { // linear offset of this (batch, head) slice in a [batch, head, query, key] index space
-        dropout_batch_head_rng_offset =
-            batch_id * num_heads * num_queries * num_keys + // NOTE(review): operands look 32-bit, so the product may wrap before widening - confirm
-            head_id * num_queries * num_keys;
-      }
-      int64_t q_start = 0, k_start = 0; // row offsets of this batch inside the query / key tensors
-      // Advance to current batch - in case of different sequence lengths
-      constexpr bool kToBatchHook =
-          !cutlass::platform::is_same<ToBatchHookType_, DefaultToBatchHook>::
-              value; // true when a non-default ToBatchHookType_ was supplied
-      if (kToBatchHook) {
-        // Call out to a custom implementation.
-        if (!ToBatchHookType_::advance_to_batch(*this, q_start, k_start)) {
-          return false;
-        }
-      } else if (seqstart_q_ptr != nullptr) { // batches of different lengths described by the seqstart arrays
-        assert(seqstart_k_ptr != nullptr);
-        seqstart_q_ptr += batch_id;
-        q_start = seqstart_q_ptr[0];
-        int64_t q_next_start = seqstart_q_ptr[1];
-        int64_t k_end;
-        seqstart_k_ptr += batch_id;
-        if (seqlen_k_ptr) {
-          k_start = seqstart_k_ptr[0];
-          k_end = k_start + seqlen_k_ptr[batch_id]; // seqlen_k_ptr is not advanced, hence the explicit batch index
-        } else {
-          k_start = seqstart_k_ptr[0];
-          k_end = seqstart_k_ptr[1];
-        }
-        num_queries = q_next_start - q_start; // lengths of this batch's sequences
-        num_keys = k_end - k_start;
-        if (query_start >= num_queries) { // this block starts past the last query of the sequence
-          return false;
-        }
-      } else { // fixed-size batches: step the base pointers by whole batches
-        query_ptr += batch_id * q_strideB;
-        key_ptr += batch_id * k_strideB;
-        value_ptr += batch_id * v_strideB;
-        output_ptr += int64_t(batch_id * num_queries) * o_strideM; // output has no batch stride; it is stepped in rows of o_strideM
-        if (output_accum_ptr != nullptr) {
-          output_accum_ptr +=
-              int64_t(batch_id * num_queries) * (head_dim_value * num_heads); // accumulator rows are head_dim_value * num_heads wide
-        }
-        q_start = 0;
-        k_start = 0;
-      }
-      // Advance to the current batch / head / query_start
-      query_ptr += (q_start + query_start) * q_strideM + head_id * q_strideH;
-      key_ptr += k_start * k_strideM + head_id * k_strideH; // keys/values start at the sequence start, not at query_start
-      value_ptr += k_start * v_strideM + head_id * v_strideH;
-      output_ptr +=
-          int64_t(q_start + query_start) * o_strideM + head_id * head_dim_value;
-      if (kSupportsBias && attn_bias_ptr != nullptr) {
-        attn_bias_ptr += (batch_id * bias_strideB) + (head_id * bias_strideH); // the query_start offset is applied later by the kernel
-      }
-      if (output_accum_ptr != nullptr) {
-        output_accum_ptr +=
-            int64_t(q_start + query_start) * (head_dim_value * num_heads) +
-            head_id * head_dim_value;
-      } else {
-        // Accumulate directly in the destination buffer (eg for f32)
-        output_accum_ptr = (accum_t*)output_ptr;
-      }
-      if (logsumexp_ptr != nullptr) {
-        // lse[batch_id, head_id, query_start]
-        logsumexp_ptr +=
-            batch_id * lse_dim * num_heads + head_id * lse_dim + query_start; // rows are padded to lse_dim entries
-      }
-      // Custom masking
-      if (custom_mask_type == CausalFromBottomRight) {
-        causal_diagonal_offset = num_keys - num_queries;
-      }
-      // We use num_keys_absolute to index into the rng_state
-      // We need this index to match between forward and backwards
-      num_keys_absolute = num_keys;
-      if (custom_mask_type == CausalFromTopLeft ||
-          custom_mask_type == CausalFromBottomRight) {
-        // the bottom row of the current block is query_start + kQueriesPerBlock
-        // the last active key is then query_start + causal_diagonal_offset +
-        // kQueriesPerBlock so num_keys is the min between actual num_keys and
-        // this to avoid extra computations
-        num_keys = cutlass::fast_min(
-            int32_t(query_start + causal_diagonal_offset + kQueriesPerBlock),
-            num_keys);
-      }
-      num_queries -= query_start; // from here on: query rows remaining from this block's first row
-      num_batches = 0; // no longer used after
-      // If num_queries == 1, and there is only one key head we're wasting
-      // 15/16th of tensor core compute In that case :
-      //  - we only launch kernels for head_id % kQueriesPerBlock == 0
-      //  - we iterate over heads instead of queries (strideM = strideH)
-      if (num_queries == 1 && k_strideH == 0 && v_strideH == 0) {
-        if (head_id % kQueriesPerBlock != 0)
-          return false;
-        q_strideM = q_strideH; // consecutive "rows" of the tile are now consecutive heads
-        num_queries = num_heads; // NOTE(review): pointers were already advanced by head_id, yet the row bound is the full num_heads - confirm for head_id > 0
-        num_heads = 1; // unused but here for intent
-        // remove causal since n_query = 1
-        // otherwise, offset would change with head !
-        custom_mask_type = NoCustomMask;
-        o_strideM = head_dim_value; // output rows for consecutive heads are head_dim_value apart
-      }
-      // Make sure the compiler knows these variables are the same on all
-      // the threads of the warp.
-      // Only worth doing if they could have been modified above.
-      query_ptr = warp_uniform(query_ptr);
-      key_ptr = warp_uniform(key_ptr);
-      value_ptr = warp_uniform(value_ptr);
-      if (kSupportsBias) {
-        attn_bias_ptr = warp_uniform(attn_bias_ptr);
-      }
-      output_ptr = warp_uniform(output_ptr);
-      output_accum_ptr = warp_uniform(output_accum_ptr);
-      logsumexp_ptr = warp_uniform(logsumexp_ptr);
-      num_queries = warp_uniform(num_queries);
-      num_keys = warp_uniform(num_keys);
-      num_heads = warp_uniform(num_heads);
-      o_strideM = warp_uniform(o_strideM);
-      custom_mask_type = warp_uniform(custom_mask_type);
-      return true;
-    }
-    __host__ dim3 getBlocksGrid() const { // grid: x = query blocks, y = heads, z = batches
-      return dim3(
-          ceil_div(num_queries, (int32_t)kQueriesPerBlock),
-          num_heads,
-          num_batches);
-    }
-    __host__ dim3 getThreadsGrid() const { // block: kWarpSize lanes in x, kNumWarpsPerBlock warps in y
-      return dim3(kWarpSize, kNumWarpsPerBlock, 1);
-    }
-  };
-  struct MM0 { // Compile-time type configuration for the first GEMM (Q @ K.T); holds no data members
-    /*
-      In this first matmul, we compute a block of `Q @ K.T`.
-      While the calculation result is still hot in registers, we update
-      `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value
-      into a shared-memory ("AccumulatorSharedStorage") that is used later as
-      operand A for the second matmul (see MM1)
-    */
-    using GemmType = DefaultGemmType<ArchTag, scalar_t>;
-    using OpClass = typename GemmType::OpClass;
-    using DefaultConfig =
-        typename cutlass::gemm::device::DefaultGemmConfiguration<
-            OpClass,
-            ArchTag,
-            scalar_t,
-            scalar_t,
-            scalar_t, // ElementC
-            accum_t // ElementAccumulator
-            >;
-    static constexpr int kAlignmentA =
-        kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment; // falls back to the minimum alignment when !kIsAligned
-    static constexpr int kAlignmentB =
-        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; // same rule for operand B
-    using ThreadblockShape = cutlass::gemm::
-        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>; // one block covers a kQueriesPerBlock x kKeysPerBlock tile
-    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; // 32x32 per warp, consistent with kNumWarpsPerBlock
-    using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma<
-        scalar_t, // ElementA,
-        cutlass::layout::RowMajor, // LayoutA,
-        kAlignmentA,
-        scalar_t, // ElementB,
-        cutlass::layout::ColumnMajor, // LayoutB,
-        kAlignmentB,
-        accum_t,
-        cutlass::layout::RowMajor, // LayoutC,
-        OpClass,
-        ArchTag, // ArchTag
-        ThreadblockShape, // ThreadblockShape
-        WarpShape, // WarpShape
-        typename GemmType::InstructionShape, // InstructionShape
-        ArchTag::kMinComputeCapability >= 80 && kIsHalf // stage count: 4 in this case, otherwise the configuration default
-            ? 4
-            : DefaultConfig::kStages,
-        typename GemmType::Operator // Operator
-        >::DefaultMma;
-    using MmaCore = typename DefaultMma::MmaCore;
-    using IteratorA = typename DefaultMma::IteratorA;
-    using IteratorB = typename DefaultMma::IteratorB;
-    using DefaultThreadblockMma = typename DefaultMma::ThreadblockMma;
-    using Mma = typename cutlass::platform::conditional<
-        kSingleValueIteration, // when true, use the MakeCustomMma variant parameterized by kMaxK
-        typename MakeCustomMma<DefaultThreadblockMma, kMaxK>::Mma,
-        DefaultThreadblockMma>::type;
-    using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator<
-        typename Mma::Operator::IteratorC,
-        accum_t,
-        kWarpSize>::Iterator;
-    static_assert(
-        MmaCore::WarpCount::kM * MmaCore::WarpCount::kN *
-                MmaCore::WarpCount::kK ==
-            kNumWarpsPerBlock,
-        ""); // the warp arrangement chosen by CUTLASS must use exactly the warps the block is launched with
-    // used for efficient load of bias tile Bij from global to shared memory
-    using BiasLoader = TileSmemLoader<
-        scalar_t,
-        cutlass::MatrixShape<kQueriesPerBlock, kKeysPerBlock>,
-        MmaCore::kThreads,
-        // input restriction: kv_len has to be a multiple of this value
-        128 / cutlass::sizeof_bits<scalar_t>::value>; // number of scalar_t elements in 128 bits
-    // Epilogue to store to shared-memory in a format that we can use later for
-    // the second matmul
-    using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm<
-        typename Mma::Operator::IteratorC,
-        typename Mma::Operator,
-        scalar_t,
-        WarpShape,
-        ThreadblockShape>;
-    using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; // shared-memory tile handed to MM1 as operand A
-  };
-  struct MM1 { // Compile-time type configuration for the second GEMM (attn @ V); holds no data members
-    /**
-      Second matmul: perform `attn @ V` where `attn` is the attention (not
-      normalized) and stored in shared memory
-    */
-    using GemmType = DefaultGemmType<ArchTag, scalar_t>;
-    using OpClass = typename GemmType::OpClass;
-    using DefaultConfig =
-        typename cutlass::gemm::device::DefaultGemmConfiguration<
-            OpClass,
-            ArchTag,
-            scalar_t,
-            scalar_t,
-            output_accum_t, // ElementC
-            accum_t // ElementAccumulator
-            >;
-    static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem
-    static constexpr int kAlignmentB =
-        kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; // operand B (V) falls back to the minimum alignment when !kIsAligned
-    using ThreadblockShape = cutlass::gemm::
-        GemmShape<kQueriesPerBlock, kKeysPerBlock, GemmType::ThreadK>; // same tile shape as MM0
-    using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>;
-    using InstructionShape = typename GemmType::InstructionShape;
-    using LayoutB = cutlass::layout::RowMajor; // layout used for the V operand
-    using DefaultGemm = cutlass::gemm::kernel::DefaultGemm<
-        scalar_t, // ElementA,
-        cutlass::layout::RowMajor, // LayoutA,
-        kAlignmentA,
-        scalar_t, // ElementB,
-        LayoutB, // LayoutB,
-        kAlignmentB,
-        output_accum_t,
-        cutlass::layout::RowMajor, // LayoutC,
-        accum_t,
-        OpClass,
-        ArchTag,
-        ThreadblockShape,
-        WarpShape,
-        typename GemmType::InstructionShape,
-        typename DefaultConfig::EpilogueOutputOp,
-        void, // ThreadblockSwizzle - not used
-        ArchTag::kMinComputeCapability >= 80 && kIsHalf // stage count: 4 in this case, otherwise the configuration default
-            ? 4
-            : DefaultConfig::kStages,
-        false, // SplitKSerial
-        typename GemmType::Operator>;
-    using WarpIteratorA = typename cutlass::gemm::threadblock::
-        DefaultWarpIteratorAFromSharedMemory<
-            typename DefaultGemm::Mma::Policy::Operator::Shape, // WarpShape
-            typename DefaultGemm::Mma::Policy::Operator::InstructionShape,
-            typename DefaultGemm::Mma::Policy::Operator::IteratorA,
-            typename DefaultGemm::Mma::Policy>::WarpIterator;
-    using DefaultMmaFromSmem =
-        typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory<
-            typename DefaultGemm::Mma,
-            MM0::AccumulatorSharedStorage::Shape::kN, // kMaxK
-            WarpIteratorA,
-            false>; // kScaleOperandA
-    using Mma = typename DefaultMmaFromSmem::Mma; // the Mma actually used: operand A comes from shared memory
-    using IteratorB = typename Mma::IteratorB;
-    using WarpCount = typename Mma::WarpCount;
-    static_assert(
-        WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock,
-        ""); // must use exactly the warps the block is launched with, like MM0
-    using DefaultEpilogue = typename DefaultGemm::Epilogue;
-    using OutputTileIterator =
-        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
-            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
-            output_t>; // default epilogue thread map, element type output_t
-    using OutputTileIteratorAccum =
-        typename cutlass::epilogue::threadblock::PredicatedTileIterator<
-            typename DefaultEpilogue::OutputTileIterator::ThreadMap,
-            output_accum_t>; // same thread map, element type output_accum_t
-  };
-  static constexpr int64_t kAlignmentQ = MM0::kAlignmentA; // Q is operand A of MM0
-  static constexpr int64_t kAlignmentK = MM0::kAlignmentB; // K is operand B of MM0
-  static constexpr int64_t kAlignmentV = 1; // with 1, the V alignment checks in check_supported always pass
-  // Shared storage - depends on kernel params
-  struct ScalingCoefs { // Softmax bookkeeping arrays; base of both SharedStorage variants below
-    cutlass::Array<accum_t, kQueriesPerBlock> m_prime; // one entry per query row of the tile; the kernel initializes it to -infinity
-    cutlass::Array<accum_t, kQueriesPerBlock> s_prime; // one entry per query row; the kernel initializes it to 0
-    cutlass::Array<accum_t, kQueriesPerBlock> mi; // one entry per query row; the kernel initializes it to -infinity
-    cutlass::Array<accum_t, kQueriesPerBlock> out_rescale; // one entry per query row; the kernel initializes it to 1
-    cutlass::Array<accum_t, kQueriesPerBlock * MM0::MmaCore::WarpCount::kN>
-        addition_storage; // scratch passed to iterative_softmax; sized per query row and per warp column of MM0
-  };
-  struct SharedStorageEpilogueAtEnd : ScalingCoefs { // Layout in which the epilogue buffer overlaps both GEMM buffers
-    struct SharedStorageAfterMM0 {
-      // Everything here might be overwritten during MM0
-      union {
-        typename MM0::BiasLoader::SmemTile bias; // bias tile and score tile share the same bytes
-        typename MM0::AccumulatorSharedStorage si;
-      };
-      typename MM1::Mma::SharedStorage mm1;
-    };
-    union { // the three members below alias one another
-      typename MM0::Mma::SharedStorage mm0;
-      SharedStorageAfterMM0 after_mm0;
-      typename MM1::DefaultEpilogue::SharedStorage epilogue;
-    };
-    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
-    epilogue_shared_storage() { // same accessor name as SharedStorageEpilogueInLoop, so callers are layout-agnostic
-      return epilogue;
-    }
-  };
-  struct SharedStorageEpilogueInLoop : ScalingCoefs { // Layout in which the epilogue buffer sits beside si and mm1 rather than on top of them
-    struct SharedStorageAfterMM0 {
-      // Everything here might be overwritten during MM0
-      union {
-        typename MM0::BiasLoader::SmemTile bias; // bias tile and score tile share the same bytes
-        typename MM0::AccumulatorSharedStorage si;
-      };
-      typename MM1::Mma::SharedStorage mm1;
-      typename MM1::DefaultEpilogue::SharedStorage epilogue; // distinct from si and mm1; still aliases mm0 through the union below
-    };
-    union { // mm0 aliases everything in after_mm0
-      typename MM0::Mma::SharedStorage mm0;
-      SharedStorageAfterMM0 after_mm0;
-    };
-    CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage&
-    epilogue_shared_storage() { // same accessor name as SharedStorageEpilogueAtEnd, so callers are layout-agnostic
-      return after_mm0.epilogue;
-    }
-  };
-  using SharedStorage = typename cutlass::platform::conditional<
-      kSingleValueIteration || kKeepOutputInRF, // either flag selects the layout whose epilogue buffer overlaps the GEMM buffers
-      SharedStorageEpilogueAtEnd,
-      SharedStorageEpilogueInLoop>::type; // otherwise the epilogue buffer is kept alongside si and mm1
-  static bool __host__ check_supported(Params const& p) { // Host-side validation of pointer/stride alignment and mask type; returns true once every check has passed
-    CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ); // NOTE(review): what a failed check does depends on the CHECK_ALIGNED_PTR / XFORMERS_CHECK macros, defined elsewhere
-    CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK);
-    CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV);
-    if (kSupportsBias) { // bias pointer and strides are validated against the query alignment
-      CHECK_ALIGNED_PTR(p.attn_bias_ptr, kAlignmentQ);
-      XFORMERS_CHECK(
-          p.num_batches <= 1 || p.bias_strideB % kAlignmentQ == 0, // batch stride only matters with more than one batch
-          "attn_bias is not correctly aligned (strideB)");
-      XFORMERS_CHECK(
-          p.num_heads <= 1 || p.bias_strideH % kAlignmentQ == 0, // head stride only matters with more than one head
-          "attn_bias is not correctly aligned (strideH)");
-      XFORMERS_CHECK(
-          p.bias_strideM % kAlignmentQ == 0,
-          "attn_bias is not correctly aligned");
-    }
-    XFORMERS_CHECK(
-        p.q_strideM % kAlignmentQ == 0, // row strides are always checked
-        "query is not correctly aligned (strideM)");
-    XFORMERS_CHECK(
-        p.k_strideM % kAlignmentK == 0,
-        "key is not correctly aligned (strideM)");
-    XFORMERS_CHECK(
-        p.v_strideM % kAlignmentV == 0,
-        "value is not correctly aligned (strideM)");
-    XFORMERS_CHECK(
-        p.num_heads <= 1 || p.q_strideH % kAlignmentQ == 0, // head strides are skipped when there is at most one head
-        "query is not correctly aligned (strideH)");
-    XFORMERS_CHECK(
-        p.num_heads <= 1 || p.k_strideH % kAlignmentK == 0,
-        "key is not correctly aligned (strideH)");
-    XFORMERS_CHECK(
-        p.num_heads <= 1 || p.v_strideH % kAlignmentV == 0,
-        "value is not correctly aligned (strideH)");
-    XFORMERS_CHECK(
-        p.custom_mask_type < NumCustomMaskTypes, // mask selector must be one of the known mask kinds
-        "invalid value for `custom_mask_type`");
-    return true;
-  }
-  static void CUTLASS_DEVICE attention_kernel(Params& p) {
-    // In this block, we will only ever:
-    // - read query[query_start:query_end, :]
-    // - write to output[query_start:query_end, :]
-    extern __shared__ char smem_buffer[];
-    SharedStorage& shared_storage = *((SharedStorage*)smem_buffer);
-    auto& m_prime = shared_storage.m_prime;
-    auto& s_prime = shared_storage.s_prime;
-    auto& mi = shared_storage.mi;
-    auto& out_rescale = shared_storage.out_rescale;
-    const uint32_t query_start = blockIdx.x * kQueriesPerBlock;
-    static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, "");
-    if (thread_id() < kQueriesPerBlock) {
-      s_prime[thread_id()] = accum_t(0);
-      out_rescale[thread_id()] = accum_t(1.0);
-      m_prime[thread_id()] =
-          -cutlass::platform::numeric_limits<accum_t>::infinity();
-      mi[thread_id()] = -cutlass::platform::numeric_limits<accum_t>::infinity();
-    }
-    typename MM1::Mma::FragmentC accum_o;
-    accum_o.clear();
-    auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator {
-      using OutputTileIterator = typename MM1::OutputTileIterator;
-      return OutputTileIterator(
-          typename OutputTileIterator::Params{(int32_t)p.o_strideM},
-          p.output_ptr,
-          typename OutputTileIterator::TensorCoord{
-              p.num_queries, p.head_dim_value},
-          thread_id(),
-          {0, col});
-    };
-    auto createOutputAccumIter = [&](int col) ->
-        typename MM1::OutputTileIteratorAccum {
-          using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum;
-          return OutputTileIteratorAccum(
-              typename OutputTileIteratorAccum::Params{
-                  (int32_t)(p.head_dim_value * p.num_heads)},
-              p.output_accum_ptr,
-              typename OutputTileIteratorAccum::TensorCoord{
-                  p.num_queries, p.head_dim_value},
-              thread_id(),
-              {0, col});
-        };
-#ifdef HAS_PYTORCH
-    curandStatePhilox4_32_10_t curand_state_init;
-    if (kSupportsDropout && p.use_dropout) {
-      const auto seeds = at::cuda::philox::unpack(p.rng_engine_inputs);
-      // each element of the attention matrix P with shape
-      // (batch_sz, n_heads, n_queries, n_keys) is associated with a single
-      // offset in RNG sequence. we initialize the RNG state with offset that
-      // starts at the beginning of a (n_queries, n_keys) matrix for this
-      // block's batch_id and head_id
-      // initializing rng state is very expensive, so we run once per kernel,
-      // rather than once per iteration. each iteration takes a copy of the
-      // initialized RNG state and offsets it as needed.
-      curand_init(
-          std::get<0>(seeds),
-          0,
-          std::get<1>(seeds) + p.dropout_batch_head_rng_offset,
-          &curand_state_init);
-    }
-#endif
-    // Iterate through keys
-    for (int32_t iter_key_start = 0; iter_key_start < p.num_keys;
-         iter_key_start += kKeysPerBlock) {
-      int32_t problem_size_0_m =
-          cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries);
-      int32_t problem_size_0_n = cutlass::fast_min(
-          int32_t(kKeysPerBlock), p.num_keys - iter_key_start);
-      int32_t const& problem_size_0_k = p.head_dim;
-      int32_t const& problem_size_1_n = p.head_dim_value;
-      int32_t const& problem_size_1_k = problem_size_0_n;
-      auto prologueV = [&](int blockN) {
-        typename MM1::Mma::IteratorB iterator_V(
-            typename MM1::IteratorB::Params{typename MM1::LayoutB(p.v_strideM)},
-            p.value_ptr + iter_key_start * p.v_strideM,
-            {problem_size_1_k, problem_size_1_n},
-            thread_id(),
-            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
-        MM1::Mma::prologue(
-            shared_storage.after_mm0.mm1,
-            iterator_V,
-            thread_id(),
-            problem_size_1_k);
-      };
-      __syncthreads(); // Need to have shared memory initialized, and `m_prime`
-                       // updated from end of prev iter
-      //
-      // MATMUL: Q.K_t
-      //
-      // Computes the block-matrix product of:
-      // (a) query[query_start:query_end, :]
-      // with
-      // (b) key[iter_key_start:iter_key_start + kKeysPerBlock]
-      // and stores that into `shared_storage.si`
-      //
-      // Compute threadblock location
-      cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0};
-      cutlass::MatrixCoord tb_offset_A{
-          tb_tile_offset.m() * MM0::Mma::Shape::kM, tb_tile_offset.k()};
-      cutlass::MatrixCoord tb_offset_B{
-          tb_tile_offset.k(), tb_tile_offset.n() * MM0::Mma::Shape::kN};
-      // Construct iterators to A and B operands
-      typename MM0::IteratorA iterator_A(
-          typename MM0::IteratorA::Params(
-              typename MM0::MmaCore::LayoutA(p.q_strideM)),
-          p.query_ptr,
-          {problem_size_0_m, problem_size_0_k},
-          thread_id(),
-          tb_offset_A);
-      typename MM0::IteratorB iterator_B(
-          typename MM0::IteratorB::Params(
-              typename MM0::MmaCore::LayoutB(p.k_strideM)),
-          p.key_ptr + iter_key_start * p.k_strideM,
-          {problem_size_0_k, problem_size_0_n},
-          thread_id(),
-          tb_offset_B);
-      auto my_warp_id = warp_uniform(warp_id());
-      auto my_lane_id = lane_id();
-      // Construct thread-scoped matrix multiply
-      typename MM0::Mma mma(
-          shared_storage.mm0, thread_id(), my_warp_id, my_lane_id);
-      typename MM0::Mma::FragmentC accum;
-      accum.clear();
-      auto gemm_k_iterations =
-          (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK;
-      // Compute threadblock-scoped matrix multiply-add
-      mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
-      __syncthreads();
-      if (kPreloadV) {
-        prologueV(0);
-      } else {
-        MM1::Mma::drain_cp_asyncs();
-      }
-      typename MM0::Mma::Operator::IteratorC::TensorCoord
-          iteratorC_tile_offset = {
-              (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) +
-                  (my_warp_id % MM0::Mma::WarpCount::kM),
-              (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) +
-                  (my_warp_id / MM0::Mma::WarpCount::kM)};
-      // multiply by scaling factor
-      if (kSupportsBias) {
-        accum =
-            cutlass::multiplies<typename MM0::Mma::FragmentC>()(p.scale, accum);
-      }
-      // apply attention bias if applicable
-      if (kSupportsBias && p.attn_bias_ptr != nullptr) {
-        // load bias tile Bij into shared memory
-        typename MM0::BiasLoader::GmemTileIterator bias_iter(
-            {cutlass::layout::RowMajor(p.bias_strideM)},
-            // attn_bias_pointer points to matrix of size (n_queries, n_keys)
-            // for the relevant batch_id and head_id
-            p.attn_bias_ptr + query_start * p.bias_strideM + iter_key_start,
-            {problem_size_0_m, problem_size_0_n},
-            thread_id());
-        cutlass::TensorRef<scalar_t, cutlass::layout::RowMajor> bias_tensor_ref(
-            shared_storage.after_mm0.bias.data(),
-            cutlass::layout::RowMajor(MM0::ThreadblockShape::kN));
-        typename MM0::BiasLoader::SmemTileIterator smem_tile_iter(
-            bias_tensor_ref, thread_id());
-        MM0::BiasLoader::load(bias_iter, smem_tile_iter);
-        // Pij += Bij, Pij is in register fragment and Bij is in shared memory
-        auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset(
-            my_lane_id, my_warp_id, iteratorC_tile_offset);
-        MM0::AccumLambdaIterator::iterateRows(
-            lane_offset,
-            [&](int accum_m) {},
-            [&](int accum_m, int accum_n, int idx) {
-              if (accum_m < problem_size_0_m && accum_n < problem_size_0_n) {
-                accum[idx] += bias_tensor_ref.at({accum_m, accum_n});
-              }
-            },
-            [&](int accum_m) {});
-      }
-      // Mask out last if causal
-      // This is only needed if upper-right corner of current query / key block
-      // intersects the mask Coordinates of upper-right corner of current block
-      // is y=query_start x=min(iter_key_start + kKeysPerBlock, num_keys)) The
-      // first masked element is x = y + offset -> query_start + offset There is
-      // intersection (and we need to mask) if min(iter_key_start +
-      // kKeysPerBlock, num_keys)) >= query_start + offset
-      if (p.custom_mask_type &&
-          cutlass::fast_min(iter_key_start + kKeysPerBlock, p.num_keys) >=
-              (query_start + p.causal_diagonal_offset)) {
-        auto query_start = blockIdx.x * kQueriesPerBlock;
-        auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset(
-            my_lane_id, my_warp_id, iteratorC_tile_offset);
-        int32_t last_col;
-        MM0::AccumLambdaIterator::iterateRows(
-            lane_offset,
-            [&](int accum_m) {
-              // last absolute col is (last absolute query + offset)
-              // last local col is (last absolute query + offset -
-              // iter_key_start)
-              last_col = query_start + accum_m + p.causal_diagonal_offset -
-                  iter_key_start;
-            },
-            [&](int accum_m, int accum_n, int idx) {
-              if (accum_n > last_col) {
-                accum[idx] =
-                    -cutlass::platform::numeric_limits<accum_t>::infinity();
-              }
-            },
-            [&](int accum_m) {});
-      }
-      // Update `mi` from accum stored in registers
-      // Also does accum[i] <- exp(accum[i] - mi)
-      iterative_softmax<typename MM0::Mma::Operator::IteratorC>(
-          accum_o,
-          accum,
-          mi,
-          m_prime,
-          s_prime,
-          out_rescale,
-          shared_storage.addition_storage,
-          my_lane_id,
-          thread_id(),
-          my_warp_id,
-          p.num_keys - iter_key_start,
-          iter_key_start == 0,
-          iteratorC_tile_offset,
-          kSupportsBias ? 1.0f : p.scale);
-      // Output results to shared-memory
-      int warp_idx_mn_0 = my_warp_id %
-          (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN);
-      auto output_tile_coords = cutlass::MatrixCoord{
-          warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM,
-          warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM};
-      MM0::B2bGemm::accumToSmem(
-          shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords);
-      __syncthreads();
-#ifdef HAS_PYTORCH
-      // apply dropout (if applicable) after we've written Pij to smem.
-      // dropout is applied by multiplying each element of Pij by:
-      // - 0 with probability dropout_p
-      // - 1 / (1 - dropout_p) with probability 1 - dropout_p
-      //
-      // for backward purposes we want to be able to map each element of the
-      // attention matrix to the same random uniform number as the one we used
-      // in forward, without needing to use the same iteration order or having
-      // to store the dropout matrix. its possible to do this in registers but
-      // it ends up being very slow because each thread having noncontiguous
-      // strips of the Pij tile means we have to skip around a lot, and also
-      // have to generate a single random number at a time
-      if (kSupportsDropout && p.use_dropout) {
-        auto si = shared_storage.after_mm0.si.accum_ref();
-        // each thread handles a contiguous sequence of elements from Sij, all
-        // coming from the same row. the reason they have to come from the same
-        // row is that the sampling random numbers from a contiguous random
-        // number sequence is much more efficient than jumping around, and the
-        // linear offset of each element of S (the global matrix) maps to an
-        // offset in a random number sequence. for S, the end of a row and the
-        // beginning of the next have adjacent offsets, but for Sij, this is not
-        // necessarily the case.
-        const int num_threads = blockDim.x * blockDim.y * blockDim.z;
-        const int threads_per_row =
-            cutlass::fast_min(num_threads / problem_size_0_m, problem_size_0_n);
-        const int elts_per_thread = cutlass::round_nearest(
-            cutlass::ceil_div(problem_size_0_n, threads_per_row), 4);
-        const int thread_i = thread_id() / threads_per_row;
-        const int thread_start_j =
-            (thread_id() % threads_per_row) * elts_per_thread;
-        if (thread_i < problem_size_0_m && thread_start_j < problem_size_0_n) {
-          curandStatePhilox4_32_10_t curand_state = curand_state_init;
-          skipahead(
-              static_cast<unsigned long long>(
-                  (query_start + thread_i) * p.num_keys_absolute +
-                  (iter_key_start + thread_start_j)),
-              &curand_state);
-          const float dropout_scale = 1.0 / (1.0 - p.dropout_prob);
-          // apply dropout scaling to elements this thread is responsible for,
-          // in chunks of 4
-          for (int sij_start_col_idx = thread_start_j; sij_start_col_idx <
-               cutlass::fast_min(thread_start_j + elts_per_thread,
-                                 problem_size_0_n);
-               sij_start_col_idx += 4) {
-            const float4 rand_uniform_quad = curand_uniform4(&curand_state);
-            CUTLASS_PRAGMA_UNROLL
-            for (int quad_idx = 0; quad_idx < 4; ++quad_idx) {
-              si.at({thread_i, sij_start_col_idx + quad_idx}) *=
-                  static_cast<scalar_t>(
-                      dropout_scale *
-                      ((&rand_uniform_quad.x)[quad_idx] > p.dropout_prob));
-            }
-          }
-        }
-        __syncthreads(); // p.use_dropout should have same value kernel-wide
-      }
-#endif
-      //
-      // MATMUL: Attn . V
-      // Run the matmul `attn @ V` for a block of attn and V.
-      // `attn` is read from shared memory (in `shared_storage_si`)
-      // `V` is read from global memory (with iterator_B)
-      //
-      const int64_t nBlockN = kSingleValueIteration
-          ? 1
-          : ceil_div(
-                (int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN));
-      for (int blockN = 0; blockN < nBlockN; ++blockN) {
-        int gemm_k_iterations =
-            (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK;
-        // Compute threadblock-scoped matrix multiply-add and store it in accum
-        // (in registers)
-        if (!kPreloadV) {
-          __syncthreads(); // we share shmem between mma and epilogue
-        }
-        typename MM1::Mma::IteratorB iterator_V(
-            typename MM1::IteratorB::Params{typename MM1::LayoutB(p.v_strideM)},
-            p.value_ptr + iter_key_start * p.v_strideM,
-            {problem_size_1_k, problem_size_1_n},
-            thread_id(),
-            cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN});
-        typename MM1::Mma mma_pv(
-            // operand A: Pij_dropped in shared memory
-            shared_storage.after_mm0.si.accum_ref(),
-            // operand B: shared memory staging area for Vj, which is loaded
-            // from global memory
-            shared_storage.after_mm0.mm1.operand_B_ref(),
-            (int)thread_id(),
-            (int)my_warp_id,
-            (int)my_lane_id);
-        mma_pv.set_prologue_done(kPreloadV);
-        if (!kKeepOutputInRF) {
-          accum_o.clear();
-        }
-        mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o);
-        __syncthreads();
-        if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) {
-          prologueV(blockN + 1);
-        }
-        if (!kKeepOutputInRF) {
-          MM1::Mma::drain_cp_asyncs();
-          DISPATCH_BOOL(
-              iter_key_start == 0, kIsFirst, ([&] {
-                DISPATCH_BOOL(
-                    (iter_key_start + kKeysPerBlock) >= p.num_keys,
-                    kIsLast,
-                    ([&] {
-                      using DefaultEpilogue = typename MM1::DefaultEpilogue;
-                      using DefaultOp =
-                          typename MM1::DefaultConfig::EpilogueOutputOp;
-                      using ElementCompute = typename DefaultOp::ElementCompute;
-                      using EpilogueOutputOp = typename cutlass::epilogue::
-                          thread::MemoryEfficientAttentionNormalize<
-                              typename cutlass::platform::conditional<
-                                  kIsLast::value,
-                                  output_t,
-                                  output_accum_t>::type,
-                              output_accum_t,
-                              DefaultOp::kCount,
-                              typename DefaultOp::ElementAccumulator,
-                              ElementCompute,
-                              kIsFirst::value,
-                              kIsLast::value,
-                              cutlass::Array<ElementCompute, kQueriesPerBlock>>;
-                      using Epilogue = typename cutlass::epilogue::threadblock::
-                          EpiloguePipelined<
-                              typename DefaultEpilogue::Shape,
-                              typename MM1::Mma::Operator,
-                              DefaultEpilogue::kPartitionsK,
-                              typename cutlass::platform::conditional<
-                                  kIsLast::value,
-                                  typename MM1::OutputTileIterator,
-                                  typename MM1::OutputTileIteratorAccum>::type,
-                              typename DefaultEpilogue::
-                                  AccumulatorFragmentIterator,
-                              typename DefaultEpilogue::WarpTileIterator,
-                              typename DefaultEpilogue::SharedLoadIterator,
-                              EpilogueOutputOp,
-                              typename DefaultEpilogue::Padding,
-                              DefaultEpilogue::kFragmentsPerIteration,
-                              true, // IterationsUnroll
-                              typename MM1::OutputTileIteratorAccum // Read
-                                                                    // iterator
-                              >;
-                      int col = blockN * MM1::Mma::Shape::kN;
-                      auto source_iter = createOutputAccumIter(col);
-                      auto dest_iter = call_conditional<
-                          kIsLast::value,
-                          decltype(createOutputIter),
-                          decltype(createOutputAccumIter)>::
-                          apply(createOutputIter, createOutputAccumIter, col);
-                      EpilogueOutputOp rescale(s_prime, out_rescale);
-                      Epilogue epilogue(
-                          shared_storage.epilogue_shared_storage(),
-                          thread_id(),
-                          my_warp_id,
-                          my_lane_id);
-                      epilogue(rescale, dest_iter, accum_o, source_iter);
-                    }));
-              }));
-          if (!kSingleValueIteration) {
-            __syncthreads();
-          }
-        }
-      }
-      __syncthreads(); // we modify `m_prime` after
-    }
-    if (kKeepOutputInRF) {
-      constexpr bool kIsFirst = true;
-      constexpr bool kIsLast = true;
-      using DefaultEpilogue = typename MM1::DefaultEpilogue;
-      using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp;
-      using ElementCompute = typename DefaultOp::ElementCompute;
-      using EpilogueOutputOp =
-          typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize<
-              output_t, // output
-              output_accum_t, // source
-              DefaultOp::kCount,
-              typename DefaultOp::ElementAccumulator, // accum
-              output_accum_t, // compute
-              kIsFirst,
-              kIsLast,
-              cutlass::Array<ElementCompute, kQueriesPerBlock>>;
-      using Epilogue =
-          typename cutlass::epilogue::threadblock::EpiloguePipelined<
-              typename DefaultEpilogue::Shape,
-              typename MM1::Mma::Operator,
-              DefaultEpilogue::kPartitionsK,
-              typename MM1::OutputTileIterator, // destination
-              typename DefaultEpilogue::AccumulatorFragmentIterator,
-              typename DefaultEpilogue::WarpTileIterator,
-              typename DefaultEpilogue::SharedLoadIterator,
-              EpilogueOutputOp,
-              typename DefaultEpilogue::Padding,
-              DefaultEpilogue::kFragmentsPerIteration,
-              true, // IterationsUnroll
-              typename MM1::OutputTileIteratorAccum // source tile
-              >;
-      auto dest_iter = createOutputIter(0);
-      EpilogueOutputOp rescale(s_prime, out_rescale);
-      Epilogue epilogue(
-          shared_storage.epilogue_shared_storage(),
-          thread_id(),
-          warp_id(),
-          lane_id());
-      MM1::Mma::drain_cp_asyncs();
-      epilogue(rescale, dest_iter, accum_o);
-    }
-    // 7. Calculate logsumexp
-    // To make the backward easier, we pad logsumexp with `inf`
-    // this avoids a few bound checks, and is not more expensive during fwd
-    static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, "");
-    if (p.logsumexp_ptr && thread_id() < kQueriesPerBlock) {
-      auto lse_dim = ceil_div((int32_t)p.num_queries, kAlignLSE) * kAlignLSE;
-      constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E
-      if (thread_id() < p.num_queries) {
-        p.logsumexp_ptr[thread_id()] = accum_t(mi[thread_id()] / kLog2e) +
-            cutlass::fast_log(accum_t(s_prime[thread_id()]));
-      } else if (thread_id() < lse_dim) {
-        p.logsumexp_ptr[thread_id()] =
-            cutlass::platform::numeric_limits<accum_t>::infinity();
-      }
-    }
-  }
-  /// One streaming-softmax step over the scores held in `frag`.
-  ///
-  /// `frag` is first multiplied by `scaling * log2(e)` so that every
-  /// exponential below can be evaluated with `exp2f`. The running per-row
-  /// maximum `mi` is then raised to cover this tile; for rows whose maximum
-  /// grew, `out_rescale` is set to `exp2f(m_prime - mi)` and `s_prime` is
-  /// multiplied by it (otherwise `out_rescale` is 1). `frag` is overwritten
-  /// with `exp2f(frag - mi)` (0 for columns `>= max_col`) and its row sums are
-  /// added into `s_prime`. When `kKeepOutputInRF` is set and `is_first` is
-  /// false, `frag_o` is also multiplied row-wise by `out_rescale`.
-  ///
-  /// The body contains three `__syncthreads()` barriers, so presumably every
-  /// thread of the block has to call it together -- confirm against the
-  /// caller.
-  ///
-  /// NOTE(review): the `thread_id` parameter is never read in this function.
-  template <typename WarpIteratorC>
-  CUTLASS_DEVICE static void iterative_softmax(
-      typename WarpIteratorC::Fragment& frag_o, // output so far
-      typename WarpIteratorC::Fragment& frag,
-      cutlass::Array<accum_t, kQueriesPerBlock>& mi,
-      cutlass::Array<accum_t, kQueriesPerBlock>& m_prime,
-      cutlass::Array<accum_t, kQueriesPerBlock>& s_prime,
-      cutlass::Array<accum_t, kQueriesPerBlock>& out_rescale,
-      cutlass::Array<accum_t, kQueriesPerBlock * MM0::MmaCore::WarpCount::kN>&
-          addition_storage,
-      int8_t lane_id,
-      int8_t thread_id,
-      int8_t warp_id,
-      int max_col,
-      bool is_first,
-      typename WarpIteratorC::TensorCoord const& tile_offset,
-      float scaling) {
-    /* Iterates on the accumulator and corresponding position on result matrix
-    (1) Update `mi[r]` to the max value of the row `r`
-    (2) In a second iteration do the following:
-        (a) accum   <- exp(accum - mi)
-        (b) m_prime <- exp(m_prime - mi)
-        (c) s_prime <- s_prime * m_prime + sum(accum)
-    All of this is done on registers, before we store all of this
-    on shared memory for the next matmul with Value.
-    */
-    using Fragment = typename WarpIteratorC::Fragment;
-    using LambdaIterator = typename DefaultMmaAccumLambdaIterator<
-        WarpIteratorC,
-        accum_t,
-        kWarpSize>::Iterator;
-    // Convert to `accum_t` (rather than double)
-    constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E
-    static_assert(kQueriesPerBlock % kNumWarpsPerBlock == 0, "");
-    // Number of rows each warp handles in the per-row bookkeeping sections
-    // below (rows `warp_id * kLinesPerWarp + lane_id`).
-    static constexpr int kLinesPerWarp = kQueriesPerBlock / kNumWarpsPerBlock;
-    // Pre-scale by `scaling * log2(e)`: exp(x) == exp2(x * log2(e)), so the
-    // `exp2f` calls below yield the natural exponential of the scaled scores.
-    frag = cutlass::multiplies<Fragment>()(scaling * kLog2e, frag);
-    auto lane_offset =
-        LambdaIterator::get_lane_offset(lane_id, warp_id, tile_offset);
-    // First update `mi` to the max per-row
-    {
-      accum_t max;
-      LambdaIterator::iterateRows(
-          lane_offset,
-          // Row start: reset the thread-local maximum.
-          [&](int accum_m) {
-            max = -cutlass::platform::numeric_limits<accum_t>::infinity();
-          },
-          // Per element: only columns below `max_col` take part in the max.
-          [&](int accum_m, int accum_n, int idx) {
-            if (accum_n < max_col) {
-              max = cutlass::fast_max(max, frag[idx]);
-            }
-          },
-          // Row end: merge the thread-local maximum into `mi`.
-          [&](int accum_m) {
-            // Having 4x atomicMax seems faster than reduce within warp
-            // first...
-            atomicMaxFloat(&mi[accum_m], max);
-          });
-    }
-    // Make sure we all share the update values for `mi`
-    __syncthreads();
-    // Doing this `exp` is quite expensive. Let's
-    // split it across the warps
-    bool restore_mi_to_minus_inf = false;
-    if (lane_id < kLinesPerWarp) {
-      int id = warp_id * kLinesPerWarp + lane_id;
-      auto m_prime_id = m_prime[id];
-      auto mi_id = mi[id];
-      bool changed = m_prime_id < mi_id; // `false` if both are -inf
-      if (changed) {
-        // The row maximum grew: the factor exp2(old_max - new_max) rescales
-        // the running sum here and is published in `out_rescale`.
-        auto m_prime_exp = exp2f(m_prime_id - mi_id);
-        out_rescale[id] = m_prime_exp;
-        s_prime[id] *= m_prime_exp;
-      } else {
-        // Only when bias is enabled, it's possible that all the first values
-        // of attention are masked to `-inf`. In that case we want to avoid
-        // `nan = exp2f(-inf - (-inf))` so we temporarily set `mi` to 0
-        if (kSupportsBias &&
-            mi_id == -cutlass::platform::numeric_limits<accum_t>::infinity()) {
-          restore_mi_to_minus_inf = true;
-          mi[id] = 0.0f;
-        }
-        out_rescale[id] = 1.0f;
-      }
-    }
-    __syncthreads(); // Update output fragments
-    // Only rescale `frag_o` when the output is kept in the fragment
-    // (`kKeepOutputInRF`) and this is not the first call (`is_first`).
-    if (kKeepOutputInRF && !is_first) {
-      accum_t line_rescale;
-      LambdaIterator::iterateRows(
-          lane_offset,
-          [&](int accum_m) { line_rescale = out_rescale[accum_m]; },
-          [&](int accum_m, int accum_n, int idx) {
-            frag_o[idx] = frag_o[idx] * line_rescale;
-          },
-          [&](int accum_m) {});
-    }
-    // Update accum_m, accum_n, ...
-    {
-      accum_t mi_row, total_row;
-      // Pass 1: replace each score by exp2(score - row max); columns at or
-      // beyond `max_col` become 0.
-      LambdaIterator::iterateRows(
-          lane_offset,
-          [&](int accum_m) { mi_row = mi[accum_m]; },
-          [&](int accum_m, int accum_n, int idx) {
-            frag[idx] =
-                (accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0);
-          },
-          [&](int accum_m) {});
-      // Pass 2: sum each row; the thread for which `reduceSameRow` returns
-      // true stores the reduced sum in the `addition_storage` slot selected
-      // by the row and `tile_offset.column()`.
-      LambdaIterator::iterateRows(
-          lane_offset,
-          [&](int accum_m) { total_row = 0.0; },
-          [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; },
-          [&](int accum_m) {
-            if (LambdaIterator::reduceSameRow(
-                    lane_id, total_row, [](accum_t a, accum_t b) {
-                      return a + b;
-                    })) {
-              // NOTE: we could atomically add `total_row` to `s_prime`, but
-              // it's faster (and deterministic) to avoid atomics here
-              addition_storage
-                  [accum_m + kQueriesPerBlock * tile_offset.column()] =
-                      total_row;
-            }
-          });
-    }
-    // All partial row sums must be written before they are accumulated below.
-    __syncthreads();
-    if (lane_id < kLinesPerWarp) {
-      int id = warp_id * kLinesPerWarp + lane_id;
-      accum_t total_row = s_prime[id];
-      if (restore_mi_to_minus_inf) {
-        // Restore `mi`, see above when we set `restore_mi_to_minus_inf=true`
-        mi[id] = -cutlass::platform::numeric_limits<accum_t>::infinity();
-      } else {
-        // Remember the maximum used for this tile for the next call.
-        m_prime[id] = mi[id];
-      }
-      // Fold the `WarpCount::kN` partial sums of this row into `s_prime`.
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) {
-        total_row += addition_storage[id + kQueriesPerBlock * i];
-      }
-      s_prime[id] = total_row;
-    }
-  }
-  /// Returns `threadIdx.x` narrowed to `int8_t`.
-  static CUTLASS_DEVICE int8_t lane_id() {
-    return static_cast<int8_t>(threadIdx.x);
-  }
-  /// Returns `threadIdx.y` narrowed to `int8_t`.
-  static CUTLASS_DEVICE int8_t warp_id() {
-    return static_cast<int8_t>(threadIdx.y);
-  }
-  /// Returns the linear index `threadIdx.x + threadIdx.y * blockDim.x`,
-  /// narrowed to `int16_t`.
-  static CUTLASS_DEVICE int16_t thread_id() {
-    return static_cast<int16_t>(threadIdx.x + threadIdx.y * blockDim.x);
-  }
-};
-/// Kernel entry point: runs `AK::attention_kernel(p)` only when
-/// `p.advance_to_block()` returns true; otherwise the block returns without
-/// doing any work.
-template <typename AK>
-__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
-    attention_kernel_batched_impl(typename AK::Params p) {
-  if (p.advance_to_block()) {
-    AK::attention_kernel(p);
-  }
-}
-// Declaration only: no definition of this kernel template is visible here.
-// It uses the same launch bounds as `attention_kernel_batched_impl`.
-template <typename AK>
-__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm)
-    attention_kernel_batched(typename AK::Params params);

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/piped_subprocess.py DELETED Viewed

@@ -1,144 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-from typing import List
-import torch
-import subprocess
-import sys
-import tempfile
-import os
-import numpy as np
-TORCH_DTYPE_NAME = {
-    torch.float32: "f32",
-    torch.float16: "f16",
-    torch.bfloat16: "b16"
-}
-NAME_TORCH_DTYPE = {v: k for k, v in TORCH_DTYPE_NAME.items()}
-def _tensor_from_storage(tensor: torch.Tensor, dtype) -> torch.Tensor:
-    # PyTorch >= 2.0
-    if hasattr(tensor, 'untyped_storage'):
-        return torch.tensor([], dtype=dtype).set_(tensor.untyped_storage())
-    return torch.tensor([], dtype=dtype).set_(tensor.storage().untyped())
-class PipedSubprocess:
-    def __init__(self, binary: str) -> None:
-        self.binary = binary
-        self.tempdir_ctx = tempfile.TemporaryDirectory()
-    def __enter__(self) -> "PipedSubprocess":
-        self.subp = subprocess.Popen(self.binary, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr, text=True, bufsize=0)
-        self.tempdir = self.tempdir_ctx.__enter__()
-        self.file_counter = 0
-        return self
-    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-        self.tempdir_ctx.__exit__(exc_type, exc_val, exc_tb)
-    def temp_filename(self, suffix: str) -> str:
-        self.file_counter += 1
-        return os.path.join(self.tempdir, f"{self.file_counter}{suffix}")
-    def write(self, *args) -> None:
-        for a in args:
-            self.subp.stdin.write(str(a) + " ")
-    def writeTensor(self, tensor: torch.Tensor, name: str, stride_names: List[str]) -> None:
-        print(f"Py ->C++: {TORCH_DTYPE_NAME[tensor.dtype]}:{name}")
-        tensor_u8 = _tensor_from_storage(tensor, torch.uint8)
-        self.write("tensor_begin", f"{TORCH_DTYPE_NAME[tensor.dtype]}:{name}", tensor_u8.shape[0])
-        filename = self.temp_filename(f"{name}.tensor")
-        assert tensor.storage_offset() == 0
-        with open(filename, "wb+") as fd:
-            fd.write(bytes(tensor_u8.numpy()))
-        self.write("file", filename)
-        self.write("tensor_end")
-        for stride_name, stride_value in zip(stride_names, tensor.stride()):
-            self.write(stride_name, stride_value)
-    def readTensor(self, name, stride_name, shape) -> torch.Tensor:
-        tmpfile = self.temp_filename(f"{name}.tensor")
-        self.write("tmpfile", tmpfile)
-        self.readExpect("tensor_begin")
-        dtype_str, name = self.read().split(":")
-        print(f"C++->Py : {dtype_str}:{name}")
-        u8len = int(self.read())
-        dtype = NAME_TORCH_DTYPE[dtype_str]
-        self.readExpect("file")
-        self.readExpect(tmpfile)
-        with open(tmpfile, "rb") as fd:
-            data = fd.read(u8len)
-            # `np.array` is not strictly needed, but avoids a torch warning
-            tensor_u8 = torch.frombuffer(np.array(data), dtype=torch.uint8, count=u8len)
-        self.readExpect("tensor_end")
-        tensor = _tensor_from_storage(tensor_u8, dtype)
-        strides = []
-        for sn in stride_name:
-            self.readExpect(sn)
-            strides.append(int(self.read()))
-        if len(strides) != shape:
-            strides.append(1)
-        assert len(strides) == len(shape), name
-        return torch.as_strided(tensor, shape, strides)
-    def readNamed(self, name: str):
-        self.readExpect(name)
-        return self.read()
-    def readExpect(self, what: str) -> None:
-        r = self.read()
-        if r != what:
-            raise ValueError(f"Read {r} but expected {what}")
-    def read(self):
-        read_all = []
-        # Skip initial whitespace
-        while True:
-            r = self.subp.stdout.read(1)
-            if r not in [' ', "\n"]:
-                read_all.append(r)
-                break
-        # Read data
-        while True:
-            r = self.subp.stdout.read(1)
-            if r in [' ', "\n"]:
-                break
-            read_all.append(r)
-        return ''.join(read_all)

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h DELETED Viewed

@@ -1,90 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <cutlass/cutlass.h>
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-/// Helper that copies one `ThreadblockTileShape` tile of `scalar_t` elements
-/// from a predicated (global-memory) tile iterator to a regular
-/// (shared-memory) tile iterator, both row-major and sharing one thread map.
-template <
-    typename scalar_t, // scalar type
-    typename ThreadblockTileShape, // size of tile to load
-    int Threads, // number of participating threads
-    int ElementsPerAccess> // thread access width in elements
-class TileSmemLoader {
- public:
-  /// Buffer type with room for one full tile (`ThreadblockTileShape::kCount`
-  /// elements).
-  using SmemTile =
-      cutlass::AlignedBuffer<scalar_t, ThreadblockTileShape::kCount>;
-  /// Thread map used by both iterators; columns are the contiguous
-  /// dimension and rows the strided one.
-  using ThreadMap = cutlass::transform::PitchLinearStripminedThreadMap<
-      cutlass::layout::PitchLinearShape<
-          ThreadblockTileShape::kColumn, // contiguous
-          ThreadblockTileShape::kRow>, // strided
-      Threads, // Threads
-      ElementsPerAccess>; // ElementsPerAccess
-  /// Source iterator (predicated loads).
-  using GmemTileIterator =
-      cutlass::transform::threadblock::PredicatedTileIterator<
-          ThreadblockTileShape, // Shape
-          scalar_t, // Element
-          cutlass::layout::RowMajor, // Layout
-          0, // AdvanceRank
-          ThreadMap>; // ThreadMap
-  /// Destination iterator.
-  using SmemTileIterator = cutlass::transform::threadblock::RegularTileIterator<
-      ThreadblockTileShape, // Shape
-      scalar_t, // Element
-      cutlass::layout::RowMajor, // Layout
-      0, // AdvanceRank
-      ThreadMap>; // ThreadMap
-  /// Per-thread fragment moved between the two iterators.
-  using Fragment = typename GmemTileIterator::Fragment;
-  /// load a tile from global memory into shared memory
-  ///
-  /// Both iterators are taken by value, so the caller's copies are not
-  /// advanced. The call ends with `__syncthreads()`.
-  CUTLASS_DEVICE
-  static void load(
-      GmemTileIterator tile_load_iter,
-      SmemTileIterator tile_store_iter) {
-    Fragment tb_frag;
-    // Clear the fragment before loading into it.
-    tb_frag.clear();
-    tile_load_iter.load(tb_frag);
-    tile_store_iter.store(tb_frag);
-    __syncthreads();
-  }
-};

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h DELETED Viewed

@@ -1,154 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/epilogue/thread/linear_combination.h"
-#include "cutlass/epilogue/thread/linear_combination_clamp.h"
-#include "cutlass/epilogue/thread/conversion_op.h"
-#include "cutlass/epilogue/thread/reduction_op.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
-#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
-#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
-#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
-#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
-#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
-// #include "cutlass/epilogue/threadblock/epilogue.h"
-#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
-#include "fused_bias_act_epilogue.h"
-#include "../warp/fused_bias_act_fragment_iterator_tensor_op.h"
-#include "output_tile_thread_map_for_fused_bias.h"
-#include "default_thread_map_tensor_op_for_fused_bias.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-/// Defines sensible defaults for epilogues for TensorOps.
-///
-/// Assembles, from the threadblock shape, the warp-level MMA operator and the
-/// output functor, the thread map, the output tile iterator, the accumulator
-/// fragment iterator and the resulting `FusedBiasActEpilogue` type.
-template <
-  typename Shape_,
-  typename WarpMmaTensorOp_,
-  int PartitionsK,
-  typename OutputOp_,
-  int ElementsPerAccess
->
-struct DefaultFusedBiasActEpilogueTensorOp {
-  // Re-export the template arguments.
-  using Shape = Shape_;
-  using WarpMmaTensorOp = WarpMmaTensorOp_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputOp = OutputOp_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  // Element and layout types taken from the output functor and the warp MMA.
-  using ElementOutput = typename OutputOp::ElementOutput;
-  using LayoutC = typename WarpMmaTensorOp::LayoutC;
-  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
-  //
-  // Thread map
-  //
-  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOpForFusedBias<
-    Shape,
-    typename WarpMmaTensorOp::Shape,
-    kPartitionsK,
-    ElementOutput,
-    kElementsPerAccess
-  >::Type;
-  // Output tile iterator built on the thread map above.
-  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
-    OutputTileThreadMap,
-    ElementOutput
-  >;
-  // Complex output element types use the stock complex fragment iterator;
-  // every other type uses the fused bias/activation fragment iterator.
-  // NOTE(review): `std::conditional` is used without a direct
-  // `#include <type_traits>` in this header -- presumably it arrives
-  // transitively; confirm.
-  using AccumulatorFragmentIterator = typename std::conditional<is_complex<ElementOutput>::value,
-                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        LayoutC>,
-                                    cutlass::epilogue::warp::FusedBiasActFragmentIteratorTensorOp<
-                                        typename WarpMmaTensorOp::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
-                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
-                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
-                                        LayoutC> >::type;
-  //
-  // Define the epilogue
-  //
-  using Epilogue = cutlass::epilogue::threadblock::FusedBiasActEpilogue<
-    Shape,
-    WarpMmaTensorOp,
-    kPartitionsK,
-    OutputTileIterator,
-    AccumulatorFragmentIterator,
-    OutputOp
-  >;
-};
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h DELETED Viewed

@@ -1,113 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
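-  \brief Defines DefaultThreadMapTensorOpForFusedBias, the default epilogue thread map for TensorOp accumulator layouts.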
-*/
-#pragma once
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/layout/pitch_linear.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-////////////////////////////////////////////////////////////////////////////////
-/// Defines the optimal thread map for TensorOp accumulator layouts.
-///
-/// Template parameters:
-///   ThreadblockShape_  - threadblock tile shape; kM and kN are divided by the warp shape
-///   WarpShape_         - warp tile shape; kM and kN are read
-///   PartitionsK        - number of partitions along K; becomes the third extent of WarpCount
-///   Element_           - element type; only its bit width is forwarded to the thread map
-///   ElementsPerAccess  - number of elements per access, forwarded to the thread map
-template <
-  typename ThreadblockShape_,
-  typename WarpShape_,
-  int PartitionsK,
-  typename Element_,
-  int ElementsPerAccess
->
-struct DefaultThreadMapTensorOpForFusedBias {
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  static int const kPartitionsK = PartitionsK;
-  using Element = Element_;
-  static int const kElementsPerAccess = ElementsPerAccess;
-  //
-  // Definitions
-  //
-  struct Detail {
-    /// Tensor Operations fundamentally perform operations on 8 rows
-    static int const kTensorOpRows = 8;
-    static int const kWarpSize = 32;
-    // The threadblock tile must be a whole number of warp tiles along BOTH M and N:
-    // WarpCount below uses integer division in each dimension, so a remainder in
-    // either one would silently drop part of the tile from the warp count.
-    static_assert(
-      !(ThreadblockShape::kM % WarpShape::kM) &&
-      !(ThreadblockShape::kN % WarpShape::kN), "Divisibility");
-    /// Number of warps
-    using WarpCount = gemm::GemmShape<
-      ThreadblockShape::kM / WarpShape::kM,
-      ThreadblockShape::kN / WarpShape::kN,
-      kPartitionsK
-    >;
-    /// Number of participating threads
-    static int const kThreads = WarpCount::kCount * kWarpSize;
-  };
-  //
-  // ThreadMap
-  //
-  /// ThreadMap to be used by epilogue::PredicatedTileIterator satisfying concept OutputTileThreadMap
-  using Type = OutputTileOptimalThreadMapBiasAct <
-    OutputTileShape<ThreadblockShape::kN, Detail::kTensorOpRows, Detail::WarpCount::kM, 1, 1>,
-    OutputTileShape<1, WarpShape::kM / Detail::kTensorOpRows, 1, 1, WarpShape::kM / Detail::kTensorOpRows>,
-    Detail::kThreads,
-    kElementsPerAccess,
-    sizeof_bits<Element>::value
-  >;
-};
-///////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h DELETED Viewed

@@ -1,213 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include CUDA_STD_HEADER(cassert)
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-////////////////////////////////////////////////////////////////////////////////
-/// Epilogue operator without splitk.
-///
-/// Walks a warp-level accumulator tile fragment by fragment, applies the output operator to
-/// each fragment (optionally together with a fragment loaded from a source iterator), and
-/// stores each result into a second accumulator tile through a fragment iterator.
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename OutputOp_                        ///< Output operator
->
-class FusedBiasActEpilogue {
-public:
-  //
-  // Template arguments re-exported as member types / constants
-  //
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  using OutputTileIterator = OutputTileIterator_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using OutputOp = OutputOp_;
-  /// Output layout is always row-major
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename AccumulatorFragmentIterator::AccumulatorTile;
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-  //
-  // Compile-time sanity checks on the output iterator
-  //
-  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
-  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
-    "Divisibility");
-  /// Constructor (the epilogue holds no state)
-  CUTLASS_DEVICE
-  FusedBiasActEpilogue() { }
-  /// Applies the output operator to every accumulator fragment. The source iterator is only
-  /// consumed when output_op.is_source_needed() returns true.
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                      ///< Output operator
-    AccumulatorTile &accumulators,                  ///< Complete warp-level accumulator tile (input)
-    AccumulatorTile & fused_bias_act_accumlators,   ///< Accumulator tile receiving the results
-    OutputTileIterator source_iterator) {           ///< Iterator from which source fragments are loaded
-    if (!output_op.is_source_needed()) {
-      compute_source_no_needed_(output_op, accumulators, fused_bias_act_accumlators);
-      return;
-    }
-    compute_source_needed_(output_op, accumulators, fused_bias_act_accumlators, source_iterator);
-  }
-  /// Overload without a source iterator: always takes the source-free path.
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp const &output_op,                      ///< Output operator
-    AccumulatorTile &accumulators,                  ///< Complete warp-level accumulator tile (input)
-    AccumulatorTile & fused_bias_act_accumlators) { ///< Accumulator tile receiving the results
-    compute_source_no_needed_(output_op, accumulators, fused_bias_act_accumlators);
-  }
-  /// Per fragment: loads a source fragment and an accumulator fragment, evaluates
-  /// output_op(accumulator, source) and stores the result into the destination tile.
-  /// NOTE(review): this loop is bounded by OutputTileIterator::kIterations while the
-  /// source-free variant uses AccumulatorFragmentIterator::kIterations - confirm both agree.
-  CUTLASS_DEVICE
-  void compute_source_needed_(
-    OutputOp const &output_op,                      ///< Output operator
-    AccumulatorTile &accumulators,                  ///< Complete warp-level accumulator tile (input)
-    AccumulatorTile & fused_bias_act_accumlators,   ///< Accumulator tile receiving the results
-    OutputTileIterator source_iterator) {           ///< Iterator from which source fragments are loaded
-    using AccumFragment = typename AccumulatorFragmentIterator::Fragment;
-    using SourceFragment = typename OutputTileIterator::Fragment;
-    SourceFragment source_frag;
-    source_frag.clear();
-    AccumulatorFragmentIterator src_accum_it(accumulators);
-    AccumulatorFragmentIterator dst_accum_it(fused_bias_act_accumlators);
-    CUTLASS_PRAGMA_UNROLL
-    for (int step = 0; step < OutputTileIterator::kIterations; ++step) {
-      // Fetch the source fragment for this step and advance the source iterator
-      source_iterator.load(source_frag);
-      ++source_iterator;
-      // Fetch the matching accumulator fragment
-      AccumFragment accum_frag;
-      src_accum_it.load(accum_frag);
-      ++src_accum_it;
-      // Combine both and write the result into the destination tile
-      AccumFragment result_frag;
-      result_frag = output_op(accum_frag, source_frag);
-      dst_accum_it.store(result_frag);
-      ++dst_accum_it;
-    }
-  }
-  /// Per fragment: loads an accumulator fragment, evaluates output_op(accumulator) and
-  /// stores the result into the destination tile. No source is read.
-  CUTLASS_DEVICE
-  void compute_source_no_needed_(
-    OutputOp const &output_op,                      ///< Output operator
-    AccumulatorTile &accumulators,                  ///< Complete warp-level accumulator tile (input)
-    AccumulatorTile & fused_bias_act_accumlators) { ///< Accumulator tile receiving the results
-    using AccumFragment = typename AccumulatorFragmentIterator::Fragment;
-    AccumulatorFragmentIterator src_accum_it(accumulators);
-    AccumulatorFragmentIterator dst_accum_it(fused_bias_act_accumlators);
-    CUTLASS_PRAGMA_UNROLL
-    for (int step = 0; step < AccumulatorFragmentIterator::kIterations; ++step) {
-      AccumFragment accum_frag;
-      src_accum_it.load(accum_frag);
-      ++src_accum_it;
-      AccumFragment result_frag;
-      result_frag = output_op(accum_frag);
-      dst_accum_it.store(result_frag);
-      ++dst_accum_it;
-    }
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h DELETED Viewed

@@ -1,311 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Metaprogram for determining the mapping of output elements to threads for epilogue tiles.
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// RowArrangement determines how one or more warps cover a region of consecutive rows.
-/// Primary template: declared only. The two partial specializations below (selected by
-/// Is2dTile) provide the actual constants.
-template <
-  typename Shape,
-  int WarpsRemaining,
-  int ElementsPerAccess,
-  int ElementSize,
-  bool Is2dTile
->
-struct RowArrangementBiasAct;
-/// RowArrangement in which each warp's access is a 1D tiled arrangement
-/// (Is2dTile == false): one row per access, all remaining warps spread across columns.
-template <
-  typename Shape,
-  int WarpsRemaining,
-  int ElementsPerAccess,
-  int ElementSize
->
-struct RowArrangementBiasAct<Shape, WarpsRemaining, ElementsPerAccess, ElementSize, false> {
-  static const int kWarpSize = 32;
-  static const int kElementsPerAccess = ElementsPerAccess;
-  static const int kElementSize = ElementSize;
-  // Row direction: a single iteration
-  static const int kIterationsRow = 1;
-  static const int kDeltaRow = 1;
-  // Column direction: each iteration advances by one warp-wide access
-  static const int kIterationsColumn = Shape::kColumn / kElementsPerAccess / kWarpSize;
-  static const int kDeltaColumn = kWarpSize * kElementsPerAccess;
-  // Footprint of one warp access: kWarpSize accesses wide, one row tall
-  static const int kAccessWidth = kWarpSize;
-  static const int kAccessRows = 1;
-  // Distribution of the remaining warps
-  static const int kWarpPartitionsRow = 1;
-  static const int kWarpPartitionsColumn = WarpsRemaining;
-};
-/// RowArrangement in which each warp's access is a 2D tiled arrangement
-/// (Is2dTile == true): a warp access covers kAccessRows rows by kAccessWidth accesses.
-template <
-  typename Shape,
-  int WarpsRemaining,
-  int ElementsPerAccess,
-  int ElementSize
->
-struct RowArrangementBiasAct<Shape, WarpsRemaining, ElementsPerAccess, ElementSize, true> {
-  static const int kMemoryAccessSize = 4;//128;
-  static const int kWarpSize = 32;
-  static const int kElementsPerAccess = ElementsPerAccess;
-  static const int kElementSize = ElementSize;
-  struct Detail {
-    // Rows handled per remaining warp, and tile width counted in accesses
-    static const int kShapeRow = Shape::kRow / WarpsRemaining;
-    static const int kShapeWidth = Shape::kColumn / kElementsPerAccess;
-    // Number of accesses that fit in kMemoryAccessSize; (kElementsPerAccess * kElementSize / 8)
-    // is the size of one access, with kElementSize divided by 8 to convert it
-    static const int kTargetMemoryAccessWidth =
-      kMemoryAccessSize / (kElementsPerAccess * kElementSize / 8);
-    // Rows a warp would cover when using the target access width
-    static const int kTargetAccessRows = kWarpSize / kTargetMemoryAccessWidth;
-  };
-  // When the target row count exceeds the rows available, the warp is spread over all
-  // available rows; otherwise the width is clamped by the tile width, the warp size and
-  // the target access width.
-  static const int kAccessWidth =
-    (Detail::kTargetAccessRows > Detail::kShapeRow
-      ? kWarpSize / Detail::kShapeRow
-      : const_min(
-          Detail::kShapeWidth,
-          const_min(kWarpSize, Detail::kTargetMemoryAccessWidth)));
-  static const int kAccessRows =
-    (Detail::kTargetAccessRows > Detail::kShapeRow
-      ? Detail::kShapeRow
-      : const_min(Shape::kRow, kWarpSize / kAccessWidth));
-  // Row direction
-  static const int kIterationsRow = Detail::kShapeRow / kAccessRows;
-  static const int kDeltaRow = kAccessRows;
-  // Column direction
-  static const int kIterationsColumn = Detail::kShapeWidth / kAccessWidth;
-  static const int kDeltaColumn = kAccessWidth * kElementsPerAccess;
-  static_assert( kAccessWidth * kElementsPerAccess <= Shape::kColumn, "Accessing too many elements per access");
-  static_assert( kIterationsColumn > 0, "Iteration Count Column must be > 0" );
-  static_assert( kIterationsRow > 0, "Iteration Count Row must be > 0" );
-  // No further warp partitioning in the 2D arrangement
-  static const int kWarpPartitionsRow = 1;
-  static const int kWarpPartitionsColumn = 1;
-};
-}
-////////////////////////////////////////////////////////////////////////////////
-/// Template metaprogram for partitioning a 4D space across warps to achieve several performance
-/// objectives:
-///
-///   - coalesced memory accesses in units of 16 Byte lines
-///   - minimal address arithmetic
-///   - minimal predicate calculations
-///
-/// Warps are assigned to clusters first, then to groups, and whatever remains is handed to
-/// detail::RowArrangementBiasAct to cover rows and columns.
-template <
-  typename Shape_,
-  typename Count_,
-  int Threads,
-  int ElementsPerAccess,
-  int ElementSize
->
-struct OutputTileOptimalThreadMapBiasAct {
-  using Shape = Shape_;
-  using Count = Count_;
-  static const int kWarpSize = 32;
-  static const int kThreads = Threads;
-  static const int kWarpCount = kThreads / kWarpSize;
-  static const int kElementsPerAccess = ElementsPerAccess;
-  static const int kElementSize = ElementSize;
-  //
-  // Metaprogram computation
-  //
-  struct Detail {
-    // Clusters
-    /// True when there are more clusters than warps, i.e. warps must iterate over clusters
-    static const bool kIterateClusters = (Shape::kCluster > kWarpCount);
-    static const int kIterationsCluster =
-      (kIterateClusters ? Shape::kCluster / kWarpCount : 1);
-    static const int kDeltaCluster =
-      (kIterateClusters
-        ? Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup * Shape::kCluster / kIterationsCluster
-        : 1);
-    static const int kCompactedDeltaCluster =
-      (kIterateClusters
-        ? Shape::kRow * Shape::kGroup * Shape::kCluster / kIterationsCluster
-        : 1);
-    static const int kWarpPartitionsCluster =
-      (kIterateClusters ? kWarpCount : kWarpCount / Shape::kCluster);
-    static const int kWarpsRemainingForGroups =
-      (kIterateClusters ? 1 : kWarpCount / Shape::kCluster);
-    // Groups
-    /// True when there are more groups than warps left over from the cluster partitioning
-    static const bool kIterateGroups = (Shape::kGroup > kWarpsRemainingForGroups);
-    static const int kIterationsGroup =
-      (kIterateGroups ? Shape::kGroup / kWarpsRemainingForGroups : 1);
-    static const int kDeltaGroup =
-      (kIterateGroups
-        ? Shape::kRow * Count::kRow * Shape::kGroup / kIterationsGroup
-        : 1);
-    static const int kCompactedDeltaGroup =
-      (kIterateGroups
-        ? Shape::kRow * Shape::kGroup / kIterationsGroup
-        : 1);
-    static const int kWarpPartitionsGroup =
-      (kIterateGroups ? 1 : kWarpsRemainingForGroups / Shape::kGroup);
-    // Same value as kWarpPartitionsGroup: the warps not consumed by groups go to rows
-    static const int kWarpsRemainingForRows =
-      (kIterateGroups ? 1 : kWarpsRemainingForGroups / Shape::kGroup);
-    // Rows
-    // The 2D arrangement is chosen when there are more rows than warps left for them
-    using RowArrangement = detail::RowArrangementBiasAct<
-      Shape,
-      kWarpsRemainingForRows,
-      kElementsPerAccess,
-      kElementSize,
-      (Shape::kRow > kWarpsRemainingForRows)
-    >;
-    // Warp partitions
-    using WarpPartitions = OutputTileShape<
-      RowArrangement::kWarpPartitionsColumn,
-      RowArrangement::kWarpPartitionsRow,
-      kWarpPartitionsGroup,
-      kWarpPartitionsCluster,
-      1>;
-    static const int kAccessWidth = RowArrangement::kAccessWidth;
-    static const int kAccessRows = RowArrangement::kAccessRows;
-  };
-  //
-  // Output
-  //
-  /// Iteration counts, passed to OutputTileShape in the order column, row, group, cluster, 1
-  using Iterations = OutputTileShape<
-    Detail::RowArrangement::kIterationsColumn,
-    Detail::RowArrangement::kIterationsRow,
-    Detail::kIterationsGroup,
-    Detail::kIterationsCluster,
-    1>;
-  /// Strides between iterations, in the same argument order as Iterations
-  using Delta = OutputTileShape<
-    Detail::RowArrangement::kDeltaColumn,
-    Detail::RowArrangement::kDeltaRow,
-    Detail::kDeltaGroup,
-    Detail::kDeltaCluster,
-    1>;
-  /// Initial offset function: maps a thread index to the (row, column) coordinate at which
-  /// that thread starts.
-  CUTLASS_HOST_DEVICE
-  static MatrixCoord initial_offset(int thread_idx) {
-    // Split the thread index into warp and lane
-    int const warp = thread_idx / kWarpSize;
-    int const lane = thread_idx % kWarpSize;
-    // Peel the warp index apart: cluster, then group, then row / column
-    int const cluster = warp / Detail::WarpPartitions::kCluster;
-    int const within_cluster = warp % Detail::WarpPartitions::kCluster;
-    int const group = within_cluster / Detail::WarpPartitions::kGroup;
-    int const within_group = within_cluster % Detail::WarpPartitions::kGroup;
-    // Both RowArrangementBiasAct specializations set kWarpPartitionsRow to 1, so with them
-    // `row` equals within_group and `col` is 0.
-    int const row = within_group / Detail::WarpPartitions::kRow;
-    int const col = within_group % Detail::WarpPartitions::kRow;
-    // Position of the lane inside one warp access
-    int const lane_row = lane / Detail::kAccessWidth;
-    int const lane_col = lane % Detail::kAccessWidth;
-    // Contributions of each level to the output coordinate
-    int const cluster_rows = cluster * Shape::kRow * Count::kRow * Shape::kGroup * Count::kGroup;
-    int const group_rows = group * Shape::kRow * Count::kRow;
-    int const warp_rows = row * Iterations::kRow * Detail::kAccessRows;
-    int const warp_cols = col * Iterations::kColumn * Detail::kAccessWidth * kElementsPerAccess;
-    int const out_row = cluster_rows + group_rows + warp_rows + lane_row;
-    // NOTE(review): warp_cols already carries a kElementsPerAccess factor and is scaled
-    // again here - confirm this is intended for arrangements where `col` can be non-zero.
-    int const out_col = (warp_cols + lane_col) * kElementsPerAccess;
-    return MatrixCoord(out_row, out_col);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h DELETED Viewed

@@ -1,189 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile
-      that participate in one warp-level store operation.
-      Typically, the accumulator tile is the largest single block of register-backed storage
-      within the kernel. Storing it to memory is best accomplished by partitioning it into
-      smaller tiles and storing these sequentially.
-      Round trips through shared memory during the Epilogue phase require partitioning, as
-      shared memory capacity is typically insufficient for a threadblock's total accumulator
-      size.
-*/
-#pragma once
-#include "cutlass/array.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/epilogue/warp/tensor_op_policy.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace epilogue {
-namespace warp {
-////////////////////////////////////////////////////////////////////////////////
-/// Fragment iterator over a warp-level accumulator tile that can both load fragments from
-/// the tile and store fragments back into it. Primary template: declared only; see the
-/// row-major partial specialization below.
-template <
-  typename WarpShape,         ///< shape of warp-level GEMM (concept: MatrixShape)
-  typename OperatorShape,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array)
-  typename Layout             ///< target shared memory layout
->
-class FusedBiasActFragmentIteratorTensorOp;
-////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for row-major shared memory
-template <
-  typename WarpShape_,         ///< shape of the warp-level GEMM tile
-  typename OperatorShape_,     ///< matrix multiply operation shape (concept: gemm::GemmShape)
-  typename OperatorElementC_,  ///< matrix multiply operation data type (concept: data type)
-  typename OperatorFragmentC_  ///< matrix multiply operation fragment (concept: Array)
->
-class FusedBiasActFragmentIteratorTensorOp<WarpShape_, OperatorShape_, OperatorElementC_, OperatorFragmentC_, layout::RowMajor> {
-public:
-  using WarpShape = WarpShape_;
-  using OperatorShape = OperatorShape_;
-  using OperatorElementC = OperatorElementC_;
-  using OperatorFragmentC = OperatorFragmentC_;
-  using Layout = layout::RowMajor;
-  using Policy = TensorOpPolicy<WarpShape, OperatorShape, Layout>;
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<
-    OperatorElementC,
-    Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>;
-  /// This is the complete warp-level accumulator tile.
-  using AccumulatorTile = Array<
-    OperatorElementC,
-    OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn>;
-  using OutputAccumulatorTile = AccumulatorTile;
-  /// Number of times this iterator can be incremented
-  static int const kIterations = Policy::kIterations;
-private:
-  /// Internal access type
-  using AccessType = Array<OperatorElementC, Policy::kElementsPerAccess>;
-private:
-  //
-  // Data members
-  //
-  /// Accumulator tile, viewed as an array of AccessType. The iterator does not own it; the
-  /// tile passed to the constructor must outlive the iterator.
-  AccessType *accumulators_;
-  /// Internal index (in units of AccessType), moved by operator++ / operator--
-  int index_;
-public:
-  /// Constructs an iterator referring to `accum`, positioned at index 0
-  CUTLASS_HOST_DEVICE
-  FusedBiasActFragmentIteratorTensorOp(AccumulatorTile &accum):
-    accumulators_(reinterpret_cast<AccessType *>(&accum)),
-    index_(0) {
-  }
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  FusedBiasActFragmentIteratorTensorOp &operator++() {
-    ++index_;
-    return *this;
-  }
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  FusedBiasActFragmentIteratorTensorOp &operator--() {
-    --index_;
-    return *this;
-  }
-  /// Loads a fragment from the referenced part of the accumulator tile.
-  /// `index_offset` is added to the current index for this call only.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag, int index_offset = 0) const {
-    int index = index_ + index_offset;
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-      // One access per operator column, spaced by the accumulator column stride
-      int accumulator_access_offset =
-        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
-      frag_ptr[n] = accumulators_[accumulator_access_offset];
-    }
-  }
-  /// Stores a fragment to the referenced part of the accumulator tile.
-  /// Mirror image of load(): the same offsets are written instead of read. The fragment is
-  /// only read, so it is taken by const reference; const fragments and temporaries are
-  /// accepted in addition to the non-const lvalues that were accepted before.
-  /// `index_offset` is added to the current index for this call only.
-  CUTLASS_HOST_DEVICE
-  void store(Fragment const &frag, int index_offset = 0) const {
-    int index = index_ + index_offset;
-    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) {
-      int accumulator_access_offset =
-        index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess;
-      accumulators_[accumulator_access_offset] = frag_ptr[n];
-    }
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////
-} // namespace warp
-} // namespace epilogue
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h DELETED Viewed

@@ -1,427 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_conversion.h"
-namespace cutlass {
-namespace gemm {
-namespace warp {
-////////////////////////////////////////////////////////////////////////////////
-/// Fragment iterator over a warp's accumulator tile.
-///
-/// This is only the primary template declaration. The definitions in this file are
-/// the partial specializations for cutlass::layout::ColumnMajor and
-/// cutlass::layout::RowMajor, both with IsBetaZero_ == true; any other combination
-/// of Layout_ / IsBetaZero_ names an incomplete type.
-template <
-    /// Size of the matrix to load (concept: MatrixShape)
-    typename Shape_,
-    /// Size of the accumulation tile shape (concept: MatrixShape)
-    typename AccumulatorShape_,
-    /// KBlocks columns to compute residual
-    int KBlocksColumn_,
-    /// Accumulator Element type
-    typename ElementAccumulator_,
-    /// Element type
-    typename Element_,
-    /// Layout of operand in memory
-    typename Layout_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_,
-    /// Whether beta is zero
-    bool IsBetaZero_ >
-class MmaTensorOpPureFragmentIterator;
-// Partial specialization for col-major accumulator tile
-// And Element type is the same as Accumulator Element type
-//
-// Walks the accumulator tile one Shape-sized fragment at a time and hands the
-// accumulators back unchanged (no output operator is applied).
-template <
-    /// Shape of warp tile to load (concept: MatrixShape)
-    typename Shape_,
-    /// Shape of the warp accumulation tile (concept: MatrixShape)
-    typename AccumulatorShape_,
-    /// KBlocks columns to compute residual
-    int KBlocksColumn_,
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_>
-class MmaTensorOpPureFragmentIterator<Shape_, AccumulatorShape_, KBlocksColumn_, Element_, Element_,
-                                         cutlass::layout::ColumnMajor,
-                                         InstructionShape_, true> {
- public:
-  /// Shape of warp tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-  /// Shape of the warp accumulation tile (concept: MatrixShape)
-  using AccumulatorShape = AccumulatorShape_;
-  /// KBlocks columns to compute residual
-  static int const kKBlockColumn = KBlocksColumn_;
-  /// Element type
-  using Element = Element_;
-  /// Layout of source tile
-  using Layout = cutlass::layout::ColumnMajor;
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-  /// Whether beta is zero
-  static bool const IsBetaZero = true;
-  /// Number of participating threads
-  static int const kThreads = 32;
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-    static_assert(
-        !(AccumulatorShape::kRow % Shape::kRow) &&
-            !(AccumulatorShape::kColumn % Shape::kColumn),
-        "Shape of Warp Accumulator must be divisible by warp shape.");
-    static_assert(
-        !(kKBlockColumn % Shape::kColumn),
-        "KBlock size must be divisible by warp shape.");
-    /// Number of times this iterator can be incremented
-    static int const kIterations = AccumulatorShape::kCount / Shape::kCount;
-  };
-private:
-  /// Number of elements each thread holds per instruction-sized tile
-  static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads;
-  /// Number of mma operations performed by a warp
-  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
-                                    Shape::kColumn / InstructionShape::kN>;
-  /// Number of mma operations performed by the entire accumulator
-  using AccumulatorIterations = MatrixShape<AccumulatorShape::kRow / InstructionShape::kM,
-                                              AccumulatorShape::kColumn / InstructionShape::kN>;
-  /// Number of K iterations (accumulator columns divided by kKBlockColumn, rounded up)
-  static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn;
-  /// Accumulator columns left over once (kKBlockIterations - 1) full K blocks are removed
-  static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn;
-  /// Number of iterator positions covered by one full K block
-  static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn
-                                     * (AccumulatorShape::kRow / Shape::kRow);
-  /// Number of iterator positions covered by the residual columns
-  static int const kResidualIndex = kResidualColumn / Shape::kColumn
-                                     * (AccumulatorShape::kRow / Shape::kRow);
-public:
-  //
-  // Derived quantities
-  //
-  /// Fragment object holding a thread's part of a tile
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<Element, Shape::kCount / kThreads>;
-  /// Accumulator Fragment object
-  using AccumulatorFragment = Array<Element, AccumulatorShape::kCount / kThreads>;
-private:
-  /// Internal access type
-  using AccessType = Array<Element, kElementsPerAccess>;
-private:
-  //
-  // Data members
-  //
-  /// Accumulator tile
-  AccessType const *accumulators_;
-  /// Internal index
-  int index_;
-  /// Used to access residual tile first
-  bool is_residual_tile_;
-public:
-  /// Constructs an iterator positioned at index 0 of the residual tile.
-  /// Only a pointer to `accum` is kept, so `accum` must outlive the iterator.
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpPureFragmentIterator(AccumulatorFragment const &accum)
-      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
-        index_(0), is_residual_tile_(true) {}
-  /// Add offset
-  ///
-  /// While the iterator is still in the residual tile, reaching
-  /// kKBlockColumnIterations rebases the index onto kResidualIndex and leaves the
-  /// residual tile for good (the flag is never set again).
-  CUTLASS_HOST_DEVICE
-  void add_offset(int index_offset) {
-    index_ += index_offset;
-    if(is_residual_tile_ && index_ >= kKBlockColumnIterations) {
-      index_ = index_ - kKBlockColumnIterations + kResidualIndex;
-      is_residual_tile_ = false;
-    }
-  }
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpPureFragmentIterator &operator++() {
-    add_offset(1);
-    return *this;
-  }
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpPureFragmentIterator &operator--() {
-    add_offset(-1);
-    return *this;
-  }
-  /// Loads a fragment from the referenced part of the accumulator tile
-  ///
-  /// The fragment is zero-filled instead when the iterator is still in the residual
-  /// tile and index_ has reached kResidualIndex.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
-    int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow;
-    int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow
-                    * MmaIterations::kColumn;
-    // Loop-invariant: whether this position holds accumulator data at all.
-    bool const has_data = !(is_residual_tile_ && index_ >= kResidualIndex);
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < MmaIterations::kColumn; n++) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int m = 0; m < MmaIterations::kRow; m++) {
-        // Column-major walk: consecutive m are adjacent in the accumulator tile.
-        int accumulator_access_offset =
-            (n + index_n) * AccumulatorIterations::kRow + m + index_m;
-        int fragment_access_offset = n * MmaIterations::kRow + m;
-        frag_ptr[fragment_access_offset].clear();
-        if (has_data) {
-          frag_ptr[fragment_access_offset] = accumulators_[accumulator_access_offset];
-        }
-      }
-    }
-  }
-};
-// Partial specialization for row-major accumulator tile
-//
-// Walks the accumulator tile one Shape-sized fragment at a time and hands the
-// accumulators back without applying an output operator. Unlike the col-major
-// specialization, the accumulator and fragment element types are separate
-// template parameters here.
-template <
-    /// Shape of warp tile to load (concept: MatrixShape)
-    typename Shape_,
-    /// Shape of the warp accumulation tile (concept: MatrixShape)
-    typename AccumulatorShape_,
-    /// KBlocks columns to compute residual
-    int KBlocksColumn_,
-    /// Accumulator Element type
-    typename ElementAccumulator_,
-    /// Element type
-    typename Element_,
-    /// Shape of one matrix product operation (concept: MatrixShape)
-    typename InstructionShape_>
-class MmaTensorOpPureFragmentIterator<Shape_, AccumulatorShape_, KBlocksColumn_, ElementAccumulator_, Element_,
-                                         cutlass::layout::RowMajor,
-                                         InstructionShape_, true> {
- public:
-  /// Shape of warp tile to load (concept: MatrixShape)
-  using Shape = Shape_;
-  /// Shape of the warp accumulation tile (concept: MatrixShape)
-  using AccumulatorShape = AccumulatorShape_;
-  /// KBlocks columns to compute residual
-  static int const kKBlockColumn = KBlocksColumn_;
-  /// Accumulator Element type
-  using ElementAccumulator = ElementAccumulator_;
-  /// Element type
-  using Element = Element_;
-  /// Layout of source tile
-  using Layout = cutlass::layout::RowMajor;
-  /// Shape of one matrix product operation (concept: MatrixShape)
-  using InstructionShape = InstructionShape_;
-  /// Whether beta is zero
-  static bool const IsBetaZero = true;
-  /// Number of participating threads
-  static int const kThreads = 32;
-  /// Internal structure of iterator - made public to enable introspection
-  struct Policy {
-    static_assert(
-        !(Shape::kRow % InstructionShape::kM) &&
-            !(Shape::kColumn % InstructionShape::kN),
-        "Shape of warp-level Mma must be divisible by operator shape.");
-    static_assert(
-        !(AccumulatorShape::kRow % Shape::kRow) &&
-            !(AccumulatorShape::kColumn % Shape::kColumn),
-        "Shape of Warp Accumulator must be divisible by warp shape.");
-    static_assert(
-        !(kKBlockColumn % Shape::kColumn),
-        "KBlock size must be divisible by warp shape.");
-    /// Number of times this iterator can be incremented
-    static int const kIterations = AccumulatorShape::kCount / Shape::kCount;
-  };
-private:
-  /// Number of elements each thread holds per instruction-sized tile
-  static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads;
-  /// Number of mma operations performed by a warp
-  using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
-                                    Shape::kColumn / InstructionShape::kN>;
-  /// Number of mma operations performed by the entire accumulator
-  using AccumulatorIterations = MatrixShape<AccumulatorShape::kRow / InstructionShape::kM,
-                                              AccumulatorShape::kColumn / InstructionShape::kN>;
-  /// Number of K iterations (accumulator columns divided by kKBlockColumn, rounded up)
-  static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn;
-  /// Accumulator columns left over once (kKBlockIterations - 1) full K blocks are removed
-  static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn;
-  /// Number of iterator positions covered by one full K block
-  static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn
-                                     * (AccumulatorShape::kRow / Shape::kRow);
-  /// Number of iterator positions covered by the residual columns
-  static int const kResidualIndex = kResidualColumn / Shape::kColumn
-                                     * (AccumulatorShape::kRow / Shape::kRow);
-public:
-  //
-  // Derived quantities
-  //
-  /// Fragment object holding a thread's part of a tile
-  /// This is the fragment size produced by one access of the iterator.
-  using Fragment = Array<Element, Shape::kCount / kThreads>;
-  /// Accumulator Fragment object
-  using AccumulatorFragment = Array<ElementAccumulator, AccumulatorShape::kCount / kThreads>;
-private:
-  /// Internal access type (accumulator side)
-  using AccessType = Array<ElementAccumulator, kElementsPerAccess>;
-  /// Internal access type (fragment side)
-  using FragmentAccessType = Array<Element, kElementsPerAccess>;
-private:
-  //
-  // Data members
-  //
-  /// Accumulator tile
-  AccessType const *accumulators_;
-  /// Internal index
-  int index_;
-  /// Used to access residual tile first
-  bool is_residual_tile_;
-public:
-  /// Constructs an iterator positioned at index 0 of the residual tile.
-  /// Only a pointer to `accum` is kept, so `accum` must outlive the iterator.
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpPureFragmentIterator(AccumulatorFragment const &accum)
-      : accumulators_(reinterpret_cast<AccessType const *>(&accum)),
-        index_(0), is_residual_tile_(true) {}
-  /// Add offset
-  ///
-  /// While the iterator is still in the residual tile, reaching
-  /// kKBlockColumnIterations rebases the index onto kResidualIndex and leaves the
-  /// residual tile for good (the flag is never set again).
-  CUTLASS_HOST_DEVICE
-  void add_offset(int index_offset) {
-    index_ += index_offset;
-    if(is_residual_tile_ && index_ >= kKBlockColumnIterations) {
-      index_ = index_ - kKBlockColumnIterations + kResidualIndex;
-      is_residual_tile_ = false;
-    }
-  }
-  /// Increments
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpPureFragmentIterator &operator++() {
-    add_offset(1);
-    return *this;
-  }
-  /// Decrements
-  CUTLASS_HOST_DEVICE
-  MmaTensorOpPureFragmentIterator &operator--() {
-    add_offset(-1);
-    return *this;
-  }
-  /// Loads a fragment from the referenced part of the accumulator tile
-  ///
-  /// The fragment is zero-filled instead when the iterator is still in the residual
-  /// tile and index_ has reached kResidualIndex.
-  CUTLASS_HOST_DEVICE
-  void load(Fragment &frag) const {
-    FragmentAccessType *frag_ptr = reinterpret_cast<FragmentAccessType *>(&frag);
-    int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow;
-    int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow
-                    * MmaIterations::kColumn;
-    // Loop-invariant: whether this position holds accumulator data at all.
-    bool const has_data = !(is_residual_tile_ && index_ >= kResidualIndex);
-    CUTLASS_PRAGMA_UNROLL
-    for (int m = 0; m < MmaIterations::kRow; m++) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < MmaIterations::kColumn; n++) {
-        // Row-major walk: consecutive n are adjacent in the accumulator tile.
-        int accumulator_access_offset =
-            (m + index_m) * AccumulatorIterations::kColumn + n + index_n;
-        int fragment_access_offset = m * MmaIterations::kColumn + n;
-        frag_ptr[fragment_access_offset].clear();
-        if (has_data) {
-          // NOTE(review): this assigns an Array<ElementAccumulator, ...> directly to an
-          // Array<Element, ...>. Confirm this is only instantiated with identical
-          // types; otherwise an explicit conversion is presumably needed here.
-          frag_ptr[fragment_access_offset] = (accumulators_[accumulator_access_offset]);
-        }
-      }
-    }
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-} // namespace warp
-} // namespace gemm
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py DELETED Viewed

@@ -1,129 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import gen_turing_and_volta as api_generator
-import gen_sample as sample_creater
-import gen_cmake as cmake_creater
-import gen_verify as verify_creater
-import gen_device as b2b_fused_generator
-import replace_fix_impl_header
-import argparse
-import os
-import json
-parser = argparse.ArgumentParser(description="Generates Fused Multi-GEMM CUTLASS Kernels")
-parser.add_argument("--config-file", default="config.json", help="JSON file containing configuration to generate")
-parser.add_argument("--gen-name", default="FusedMultiGemmForward", help="Specific the output name")
-parser.add_argument("--output-dir", default="", help="Specifies the output dir")
-parser.add_argument("--cutlass-dir", default="", help="Specifies the dependent CUTLASS repo dir")
-parser.add_argument("--gen-include-cutlass-dir", default="", help="Specifies the generated CUTLASS code include dir, if needed.")
-args = parser.parse_args()
-gen_name = args.gen_name
-cutlass_deps_dir = args.cutlass_dir
-output_dir = args.output_dir
-output_dir += "/"
-cutlass_deps_root = args.gen_include_cutlass_dir
-if cutlass_deps_root == '':
-    cutlass_deps_root = cutlass_deps_dir + "/include/"
-cutlass_deps_root +='/'
-if not os.path.exists(output_dir):
-    os.makedirs(output_dir)
-if not os.path.exists(output_dir + "/" + "auto_gen"):
-    os.mkdir(output_dir + "/" + "auto_gen")
-if not os.path.exists(output_dir + "/" + "fixed_impl"):
-    os.mkdir(output_dir + "/" + "fixed_impl" )
-if not os.path.exists(output_dir + "/" + "sample"):
-    os.mkdir(output_dir + "/" + "sample" )
-if not os.path.exists(output_dir + "/" + "auto_gen" + "/" + "device"):
-    os.mkdir(output_dir + "/" + "auto_gen" + "/" + "device")
-if not os.path.exists(output_dir + "/" + "auto_gen" + "/" + "kernel"):
-    os.mkdir(output_dir + "/" + "auto_gen" + "/" + "kernel")
-if not os.path.exists(output_dir + "/" + "auto_gen" + "/" + "threadblock"):
-    os.mkdir(output_dir + "/" + "auto_gen" + "/" + "threadblock")
-with open(args.config_file, 'r') as infile:
-    gemm_info_dict = json.load(infile)
-keys = sorted(gemm_info_dict.keys())
-fuse_gemm_info = [gemm_info_dict[k] for k in keys]
-for_cutlass_gen_user_include_header_file = [
-    cutlass_deps_root + "cutlass/epilogue/thread/linear_combination_leaky_relu.h",
-    cutlass_deps_root + "cutlass/epilogue/thread/linear_combination.h",
-]
-for_fused_wrapper = [
-    cutlass_deps_root + "cutlass/epilogue/thread/linear_combination_leaky_relu.h",
-    cutlass_deps_root + "cutlass/epilogue/thread/linear_combination.h",
-    "auto_gen/device/" + gen_name + ".h",
-    cutlass_deps_root + "cutlass/gemm/device/gemm_batched.h",
-    cutlass_deps_root + "cutlass/cutlass.h",
-]
-# Copy fixed implementation to the output directory
-fix_impl = replace_fix_impl_header.replace_fix_impl("../fixed_impl/", output_dir +"/fixed_impl/", cutlass_deps_root)
-fix_impl.gen_code()
-auto_gen_output_dir = output_dir + "/auto_gen/"
-project_root = ""
-turing_plus = b2b_fused_generator.gen_device(fuse_gemm_info, gen_name, for_cutlass_gen_user_include_header_file, cutlass_deps_root, project_root, auto_gen_output_dir)
-turing_plus.gen_code(75, 'hmma1688', False)
-api = api_generator.gen_one_API(fuse_gemm_info, gen_name, for_fused_wrapper, output_dir)
-api.gen_code()
-# Generate C++ sample
-os.system("cp ../leaky_bias.h " + output_dir + "/sample/")
-os.system("cp ../utils.h " + output_dir + "/sample/")
-sample_dir = output_dir + "/sample/"
-sample = sample_creater.gen_test(fuse_gemm_info, gen_name, for_cutlass_gen_user_include_header_file, sample_dir)
-sample.gen_cpp_sample()
-cmake_gen = cmake_creater.gen_build_sys(cutlass_deps_dir, output_dir)
-cmake_gen.gen_code()
-verify = verify_creater.gen_verify(fuse_gemm_info, gen_name, for_fused_wrapper, output_dir)
-verify.gen_code()

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py DELETED Viewed

@@ -1,131 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-class gen_build_sys:
-    def __init__(self, cutlass_deps_dir, output_dir = "../"):
-        self.output_dir = output_dir
-        self.cutlass_deps_dir = cutlass_deps_dir
-    def gen_top(self):
-        code = ""
-        code += '''\
-# Auto Generated code - Do not edit.
-cmake_minimum_required(VERSION 3.8)
-project(CUTLASS_MULTI_GEMMS LANGUAGES CXX CUDA)
-find_package(CUDAToolkit)
-set(CUDA_PATH ${{CUDA_TOOLKIT_ROOT_DIR}})
-set(CUTLASS_PATH \"{cutlass_deps_dir}/include\")
-set(CUTLASS_UTIL_PATH \"{cutlass_deps_dir}/tools/util/include\")
-list(APPEND CMAKE_MODULE_PATH ${{CUDAToolkit_LIBRARY_DIR}})
-'''.format(cutlass_deps_dir=self.cutlass_deps_dir)
-        code += '''\
-set(GPU_ARCHS \"\" CACHE STRING
-  \"List of GPU architectures (semicolon-separated) to be compiled for.\")
-if(\"${GPU_ARCHS}\" STREQUAL \"\")
-	set(GPU_ARCHS \"70\")
-endif()
-foreach(arch ${GPU_ARCHS})
-  set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}\")
-	if(SM STREQUAL 70 OR SM STREQUAL 75)
-    set(CMAKE_C_FLAGS    \"${CMAKE_C_FLAGS}    -DWMMA\")
-    set(CMAKE_CXX_FLAGS  \"${CMAKE_CXX_FLAGS}  -DWMMA\")
-    set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -DWMMA\")
-	endif()
-endforeach()
-set(CMAKE_C_FLAGS    \"${CMAKE_C_FLAGS}\")
-set(CMAKE_CXX_FLAGS  \"${CMAKE_CXX_FLAGS}\")
-set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS}  -Xcompiler -Wall\")
-set(CMAKE_C_FLAGS_DEBUG    \"${CMAKE_C_FLAGS_DEBUG}    -Wall -O0\")
-set(CMAKE_CXX_FLAGS_DEBUG  \"${CMAKE_CXX_FLAGS_DEBUG}  -Wall -O0\")
-set(CMAKE_CUDA_FLAGS_DEBUG \"${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall\")
-set(CMAKE_CXX_STANDARD 11)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-if(CMAKE_CXX_STANDARD STREQUAL \"11\")
-  set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} --expt-extended-lambda\")
-  set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr\")
-endif()
-set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS} -g -O3\")
-set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -Xcompiler -O3\")
-set(CMAKE_CUDA_FLAGS \"${CMAKE_CUDA_FLAGS} -Xcompiler=-fno-strict-aliasing\")
-set(COMMON_HEADER_DIRS
-  ${PROJECT_SOURCE_DIR}
-  ${CUDAToolkit_INCLUDE_DIRS}
-)
-set(COMMON_LIB_DIRS
-  ${CUDAToolkit_LIBRARY_DIR}
-)
-list(APPEND COMMON_HEADER_DIRS ${CUTLASS_PATH})
-list(APPEND COMMON_HEADER_DIRS ${CUTLASS_UTIL_PATH})
-'''
-        code += '''\
-include_directories(
-  ${COMMON_HEADER_DIRS}
-)
-link_directories(
-  ${COMMON_LIB_DIRS}
-)
-add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
-add_definitions(-DGOOGLE_CUDA=1)
-add_executable(sample
-  sample/sample.cu
-  one_api.cu
-)
-target_link_libraries(sample PRIVATE
-  -lcudart
-  -lnvToolsExt
-  ${CMAKE_THREAD_LIBS_INIT}
-)
-if(NOT DEFINED LIB_INSTALL_PATH)
-	set(LIB_INSTALL_PATH ${CMAKE_CURRENT_BINARY_DIR})
-endif()
-'''
-        return code
-    def gen_code(self):
-        top_code = self.gen_top()
-        with open(self.output_dir + "CMakeLists.txt", "w") as f:
-            f.write(top_code)

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py DELETED Viewed

@@ -1,120 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import ast
-# Sample fused-GEMM description carrying a custom epilogue. Nothing else in this
-# file reads it; the demo at the bottom of the file parses its own snippet.
-fuse_gemm_info = [
-    {
-    'epilogue': {
-        'tp': 'LeakyRelu', #'CustomizedLeaky_RELU'
-        'bias': {'addbias': False, 'bias_tp': 'mat'},
-        # Each entry looks like (C type, argument name, value) - TODO confirm against the consumer.
-        'args': [('float', 'leaky_alpha', 1.3), ],
-        # Epilogue body as Python-syntax source text, written in terms of x and y.
-        'func': '''
-y = max(leaky_alpha * x, x)
-y = y * x
-    '''
-        }
-    },
-]
-class AnalysisNodeVisitor(ast.NodeVisitor):
-    def visit_Import(self,node):
-        ast.NodeVisitor.generic_visit(self, node)
-    def visit_ImportFrom(self,node):
-        ast.NodeVisitor.generic_visit(self, node)
-    def visit_Assign(self,node):
-        print('Node type: Assign and fields: ', node._fields)
-        # print('Node type: Assign and targets value: ', node.targets, node.value)
-        ast.NodeVisitor.generic_visit(self, node)
-    def visit_BinOp(self, node):
-        print('Node type: BinOp and fields: ', node._fields)
-        print('node op: ', type(node.op).__name__)
-        ast.NodeVisitor.generic_visit(self, node)
-    def visit_Expr(self, node):
-        print('Node type: Expr and fields: ', node._fields)
-        ast.NodeVisitor.generic_visit(self, node)
-    def visit_Num(self,node):
-        print('Node type: Num and fields: ', node._fields)
-        print('Node type: Num: ', node.n)
-    def visit_Name(self,node):
-        print('Node type: Name and fields: ', node._fields)
-        print('Node type: Name and fields: ', type(node.ctx).__name__, node.id)
-        ast.NodeVisitor.generic_visit(self, node)
-    def visit_Str(self, node):
-        print('Node type: Str and fields: ', node._fields)
-class CodeVisitor(ast.NodeVisitor):
-    def visit_BinOp(self, node):
-        if isinstance(node.op, ast.Add):
-            node.op = ast.Sub()
-            self.generic_visit(node)
-    def visit_Assign(self, node):
-        print('Assign %s' % node.value)
-        self.generic_visit(node)
-    def visit_Name(self, node):
-        print("Name:", node.id)
-        self.generic_visit(node)
-    def visit_FunctionDef(self, node):
-        print('Function Name:%s'% node.name.op)
-        self.generic_visit(node)
-        func_log_stmt = ast.Print(
-            dest = None,
-            values = [ast.Str(s = 'calling func: %s' % node.name, lineno = 0, col_offset = 0)],
-            nl = True,
-            lineno = 0,
-            col_offset = 0,
-        )
-        node.body.insert(0, func_log_stmt)
-visitor = AnalysisNodeVisitor()
-code = \
-'''
-a=max(leaky_alpha * x, x +1)
-'''
-visitor.visit(ast.parse(code))

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py DELETED Viewed

@@ -1,469 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-from typing import *
-import helper
-import gen_ir
-import gen_kernel as gen_ker
-class gen_device:
-    def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_deps_root, project_root, output_dir = "../"):
-        self.fuse_gemm_info = fuse_gemm_info
-        self.raw_gemm_info = fuse_gemm_info
-        self.b2b_num = len(fuse_gemm_info)
-        self.user_header_file = user_header_file
-        self.args = {}
-        # device arg struct memebr
-        self.arg_member = []
-        self.gen_class_name = gen_class_name
-        self.gen_kernel_name = gen_class_name + "Kernel"
-        self.template_args = []
-        self.__tempalate_arg_list = {'Stages': int, 'SplitKSerial': bool, 'IsBetaZero': bool, 'AlignmentA': int, 'AlignmentB': int}
-        self.file_name = output_dir + "/device/" +gen_class_name +".h"
-        self.sample_dir = output_dir
-        self.cutlass_deps_root = cutlass_deps_root
-        self.project_root = project_root
-        self.this_file_root = output_dir + "/device/"
-        self.first_use_1stage = False
-        ## gen kernel
-        self.gen_kernel = gen_ker.gen_kernel(self.template_args, self.gen_class_name, self.b2b_num, output_dir, cutlass_deps_root, project_root)
-    def __check_arg_type(self, temp_arg):
-        if temp_arg in self.__tempalate_arg_list.keys():
-            return self.__tempalate_arg_list[temp_arg]
-        find_sub = False
-        for candidate_arg in self.__tempalate_arg_list.keys():
-            if (temp_arg.find(candidate_arg) != -1):
-                return self.__tempalate_arg_list[candidate_arg]
-        return 'typename'
-    # def gen_B2b2bGemm_class():
-    def set_arch(self, sm_cap, mma_tp):
-        if sm_cap == 75 or sm_cap == 80 or sm_cap == 86:
-            self.arch = "cutlass::arch::Sm" + str(sm_cap)
-        if mma_tp is 'hmma1688':
-            self.mma_shape = [16, 8, 8]
-            self.mma_tp = 'hmma'
-        elif mma_tp is 'imma8816':
-            self.mma_tp = 'imma'
-            self.mma_shape = [8, 8, 16]
-        else:
-            return 0
-    def gen_include_header(self):
-        code = '''\
-/* Auto Generated code - Do not edit.*/
-#pragma once
-#include \"{cutlass_root}cutlass/cutlass.h\"
-#include \"{cutlass_root}cutlass/numeric_types.h\"
-#include \"{cutlass_root}cutlass/arch/arch.h\"
-#include \"{cutlass_root}cutlass/device_kernel.h\"
-#include \"{cutlass_root}cutlass/gemm/threadblock/threadblock_swizzle.h\"
-#include \"{cutlass_root}cutlass/gemm/device/default_gemm_configuration.h\"
-#include \"{cutlass_root}cutlass/epilogue/thread/linear_combination_relu.h\"
-#include \"{cutlass_root}cutlass/epilogue/thread/linear_combination.h\"
-#include \"{project_root}../kernel/b2b_gemm.h\"
-#include \"{project_root}../kernel/default_b2b_gemm.h\"
-'''.format(cutlass_root=self.cutlass_deps_root, project_root=self.project_root, this_file_root=self.this_file_root)
-        include_user_header = ""
-        for header in self.user_header_file:
-            include_user_header += "#include \"" + header + "\"\n"
-        return code + include_user_header
-    def gen_code(self, sm_cap, mma_tp, ifprint = True):
-        self.set_arch(sm_cap, mma_tp)
-        self.update_b2b_args()
-        print(self.fuse_gemm_info)
-        self.update_b2b_class_template_args()
-        func_code = self.gen_all_func()
-        member_var_code = "private:\n typename B2bGemmKernel::Params params_;\n"
-        gen_code = gen_ir.gen_template_class(self.gen_class_name, self.template_args, func_code + member_var_code)
-        code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("device", gen_code)))
-        if ifprint:
-            print(code)
-        print("[INFO]: Gen device code output Dir: is ", self.file_name)
-        with open(self.file_name, 'w+') as f:
-            f.write(code)
-        gen_kernel = self.gen_kernel.gen_code(self.first_use_1stage)
-        print(gen_kernel)
-    def update_b2b_class_template_args(self):
-        for arg in self.args.keys():
-            self.template_args.append([self.__check_arg_type(arg), arg, self.args[arg]])
-    def update_b2b_args(self):
-        self.args['ElementA'] = helper.type_2_cutlass_type(self.fuse_gemm_info[0]['A_tp'])
-        self.args['LayoutA'] = helper.type_2_cutlass_type(self.fuse_gemm_info[0]['A_format'])
-        cnt = 0
-        warp_M_tile = 32
-        # Determine maximum N_tile
-        Max_Ntile = 0
-        for layer in self.fuse_gemm_info:
-            n_tile = layer['mnk'][1]
-            if n_tile > Max_Ntile:
-                Max_Ntile = n_tile
-        if Max_Ntile >= 256:
-            warp_M_tile = 16
-        stages_temp = []
-        for layer in self.fuse_gemm_info:
-            cnt_str = str(cnt)
-            B_tp_str= 'ElementB' + cnt_str
-            B_format_str = 'LayoutB' + cnt_str
-            C_tp_str= 'ElementC' + cnt_str
-            C_format_str = 'LayoutC' + cnt_str
-            Acc_str = 'ElementAccumulator' + cnt_str
-            self.args[B_tp_str] = helper.type_2_cutlass_type(layer['B_tp'])
-            self.args[B_format_str] = helper.type_2_cutlass_type(layer['B_format'])
-            self.args[C_tp_str] = helper.type_2_cutlass_type(layer['C_tp'])
-            self.args[C_format_str] = helper.type_2_cutlass_type(layer['C_format'])
-            self.args[Acc_str] = helper.type_2_cutlass_type(layer['Acc_tp'])
-            mnk = layer['mnk'][:]
-            tile_mnk = mnk[:]
-            tile_mnk[2] = 32 # force the ktile is 32
-            #N tile gen
-            if mnk[1] > 1024:
-                assert(0)
-            elif mnk[1] > 512:
-                tile_mnk[1] = 1024
-            elif mnk[1] > 256:
-                tile_mnk[1] = 512
-            elif mnk[1] > 128:
-                tile_mnk[1] = 256
-            elif mnk[1] > 64:
-                tile_mnk[1] = 128
-            elif mnk[1] > 32:
-                tile_mnk[1] = 64
-            else :
-                tile_mnk[1] = 32
-            if tile_mnk[1] == 512:
-                stages_temp.append(1)
-            else:
-                stages_temp.append(2)
-            tile_mnk[0] = 4 * warp_M_tile
-            epilogue_setted_type = helper.get_epilogue_tp(layer)
-            cutlass_epilogue_name = "LinearCombinationRelu"
-            if epilogue_setted_type.lower() == 'leakyrelu':
-                cutlass_epilogue_name = "LinearCombinationLeakyRelu"
-            elif epilogue_setted_type.lower() == 'identity':
-                cutlass_epilogue_name = "LinearCombination"
-            epilogue_str = 'EpilogueOutputOp' + cnt_str
-            if cnt != len(self.fuse_gemm_info) - 1:
-                n = layer['mnk'][1]
-                Fragments = tile_mnk[1] // 8 * 2
-                self.args[epilogue_str] = "cutlass::epilogue::thread::" + cutlass_epilogue_name + "<ElementC0_, " + str(Fragments) +", ElementAccumulator0_, ElementAccumulator0_>"
-            else:
-                n = layer['mnk'][1]
-                n_mod_8 = n % 4
-                N_align_elements = 1
-                if n_mod_8 == 0:
-                    N_align_elements = 8
-                elif n_mod_8 == 4:
-                    N_align_elements = 4
-                elif n_mod_8 == 2 or n_mod_8 == 6:
-                    N_align_elements = 2
-                self.args[epilogue_str] = "cutlass::epilogue::thread::" + cutlass_epilogue_name+ "<ElementC0_, " + str(N_align_elements) + ", ElementAccumulator0_, ElementAccumulator0_>"
-            ThreadBlockShape_str = 'ThreadblockShape' + cnt_str
-            self.args[ThreadBlockShape_str] = helper.cvt_2_cutlass_shape(tile_mnk)
-            WarpShape_str = 'WarpShape' + cnt_str
-            tile_mnk[0] = warp_M_tile
-            self.args[WarpShape_str] = helper.cvt_2_cutlass_shape(tile_mnk)
-            cnt += 1
-        self.args['ElementD'] = helper.type_2_cutlass_type(self.fuse_gemm_info[self.b2b_num - 1]['C_tp'])
-        self.args['LayoutD'] = helper.type_2_cutlass_type(self.fuse_gemm_info[self.b2b_num - 1]['C_format'])
-        self.args['InstructionShape'] = helper.cvt_2_cutlass_shape(self.mma_shape)
-        self.args['OperatorClass'] = 'arch::OpClassTensorOp'
-        self.args['ArchTag'] = self.arch
-        self.args['ThreadblockSwizzle'] = 'threadblock::GemmBatchedIdentityThreadblockSwizzle'
-        for i in range(self.b2b_num):
-            self.args[helper.var_idx('Stages', i)] = "2"
-        self.args['AlignmentA'] = str(8)
-        self.args['AlignmentB'] = str(8)
-        self.args['SplitKSerial'] = 'false'
-        self.args['Operator'] = 'typename DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB0_, ElementC0_, ElementAccumulator0_>::Operator'
-        self.args['IsBetaZero'] = 'false'
-    def gen_using_kernel(self):
-        code = "using B2bGemmKernel = typename kernel::DefaultB2bGemm<\n"
-        code += "    " + "ElementA,\n"
-        code += "    " + "LayoutA,\n"
-        for i in range(self.b2b_num):
-            code += "    " + helper.var_idx("ElementB", i) + ",\n"
-            code += "    " + helper.var_idx("LayoutB", i) + ",\n"
-            code += "    " + helper.var_idx("ElementC", i) + ",\n"
-            code += "    " + helper.var_idx("LayoutC", i) + ",\n"
-            code += "    " + helper.var_idx("ElementAccumulator", i) + ",\n"
-            code += "    " + helper.var_idx("EpilogueOutputOp", i) + ",\n"
-            code += "    " + helper.var_idx("ThreadblockShape", i) + ",\n"
-            code += "    " + helper.var_idx("WarpShape", i) + ",\n"
-        code +=  "    " + "ElementD,\n"
-        code +=  "    " + "LayoutD,\n"
-        code +=  "    " + "InstructionShape,\n"
-        code +=  "    " + "OperatorClass,\n"
-        code +=  "    " + "ArchTag,\n"
-        code +=  "    " + "ThreadblockSwizzle,\n"
-        for i in range(self.b2b_num):
-            code +=  "    " + helper.var_idx("Stages", i) + ",\n"
-        code +=  "    " + "AlignmentA,\n"
-        code +=  "    " + "AlignmentB,\n"
-        code +=  "    " + "SplitKSerial,\n"
-        code +=  "    " + "Operator,\n"
-        code +=  "    " + "IsBetaZero_\n"
-        code += ">::B2bGemmKernel;\n\n"
-        return code
-    def gen_args(self):
-        def gen_arg_member(b2b_num):
-            data_members = []
-            for i in range(b2b_num):
-                member_type = "GemmCoord"
-                member_name = "problem_size_" + str(i)
-                data_members.append((member_type, member_name))
-            member_type = "TensorRef<ElementA const, LayoutA>"
-            member_name = "ref_A0"
-            data_members.append((member_type, member_name))
-            for i in range(b2b_num):
-                member_type = "TensorRef<ElementB" + str(i) + " const, LayoutB" + str(i) +">"
-                member_name = "ref_B" + str(i)
-                data_members.append((member_type, member_name))
-                member_type = "TensorRef<ElementC" + str(i) + " const, LayoutC" + str(i) +">"
-                member_name = "ref_C" + str(i)
-                data_members.append((member_type, member_name))
-            member_type = "TensorRef<ElementD, LayoutD>"
-            member_name = helper.var_idx("ref_D", b2b_num - 1)
-            data_members.append((member_type, member_name))
-            for i in range(b2b_num):
-                member_type = "typename EpilogueOutputOp" + str(i) + "::Params"
-                member_name = "epilogue" + str(i)
-                data_members.append((member_type, member_name))
-            data_members.append(('int', 'batch_count'))
-            return data_members
-        def gen_arg_struct_default_ctor(struct_name, data_members, inital_param_num, inital_value):
-            constructs_code = gen_ir.indentation + "CUTLASS_HOST_DEVICE\n" + \
-                              gen_ir.indentation + struct_name + " (): "
-            for i in range(inital_param_num):
-                final_param = ','
-                if i == inital_param_num - 1:
-                    final_param = '{ }'
-                constructs_code +=  data_members[i][1] + inital_value + final_param
-            constructs_code += "\n"
-            return constructs_code
-        def gen_arg_struct_ctor(struct_name, data_members):
-            constructs_code = gen_ir.indentation + "CUTLASS_HOST_DEVICE\n" + \
-                              gen_ir.indentation + struct_name + " (\n"
-            cnt = 0
-            param_num = len(data_members)
-            for param in data_members:
-                final = ',\n'
-                if cnt == param_num - 1:
-                    final = '\n):\n'
-                constructs_code +=  gen_ir.indentation + param[0] + " " + param[1] + "_" + final
-                cnt += 1
-            cnt = 0
-            for param in data_members:
-                final = '),\n'
-                if cnt == param_num - 1:
-                    final = ") { }\n"
-                constructs_code +=  gen_ir.indentation + param[1] + "(" + param[1] + "_" + final
-                cnt += 1
-            constructs_code += "\n"
-            return constructs_code
-        # (variable type, variable name)
-        struct_member = gen_arg_member(self.b2b_num)
-        self.arg_member = struct_member
-        codeBody = ""
-        for each_member in struct_member:
-            codeBody += gen_ir.indentation + each_member[0] + " " + each_member[1] + ";\n"
-        codeBody += gen_arg_struct_default_ctor("Arguments", struct_member, self.b2b_num, "(0,0,0)") + "\n"
-        codeBody += gen_arg_struct_ctor("Arguments", struct_member) + "\n"
-        struct_code = gen_ir.gen_struct("Arguments", codeBody)
-        return struct_code
-    def gen_func_constructs(self):
-        code = self.gen_class_name +"() {}"
-        return code
-    def gen_func_initialize(self):
-        code = "Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {\n" + \
-                "// Determine grid shape\n" + \
-                "ThreadblockSwizzle threadblock_swizzle;\n" + \
-                "cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(\n" + \
-                "  args.problem_size_0, \n" + \
-                "  { ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK },\n" + \
-                "  args.batch_count);\n" + \
-                "// Initialize the Params structure\n" + \
-                "params_ = typename B2bGemmKernel::Params{\n"
-        for i in range(self.b2b_num):
-            code += helper.var_idx("  args.problem_size_", i) + ",\n"
-        code += "  grid_shape,\n" + \
-                "  args.ref_A0.non_const_ref(),\n"
-        for i in range(self.b2b_num):
-            code += helper.var_idx("  args.ref_B", i) + ".non_const_ref(),\n"
-            code += helper.var_idx("  args.ref_C", i) + ".non_const_ref(),\n"
-        code += helper.var_idx("  args.ref_D", self.b2b_num - 1) + ",\n"
-        for i in range(self.b2b_num):
-            code += helper.var_idx("  args.epilogue", i) + ",\n"
-        code += "  args.batch_count\n"
-        code += "};\n" + \
-                "return Status::kSuccess;\n" + \
-                "}\n"
-        return code
-    def gen_func_run(self):
-        code = "Status run(cudaStream_t stream = nullptr) {\n" + \
-                "\n" + \
-                "  ThreadblockSwizzle threadblock_swizzle;\n" + \
-                "\n" + \
-                "  dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);\n" + \
-                "  dim3 block(B2bGemmKernel::kThreadCount, 1, 1);\n" + \
-                "\n" + \
-                "  cudaError_t result;\n" + \
-                "\n" + \
-                "  int smem_size = int(sizeof(typename B2bGemmKernel::SharedStorage));\n" + \
-                "  if (smem_size >= (48 << 10)) {\n" + \
-                "    result = cudaFuncSetAttribute(Kernel<B2bGemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);\n" + \
-                "\n" + \
-                "    if (result != cudaSuccess) {\n" + \
-                "      return Status::kErrorInternal;\n" + \
-                "    }\n" + \
-                "  }\n" + \
-                "  cutlass::Kernel<B2bGemmKernel><<<grid, block, smem_size, stream>>>(params_);\n" + \
-                "  result = cudaGetLastError();\n" + \
-                "  return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;\n" + \
-                "  }\n"
-        return code
-    def gen_func_operator(self):
-        opeartor_with_arg_code = "Status operator()(\n" + \
-                                "  Arguments const &args,\n" + \
-                                "  void *workspace = nullptr,\n" + \
-                                "  cudaStream_t stream = nullptr) {\n" + \
-                                "  Status status = initialize(args, workspace);\n" + \
-                                "  \n" + \
-                                "  if (status == Status::kSuccess) {\n" + \
-                                "    status = run(stream);\n" + \
-                                "  }\n" + \
-                                "  return status;\n" + \
-                                "}\n"
-        operator_code = "Status operator()(\n" + \
-                        "  cudaStream_t stream = nullptr) {\n" + \
-                        "   Status status = run(stream);\n" + \
-                        "   return status;\n" + \
-                        "}\n"
-        return opeartor_with_arg_code + "\n" + operator_code
-    def gen_all_func(self):
-        return  self.gen_using_kernel() + "\n" + \
-                self.gen_args() + "\n" + \
-                self.gen_func_constructs()  + "\n" + \
-                self.gen_func_initialize() + "\n" + \
-                self.gen_func_run() + "\n" + \
-                self.gen_func_operator()

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py DELETED Viewed

@@ -1,249 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import helper
-# One indentation step (four spaces) prepended to generated C++ lines.
-indentation = "    "
-def append_word(word):
-    code = ""
-    code += word
-    code += " "
-    return code
-def gen_namespace(namespace, codeBody):
-    code_gen = "namespace " + namespace + " {\n"
-    code_gen += codeBody
-    code_gen += "} // namespace " + namespace + "\n"
-    return code_gen
-def gen_expression(type, lval, rval = None):
-    code_gen = ""
-    code_gen += append_word(type)
-    code_gen += append_word(lval)
-    if rval is not None:
-        code_gen += append_word("=")
-        code_gen += append_word(rval)
-    return code_gen
-def gen_class(name, codeBody, inheritance_code = None):
-    code_gen = ""
-    if inheritance_code is None:
-        code_gen = "class " + name + "{\n"
-    else:
-        code_gen = "class " + name + " : "+ inheritance_code + "{\n"
-    code_gen += codeBody
-    code_gen += "}; // class " + name + "\n"
-    return code_gen
-def gen_struct(name, codeBody, specialized = None):
-    specialized_code = ""
-    if specialized is not None:
-        specialized_code = "<" + specialized + ">"
-    code_gen = "struct " + name + specialized_code + "{\n"
-    code_gen += codeBody
-    code_gen += "}; // struct " + name + "\n"
-    return code_gen
-def gen_template_arg(arg_type, arg_name, default_val = None):
-    rval = None
-    if default_val is not None:
-        rval = str(default_val)
-    arg_typename = ""
-    if arg_type is int:
-        arg_typename = "int"
-    elif arg_type is bool:
-        arg_typename = "bool"
-    else:
-        arg_typename = "typename"
-    internal_arg_name = arg_name + "_"
-    code_gen = indentation
-    code_gen += gen_expression(arg_typename, internal_arg_name, rval)
-    return code_gen
-def gen_template_args(args, set_default = True):
-    arg_len = len(args)
-    cnt = 1
-    code_gen = ""
-    for arg_tuple in args:
-        arg_type = arg_tuple[0]
-        arg_name = arg_tuple[1]
-        arg_default_val = None
-        if len(arg_tuple) == 3 and set_default:
-            arg_default_val = arg_tuple[2]
-        code_gen += gen_template_arg(arg_type, arg_name, arg_default_val)
-        if cnt != arg_len:
-            code_gen += ",\n"
-        cnt += 1
-    return code_gen
-def gen_template_head(args, set_default = True):
-    code_gen = "template <\n"
-    code_gen += gen_template_args(args, set_default)
-    code_gen += ">\n"
-    return code_gen
-def export_template_args(args):
-    code_gen = "public:\n"
-    for arg_tuple in args:
-        code_gen += indentation
-        arg_type = arg_tuple[0]
-        arg_name = arg_tuple[1]
-        internal_arg_name = arg_name + "_"
-        typename = ""
-        if arg_type is int:
-            typename = "static int const"
-        elif arg_type is bool:
-            typename = "static bool const"
-        else:
-            typename = "using"
-        code_gen += gen_expression(typename, arg_name, internal_arg_name)
-        code_gen += ";\n"
-    return code_gen
-def gen_template_class(class_name, args, codeBody, set_default = True, inheritance_code = None):
-    code_gen = ""
-    code_gen += gen_template_head(args, set_default)
-    code_gen += gen_class(class_name, export_template_args(args) + codeBody, inheritance_code)
-    return code_gen
-def gen_template_struct(struct_name, args, codeBody, speicalized = None, set_default = True, export_args = True):
-    code_gen = ""
-    code_gen += gen_template_head(args, set_default)
-    code = export_template_args(args) + codeBody
-    if export_args is False:
-        code = codeBody
-    code_gen += gen_struct(struct_name, code , speicalized)
-    return code_gen
-def gen_declare_template_struct(name, *params):
-    code = name + "<"
-    cnt = 0
-    param_num = len(params)
-    for param in params:
-        final = ", "
-        if cnt == param_num - 1:
-            final = ""
-        code += param + final
-        cnt += 1
-    code += ">;\n"
-    return code
-def filtered_param(params, name_and_value_pair, keep_ = False):
-    rtn_template_args = []
-    speicalized_template_args = []
-    for param in params:
-        param_name = ""
-        if len(param) >= 1:
-            param_name = param[1]
-        else:
-            param_name = param[0]
-        hit_flag = False
-        set_value = ""
-        for n_v_pair in name_and_value_pair:
-            filter_name = n_v_pair[0]
-            set_value = n_v_pair[1]
-            if param_name == (filter_name + "_") or param_name == filter_name :
-                hit_flag = True
-                break
-        if hit_flag is False:
-            rtn_template_args.append(param)
-        if hit_flag is True:
-            speicalized_template_args.append(set_value)
-        else:
-            if keep_ is True:
-                speicalized_template_args.append(param_name + "_")
-            else:
-                speicalized_template_args.append(param_name)
-    specialized_template_arg_str = helper.list_2_string(speicalized_template_args)
-    return rtn_template_args, specialized_template_arg_str
-def gen_func(func_name, arg_lists, code_body, only_declare = False, with_cudaStream = True):
-    code = "void " + func_name + "(\n"
-    for arg in arg_lists:
-        arg_tp = arg[0]
-        arg_nm = arg[1]
-        code += "    " + arg_tp + " " + arg_nm + ",\n"
-    code += "cudaStream_t stream)"
-    if only_declare :
-        return code
-    code += "{\n"
-    code += code_body + "\n"
-    code += "}\n"
-    return code
-def indent_level(code, level = 0):
-    rtn_code = ""
-    for i in range(level):
-        rtn_code += "    "
-    rtn_code += code
-    return rtn_code

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py DELETED Viewed

@@ -1,476 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import gen_ir
-import helper
-import gen_threadblock as gen_tb
-class gen_default_Gemm:
-    """Emits the text of the `DefaultB2bGemm` template and one specialization of it."""
-    def __init__(self, template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root):
-        # The gen_class_name argument is not used; the class name is fixed.
-        self.gen_class_name = "B2bGemm"
-        self.template_param = template_param
-        self.b2b_num = b2b_num
-        self.cutlass_deps_root = cutlass_deps_root
-        self.project_root = project_root
-    def gen_B2bMma(self, specialized_template_args):
-        """Return the `using B2bMma = ...` alias text for the given argument string."""
-        return (
-            "using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<\n"
-            + specialized_template_args
-            + ">::ThreadblockB2bMma;\n"
-        )
-    def gen_epilogue(self):
-        """Return the kPartitionsK / Epilogue / B2bGemmKernel declarations.
-        Every indexed identifier uses the index of the last GEMM (b2b_num - 1).
-        """
-        last = self.b2b_num - 1
-        def idx(text):
-            return helper.var_idx(text, last)
-        pieces = [
-            idx("static const int kPartitionsK") + idx(" = ThreadblockShape") + idx("::kK / WarpShape") + "::kK;\n",
-            "using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<\n",
-            "    " + idx("ThreadblockShape") + ",\n",
-            "    " + idx("typename B2bMma::Operator") + ",\n",
-            "    " + idx("kPartitionsK") + ",\n",
-            "    " + idx("EpilogueOutputOp") + ",\n",
-            "    " + idx("EpilogueOutputOp") + "::kCount\n",
-            ">::Epilogue;\n",
-            "using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;\n\n",
-        ]
-        return "".join(pieces)
-    def gen_include_header(self):
-        """Return the include preamble, with every CUTLASS path prefixed by cutlass_deps_root."""
-        header_template = '''
-/* Auto Generated code - Do not edit.*/
-#pragma once
-#include \"{cutlass_dir}cutlass/cutlass.h\"
-#include \"{cutlass_dir}cutlass/layout/matrix.h\"
-#include \"{cutlass_dir}cutlass/numeric_types.h\"
-#include \"{cutlass_dir}cutlass/epilogue/threadblock/epilogue.h\"
-#include \"{cutlass_dir}cutlass/epilogue/thread/linear_combination.h\"
-#include \"{cutlass_dir}cutlass/gemm/gemm.h\"
-#include \"{cutlass_dir}cutlass/gemm/kernel/gemm_pipelined.h\"
-#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm75.h\"
-#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm70.h\"
-#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm80.h\"
-#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_simt.h\"
-#include \"{cutlass_dir}cutlass/gemm/threadblock/threadblock_swizzle.h\"
-#include \"{cutlass_dir}cutlass/epilogue/threadblock/default_epilogue_tensor_op.h\"
-#include \"{cutlass_dir}cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h\"
-#include \"{cutlass_dir}cutlass/epilogue/threadblock/default_epilogue_simt.h\"
-#include \"{cutlass_dir}cutlass/transform/threadblock/predicated_tile_iterator.h\"
-#include \"../kernel/b2b_gemm.h\"
-#include \"../threadblock/default_b2b_mma.h\"
-'''
-        return header_template.format(cutlass_dir=self.cutlass_deps_root)
-    def gen_code(self):
-        """Return the full header text: includes, primary template, and specialization."""
-        struct_name = "Default" + self.gen_class_name
-        # Primary (unspecialized) template with an empty body.
-        primary_struct = gen_ir.gen_template_struct(struct_name, self.template_param, "", speicalized = None, set_default=False)
-        # Parameters pinned to fixed values in the specialization.
-        filter_list = [
-            ('Stages', 2),
-            ("OperatorClass", "arch::OpClassTensorOp"),
-            ("ArchTag", "arch::Sm75"),
-        ]
-        for gemm_idx in range(self.b2b_num):
-            filter_list.append((helper.var_idx("LayoutC", gemm_idx), "layout::RowMajor"))
-        remaining_params, specialized_args = gen_ir.filtered_param(self.template_param, filter_list, keep_= True)
-        body = self.gen_B2bMma(specialized_args) + self.gen_epilogue()
-        specialized_struct = gen_ir.gen_template_struct(struct_name, remaining_params, body, speicalized = specialized_args, set_default=False)
-        # Wrap innermost-first so the result is cutlass { gemm { kernel { ... } } }.
-        code = primary_struct + specialized_struct
-        for namespace_name in ("kernel", "gemm", "cutlass"):
-            code = gen_ir.gen_namespace(namespace_name, code)
-        return self.gen_include_header() + code
-class gen_Kernel:
-    """Emits the text of the `cutlass::gemm::kernel::B2bGemm` kernel struct.
-    Each gen_* method returns a fragment of C++ source as a string; gen_code
-    assembles them into the complete header text.
-    """
-    def __init__(self, template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root):
-        # The gen_class_name argument is not used; the struct name is fixed.
-        self.gen_class_name = "B2bGemm"
-        self.template_param = template_param
-        self.b2bnum = b2b_num
-        self.cutlass_deps_root = cutlass_deps_root
-        self.project_root = project_root
-    def gen_Params(self):
-        """Return the member declarations of the generated `Params` struct."""
-        gen_param = ""
-        for i in range(self.b2bnum):
-            gen_param += "    " + helper.var_idx("cutlass::gemm::GemmCoord problem_size_", i) + ";\n"
-        gen_param += "    " + "cutlass::gemm::GemmCoord grid_tiled_shape;\n"
-        gen_param += "    " + "typename B2bMma::IteratorA0::Params params_A0;\n"
-        gen_param += "    " + "typename B2bMma::IteratorA0::TensorRef ref_A0;\n"
-        for i in range(self.b2bnum):
-            gen_param += "    " + helper.var_idx("typename B2bMma::IteratorB", i) + helper.var_idx("::Params params_B", i) + ";\n"
-            gen_param += "    " + helper.var_idx("typename B2bMma::IteratorB", i) + helper.var_idx("::TensorRef ref_B", i) + ";\n"
-            # Only the last GEMM's C operand is typed via Epilogue; the earlier
-            # ones use the per-stage FusedAddBiasEpilogue iterator types.
-            if i == self.b2bnum - 1:
-                gen_param += "    " + helper.var_idx("typename Epilogue::OutputTileIterator::Params params_C", i) + ";\n"
-                gen_param += "    " + helper.var_idx("typename Epilogue::OutputTileIterator::TensorRef ref_C", i) + ";\n"
-            else:
-                gen_param += "    " + helper.var_idx("typename FusedAddBiasEpilogue", i) + helper.var_idx("::OutputTileIterator::Params params_C", i) + ";\n"
-                gen_param += "    " + helper.var_idx("typename FusedAddBiasEpilogue", i) + helper.var_idx("::OutputTileIterator::TensorRef ref_C", i) + ";\n"
-        gen_param += "    " + helper.var_idx("typename Epilogue::OutputTileIterator::Params params_D", self.b2bnum - 1) + ";\n"
-        gen_param += "    " + helper.var_idx("typename Epilogue::OutputTileIterator::TensorRef ref_D", self.b2bnum - 1) + ";\n"
-        for i in range(self.b2bnum):
-            gen_param += "    " + helper.var_idx("typename OutputOp", i) + helper.var_idx("::Params output_op_", i) + ";\n"
-        gen_param += "    " + 'int batch_count' + ";\n"
-        gen_param += "    " + 'int gemm_k_iterations_0' + ";\n"
-        return gen_param
-    def gen_Memberfunc(self):
-        """Return the default and the full constructor of the generated `Params` struct."""
-        code_default = "\nCUTLASS_HOST_DEVICE\n"
-        code_default += "Params()"
-        code_default += " { } \n\n"
-        # Constructor parameter list.
-        code_construct = "\nCUTLASS_HOST_DEVICE\n"
-        code_construct += "Params(\n"
-        for i in range(self.b2bnum):
-            code_construct += "    " + helper.var_idx("cutlass::gemm::GemmCoord const & problem_size_", i) + ",\n"
-        code_construct += "    " + "cutlass::gemm::GemmCoord const & grid_tiled_shape,\n"
-        code_construct += "    " + "typename B2bMma::IteratorA0::TensorRef ref_A0,\n"
-        for i in range(self.b2bnum):
-            code_construct += "    " + helper.var_idx("typename B2bMma::IteratorB", i) + helper.var_idx("::TensorRef ref_B", i) + ",\n"
-            if i == self.b2bnum - 1:
-                code_construct += "    " + helper.var_idx("typename Epilogue::OutputTileIterator::TensorRef ref_C", i) + ",\n"
-            else:
-                code_construct += "    " + helper.var_idx("typename FusedAddBiasEpilogue", i) + helper.var_idx("::OutputTileIterator::TensorRef ref_C", i) + ",\n"
-        code_construct += "    " + helper.var_idx("typename Epilogue::OutputTileIterator::TensorRef ref_D", self.b2bnum - 1) + ",\n"
-        for i in range(self.b2bnum):
-            code_construct += "    " + helper.var_idx("typename OutputOp", i) + helper.var_idx("::Params output_op_", i) + helper.var_idx(" = typename OutputOp", i) + "::Params(),\n"
-        code_construct += "    " + "int batch_count = 1\n"
-        # Member initializer list.
-        code_construct += "):\n"
-        for i in range(self.b2bnum):
-            code_construct += "    " + helper.var_idx("problem_size_", i) + helper.var_idx("(problem_size_", i) + "),\n"
-        code_construct += "    " + "grid_tiled_shape(grid_tiled_shape),\n"
-        code_construct += "    " + "params_A0(ref_A0.layout()),\n"
-        code_construct += "    " + "ref_A0(ref_A0),\n"
-        for i in range(self.b2bnum):
-            code_construct += "    " + helper.var_idx("params_B", i) + helper.var_idx("(ref_B", i) + ".layout()),\n"
-            code_construct += "    " + helper.var_idx("ref_B", i) + helper.var_idx("(ref_B", i) + "),\n"
-            code_construct += "    " + helper.var_idx("params_C", i) + helper.var_idx("(ref_C", i) + ".layout()),\n"
-            code_construct += "    " + helper.var_idx("ref_C", i) + helper.var_idx("(ref_C", i) + "),\n"
-        code_construct += "    " + helper.var_idx("params_D", self.b2bnum - 1) + helper.var_idx("(ref_D", self.b2bnum - 1) + ".layout()),\n"
-        code_construct += "    " + helper.var_idx("ref_D", self.b2bnum - 1) + helper.var_idx("(ref_D", self.b2bnum - 1) + "),\n"
-        for i in range(self.b2bnum):
-            code_construct += "    " + helper.var_idx("output_op_", i) + helper.var_idx("(output_op_", i) + "), \n"
-        code_construct += "    " + "batch_count(batch_count) {\n"
-        # Constructor body: ceil-divide the first GEMM's K extent by the tile K.
-        code_construct += "    " + helper.var_idx("gemm_k_iterations_", 0) + helper.var_idx(" = (problem_size_", 0) + helper.var_idx(".k() + B2bMma::Shape", 0) + helper.var_idx("::kK - 1) / B2bMma::Shape", 0) + "::kK;\n"
-        code_construct += "}\n"
-        return code_default + code_construct
-    def gen_using(self):
-        """Return the type aliases, the `Params` struct, and the `SharedStorage` union."""
-        code_using = ""
-        for i in range(self.b2bnum - 1):
-            code_using += "    " + helper.var_idx("using OutputOp", i) + helper.var_idx(" = typename B2bMma::OutputOp", i) + ";\n"
-        code_using += "    " + helper.var_idx("using OutputOp", self.b2bnum - 1) + " = typename Epilogue::OutputOp;\n"
-        for i in range(self.b2bnum - 1):
-            code_using += "    " + helper.var_idx("using FusedAddBiasEpilogue", i) + helper.var_idx(" = typename B2bMma::FusedAddBiasEpilogue", i) +";\n"
-        code_using += "    " + "using WarpCount0 = typename B2bMma::WarpCount0;\n"
-        code_using += "    " + "static int const kThreadCount = 32 * WarpCount0::kCount;\n"
-        code_using += gen_ir.gen_struct("Params", self.gen_Params() + self.gen_Memberfunc())
-        code_using += "union SharedStorage {\n"
-        code_using += "    " + "typename B2bMma::B2bMmaSharedStorage main_loop;\n"
-        code_using += "    " + "typename Epilogue::SharedStorage epilogue;\n"
-        code_using += "};\n"
-        return code_using
-    def gen_can_implement(self):
-        """Placeholder: currently returns an empty string."""
-        gen_code = ""
-        return gen_code
-    def gen_operator_and_constr(self):
-        """Return the generated struct's default constructor and its `operator()` text."""
-        ctr_code = "CUTLASS_HOST_DEVICE\n"
-        ctr_code += self.gen_class_name + "() { } \n\n"
-        operator_code = "CUTLASS_DEVICE\n"
-        operator_code += "void operator()(Params const &params, SharedStorage &shared_storage) {\n"
-        # Tile lookup and the early return for tiles outside the grid.
-        operator_code += "    " + "ThreadblockSwizzle threadblock_swizzle;\n"
-        operator_code += "    " + "cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);\n"
-        operator_code += "    " + "int batch_idx = threadblock_tile_offset.k();\n"
-        operator_code += "    " + "if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||\n"
-        operator_code += "    " + "params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {\n"
-        operator_code += "    " + "    " + "return;\n"
-        operator_code += "    " + "}\n"
-        # Threadblock offsets for A0, every B operand, and the output tile.
-        operator_code += "    " + "cutlass::MatrixCoord tb_offset_A0{\n"
-        operator_code += "    " + "    " + "threadblock_tile_offset.m() * B2bMma::Shape0::kM,\n"
-        operator_code += "    " + "    " + "0\n"
-        operator_code += "    " + "};\n"
-        for i in range(self.b2bnum):
-            operator_code += "    " + helper.var_idx("cutlass::MatrixCoord tb_offset_B", i) + "{\n"
-            operator_code += "    " + "    " + "0,\n"
-            operator_code += "    " + "    " + helper.var_idx("threadblock_tile_offset.n() * B2bMma::Shape", i) + "::kN\n"
-            operator_code += "    " + "};\n"
-        operator_code += "    " + "int thread_idx = threadIdx.x;\n\n"
-        operator_code += "    " + "MatrixCoord threadblock_offset(\n"
-        operator_code += "    " + "    " + helper.var_idx("threadblock_tile_offset.m() * B2bMma::Shape", self.b2bnum - 1) + "::kM,\n"
-        operator_code += "    " + "    " + helper.var_idx("threadblock_tile_offset.n() * B2bMma::Shape", self.b2bnum - 1) + "::kN\n"
-        operator_code += "    " + ");\n"
-        # Input iterators: A0, then one B iterator per GEMM, each advanced by batch index.
-        operator_code += "    " + "typename B2bMma::IteratorA0 iterator_A0(\n"
-        operator_code += "    " + "    " + "params.params_A0,\n"
-        operator_code += "    " + "    " + "params.ref_A0.data(),\n"
-        operator_code += "    " + "    " + "params.problem_size_0.mk(),\n"
-        operator_code += "    " + "    " + "thread_idx,\n"
-        operator_code += "    " + "    " + "tb_offset_A0);\n"
-        operator_code += "    " + "iterator_A0.add_pointer_offset(batch_idx * params.problem_size_0.m() * params.problem_size_0.k());\n\n"
-        for i in range (self.b2bnum):
-            operator_code += "    " + helper.var_idx("typename B2bMma::IteratorB", i ) + helper.var_idx(" iterator_B", i) + "(\n"
-            operator_code += "    " + "    " + helper.var_idx("params.params_B", i) + ",\n"
-            operator_code += "    " + "    " + helper.var_idx("params.ref_B", i) + ".data(),\n"
-            operator_code += "    " + "    " + helper.var_idx("params.problem_size_", i) + ".kn(),\n"
-            operator_code += "    " + "    " + "thread_idx,\n"
-            operator_code += "    " + "    " + helper.var_idx("tb_offset_B", i) + ");\n"
-            operator_code += "    " + helper.var_idx("iterator_B", i) + helper.var_idx(".add_pointer_offset(batch_idx * params.problem_size_", i) + helper.var_idx(".n() * params.problem_size_", i) + ".k());\n\n"
-        # C iterators for every GEMM except the last; the generated code uses a
-        # batch stride of n * 1 when the C stride is 0 and n * m otherwise.
-        for i in range (self.b2bnum - 1):
-            operator_code += "    " + helper.var_idx("typename FusedAddBiasEpilogue", i ) + helper.var_idx("::OutputTileIterator iterator_C", i) + "(\n"
-            operator_code += "    " + "    " + helper.var_idx("params.params_C", i) + ",\n"
-            operator_code += "    " + "    " + helper.var_idx("params.ref_C", i) + ".data(),\n"
-            operator_code += "    " + "    " + helper.var_idx("params.problem_size_" , i) + ".mn(),\n"
-            operator_code += "    " + "    " + "thread_idx,\n"
-            operator_code += "    " + "    " + "threadblock_offset" + ");\n"
-            operator_code += "    " + helper.var_idx("int ref_C", i) + helper.var_idx("_stride = params.ref_C", i) + ".stride()[0];\n"
-            operator_code += "    " + helper.var_idx("iterator_C", i) + helper.var_idx(".add_pointer_offset(batch_idx * params.problem_size_", i) + helper.var_idx(".n() * (ref_C", i) + helper.var_idx("_stride == 0 ? 1 : params.problem_size_", i) + ".m()));\n\n"
-        for i in range (self.b2bnum - 1):
-            operator_code += "    " + helper.var_idx("FusedAddBiasEpilogue", i ) + helper.var_idx(" epilogue_", i ) + ";\n"
-        operator_code += "    " + "int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);\n"
-        operator_code += "    " + "int lane_idx = threadIdx.x % 32;\n"
-        for i in range (self.b2bnum - 1):
-            operator_code += "    " + helper.var_idx("OutputOp", i) + helper.var_idx(" output_op_", i) + helper.var_idx("(params.output_op_", i) + ");\n"
-        # Main loop object, accumulators, and the fused b2bMma(...) call.
-        operator_code += "    " + "B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);\n"
-        operator_code += "    " + "typename B2bMma::FragmentC0 src_accum;\n"
-        operator_code += "    " + helper.var_idx("typename B2bMma::FragmentC", self.b2bnum - 1)+ " accumulators;\n"
-        operator_code += "    " + "src_accum.clear();\n"
-        operator_code += "    " + "accumulators.clear();\n"
-        operator_code += "    " + "b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, "
-        for i in range(self.b2bnum):
-            operator_code += helper.var_idx("iterator_B", i) + ", "
-        operator_code += "src_accum"
-        # The trailing argument groups only exist when more than one GEMM is fused.
-        if self.b2bnum != 1:
-            operator_code += ", "
-        for i in range(self.b2bnum - 1):
-            operator_code += helper.var_idx("output_op_", i) + ", "
-        for i in range(self.b2bnum - 1):
-            operator_code += helper.var_idx("epilogue_", i) + ", "
-        for i in range(self.b2bnum - 1):
-            final = ", "
-            if i == self.b2bnum - 2:
-                final =""
-            operator_code += helper.var_idx("iterator_C", i) + final
-        operator_code += ");\n"
-        # Final epilogue: C and D iterators of the last GEMM, then the epilogue call.
-        operator_code += "    " + helper.var_idx("OutputOp", self.b2bnum - 1) + helper.var_idx(" output_op_", self.b2bnum - 1) + helper.var_idx("(params.output_op_", self.b2bnum - 1) + ");\n"
-        operator_code += "    " + "threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);\n"
-        operator_code += "    " + helper.var_idx("typename Epilogue::OutputTileIterator iterator_C", self.b2bnum - 1) + "(\n"
-        operator_code += "    " + "    " + helper.var_idx("params.params_C", self.b2bnum - 1) + ",\n"
-        operator_code += "    " + "    " + helper.var_idx("params.ref_C", self.b2bnum - 1) + ".data(),\n"
-        operator_code += "    " + "    " + helper.var_idx("params.problem_size_", self.b2bnum - 1) + ".mn(),\n"
-        operator_code += "    " + "    " + "thread_idx,\n"
-        operator_code += "    " + "    " + "threadblock_offset\n"
-        operator_code += "    " + ");\n"
-        operator_code += "    " + helper.var_idx("int ref_C", self.b2bnum - 1) + helper.var_idx("_stride = params.ref_C", self.b2bnum - 1) + ".stride()[0];\n"
-        operator_code += "    " + helper.var_idx("iterator_C", self.b2bnum - 1) + helper.var_idx(".add_pointer_offset(batch_idx * params.problem_size_", self.b2bnum - 1) + helper.var_idx(".n() * (ref_C", self.b2bnum - 1) + helper.var_idx("_stride == 0 ? 1 : params.problem_size_", self.b2bnum - 1) + ".m()));\n\n"
-        operator_code += "    " + helper.var_idx("typename Epilogue::OutputTileIterator iterator_D", self.b2bnum - 1) + "(\n"
-        operator_code += "    " + "    " + helper.var_idx("params.params_D", self.b2bnum - 1) + ",\n"
-        operator_code += "    " + "    " + helper.var_idx("params.ref_D", self.b2bnum - 1) + ".data(),\n"
-        operator_code += "    " + "    " + helper.var_idx("params.problem_size_", self.b2bnum - 1) + ".mn(),\n"
-        operator_code += "    " + "    " + "thread_idx,\n"
-        operator_code += "    " + "    " + "threadblock_offset\n"
-        operator_code += "    " + ");\n"
-        operator_code += "    " + helper.var_idx("iterator_D", self.b2bnum - 1) + helper.var_idx(".add_pointer_offset(batch_idx * params.problem_size_", self.b2bnum - 1) + helper.var_idx(".n() * params.problem_size_", self.b2bnum - 1) + ".m());\n\n"
-        operator_code += "    " + "Epilogue epilogue(\n"
-        operator_code += "    " + "    " + "shared_storage.epilogue,\n"
-        operator_code += "    " + "    " + "thread_idx,\n"
-        operator_code += "    " + "    " + "warp_idx,\n"
-        operator_code += "    " + "    " + "lane_idx\n"
-        operator_code += "    " + ");\n"
-        operator_code += "    " + "epilogue("
-        operator_code += helper.var_idx("output_op_", self.b2bnum - 1) + ", "
-        operator_code += helper.var_idx("iterator_D", self.b2bnum - 1) + ", "
-        operator_code += "accumulators, "
-        operator_code += helper.var_idx("iterator_C", self.b2bnum - 1) + ");\n"
-        operator_code += "}\n"
-        return ctr_code + operator_code
-    def gen_include_header(self):
-        """Return the include preamble, with every CUTLASS path prefixed by cutlass_deps_root.
-        This is the single definition of the method; an earlier duplicate that
-        lacked the semaphore.h include was shadowed by this one and has been removed.
-        """
-        code = '''
-#pragma once
-#include \"{cutlass_dir}cutlass/cutlass.h\"
-#include \"{cutlass_dir}cutlass/gemm/gemm.h\"
-#include \"{cutlass_dir}cutlass/matrix_coord.h\"
-#include \"{cutlass_dir}cutlass/semaphore.h\"
-'''.format(cutlass_dir=self.cutlass_deps_root)
-        return code
-    def gen_code(self):
-        """Return the complete header text: the include preamble followed by the
-        B2bGemm template struct wrapped in the cutlass::gemm::kernel namespaces.
-        """
-        template_param = []
-        template_param.append(("typename", "B2bMma"))
-        template_param.append(("typename", "Epilogue"))
-        template_param.append(("typename", "ThreadblockSwizzle"))
-        # NOTE(review): this passes the Python `bool` type object rather than the
-        # string "bool"; confirm gen_ir.gen_template_struct handles that as intended.
-        template_param.append((bool, "SplitKSerial"))
-        code_body = ""
-        code_body += self.gen_using()
-        code_body += self.gen_operator_and_constr()
-        struct_code = gen_ir.gen_template_struct(self.gen_class_name, template_param, code_body)
-        namespaced_code = gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("kernel", struct_code)))
-        # Emit the preamble exactly once (it used to be prepended twice).
-        return self.gen_include_header() + namespaced_code
-class gen_kernel:
-    """Drives kernel-level generation: writes default_b2b_gemm.h and b2b_gemm.h
-    under `<output_dir>/kernel/`, then hands off to the threadblock generator.
-    """
-    def __init__(self, template_param, gen_class_name, b2b_num, output_dir, cutlass_deps_root, project_root):
-        self.template_param = template_param
-        self.template_args = []
-        self.cutlass_deps_root = cutlass_deps_root
-        self.project_root = project_root
-        self.file_dir = output_dir + "/kernel/"
-        # The struct name is fixed; only the kernel name derives from gen_class_name.
-        self.gen_class_name = "B2bGemm"
-        self.gen_kernel_name = gen_class_name + "Kernel"
-        # Sub-generators, constructed in the same order as before.
-        self.gen_default_b2b_gemm = gen_default_Gemm(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
-        self.gen_Kerenl = gen_Kernel(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
-        self.gen_threadBlock = gen_tb.gen_threadblock(template_param, gen_class_name, b2b_num, output_dir, cutlass_deps_root, project_root)
-    def gen_code(self, first_use_1stage):
-        """Generate and write both kernel headers, then run the threadblock generator.
-        first_use_1stage is forwarded unchanged to the threadblock generator.
-        """
-        outputs = (
-            (self.gen_default_b2b_gemm, "default_b2b_gemm.h", "[INFO]: Gen kernel code [default_b2b_gemm.h]output Dir: is "),
-            (self.gen_Kerenl, "b2b_gemm.h", "[INFO]: Gen kernel code [b2b_gemm.h]output Dir: is "),
-        )
-        for generator, header_name, log_prefix in outputs:
-            header_text = generator.gen_code()
-            print(log_prefix, self.file_dir)
-            with open(self.file_dir + header_name, "w+") as out_file:
-                out_file.write(header_text)
-        # Threadblock-level code is produced last.
-        self.gen_threadBlock.gen_code(first_use_1stage)

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py DELETED Viewed

@@ -1,232 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import helper
-import gen_ir as ir
-class gen_test:
-    def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"):
-        self.fuse_gemm_info = fuse_gemm_info
-        self.gen_class_name = gen_class_name
-        self.user_header_file = user_header_file
-        self.sample_dir = output_dir
-        self.b2b_num = len(fuse_gemm_info)
-    def gen_cpp_sample(self):
-        code = "/* Auto Generated code - Do not edit.*/\n"
-        code +=  "#include <cstdio> \n"
-        code += "#include \"cutlass/gemm/device/gemm_batched.h\" \n"
-        code += "#include \"cutlass/cutlass.h\" \n"
-        code += "#include \"../cutlass_irrelevant.h\" \n"
-        code += "#include \"../cutlass_verify.h\" \n"
-        code += "#include \"leaky_bias.h\" \n"
-        code +=  "#include \"utils.h\" \n"
-        code += "int main(int args, char * argv[]) {\n"
-        code += "    " + "int M = atoi(argv[1]);\n"
-        code += "    " + "int K0 = " + str(self.fuse_gemm_info[0]['mnk'][0]) + ";\n"
-        code += "    " + "if(args == 3);\n"
-        code += "    " + "    " + "K0 = atoi(argv[2]);\n"
-        code += "    " + "int B = 1;\n"
-        code += "    " + "if(args == 4);\n"
-        code += "    " + "    " + "B = atoi(argv[3]);\n"
-        code += "    " + "srand(1234UL);\n"
-        code += "    " + "int device_id = 0;\n"
-        code += "    " + "cudaGetDevice(&device_id);\n"
-        code += "    " + "cudaDeviceProp prop;\n"
-        code += "    " + "cudaGetDeviceProperties(&prop, device_id);\n"
-        code += "    " + "int sm = prop.major *10 + prop.minor;\n"
-        code += "using ElementCompute = cutlass::half_t;\n"
-        for i in range(self.b2b_num):
-            code += "    " + helper.var_idx("ElementCompute alpha", i) + " = ElementCompute(1);\n"
-            addbias = helper.get_epilogue_add_bias_or_not( self.fuse_gemm_info[i])
-            if addbias:
-                code += "    " + helper.var_idx("ElementCompute beta", i) + " = ElementCompute(1);\n"
-            else:
-                code += "    " + helper.var_idx("ElementCompute beta", i) + " = ElementCompute(0);\n"
-        code += "    " + "size_t flops = 0;\n"
-        for i in range(self.b2b_num):
-            m = self.fuse_gemm_info[i]['mnk'][0]
-            n = self.fuse_gemm_info[i]['mnk'][1]
-            k = self.fuse_gemm_info[i]['mnk'][2]
-            bias_shape = helper.get_epilogue_bias_shape(self.fuse_gemm_info[i])
-            this_k = "K0"
-            if (i > 0):
-                this_k = str(k)
-            code += "    " + "flops += size_t(2) * size_t(M) * size_t(B) * " + "size_t(" + str(n) + ") * size_t(" + this_k + ");\n"
-            code += "    " + helper.var_idx("cutlass::gemm::GemmCoord problem_size_", i) + "(" + "M" + ", " + str(n) + ", " + this_k + ");\n"
-            code += "    " + helper.var_idx("memory_unit<cutlass::half_t> Mat_A", i) + helper.var_idx("(B * problem_size_", i) + helper.var_idx(".m() * problem_size_", i) + ".k());\n"
-            code += "    " + helper.var_idx("memory_unit<cutlass::half_t> Mat_B", i) + helper.var_idx("(B * problem_size_", i) + helper.var_idx(".n() * problem_size_", i) + ".k());\n"
-            code += "    " + helper.var_idx("memory_unit<cutlass::half_t> Mat_C", i) + "(B * " + str(bias_shape[0]) + " * " + str(bias_shape[1]) + ");\n"
-            code += "    " + helper.var_idx("memory_unit<cutlass::half_t> Mat_D_cutlass_ref", i) + helper.var_idx("(B * problem_size_", i) + helper.var_idx(".m() * problem_size_", i) + ".n());\n"
-            code += "    " + helper.var_idx("Mat_A", i) + ".init();\n"
-            code += "    " + helper.var_idx("Mat_B", i) + ".init();\n"
-            code += "    " + helper.var_idx("Mat_C", i) + ".init();\n"
-        code += "    " + helper.var_idx("memory_unit<cutlass::half_t> Mat_D", self.b2b_num - 1) +  helper.var_idx("(B * problem_size_", i) + helper.var_idx(".m() * problem_size_",self.b2b_num - 1) + ".n());\n"
-        params = []
-        params.append("M")
-        params.append("B")
-        params.append("Mat_A0.device_ptr")
-        for i in range(self.b2b_num):
-            params.append(helper.var_idx("Mat_B", i) + ".device_ptr")
-            params.append(helper.var_idx("Mat_C", i) + ".device_ptr")
-            if i != self.b2b_num-1:
-                params.append(helper.var_idx("Mat_D_cutlass_ref", i) + ".device_ptr")
-        params.append(helper.var_idx("Mat_D", self.b2b_num - 1) + ".device_ptr")
-        code += "    " + "Param arguments = {\n"
-        code += "    " + "    " + "M,\n"
-        code += "    " + "    " + "K0,\n"
-        code += "    " + "    " + "B,\n"
-        code += "    " + "    " + "reinterpret_cast<const void*>(Mat_A0.device_ptr),\n"
-        cnt = 1
-        for i in range(self.b2b_num):
-            bias_flag = helper.get_epilogue_add_bias_or_not( self.fuse_gemm_info[i])
-            code += "    " + "    " + "reinterpret_cast<const void*>(" + helper.var_idx("Mat_B", i) + ".device_ptr" + "),\n"
-            cnt += 1
-            if bias_flag:
-                code += "    " + "    " + "reinterpret_cast<const void*>(" + helper.var_idx("Mat_C", i) + ".device_ptr" + "),\n"
-                cnt += 1
-            else:
-                code += "    " + "    " + "reinterpret_cast<const void*>(NULL),\n"
-            epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i])
-            acc_tp = helper.get_epilogue_compute_tp(self.fuse_gemm_info[i])
-            for arg in epilogue_args:
-                arg_value = str(arg[2])
-                code +=  "    " + "    " + helper.type_2_cutlass_type(acc_tp)  + "(" + arg_value + "),\n"
-            if i != self.b2b_num - 1:
-                code += "    " + "    " + "reinterpret_cast<void*>(" + helper.var_idx("Mat_D_cutlass_ref", i) + ".device_ptr" + "),\n"
-            else:
-                code += "    " + "    " + "reinterpret_cast<void*>(" + helper.var_idx("Mat_D", i) + ".device_ptr" + ")};\n"
-        code += "    " + "TI(FUSED_CUTLASS);\n"
-        code += "    " + "for(int i = 0; i < 100; i++){\n"
-        code += "    " + "    " + "one_api(arguments, sm, NULL);\n"
-        code += "    " + "}\n"
-        code += "    " + "TO(FUSED_CUTLASS, \"FUSED_CUTLASS\", 100);\n"
-        code += "\n"
-        for i in range(self.b2b_num):
-            code_this = ""
-            N_str = str(self.fuse_gemm_info[i]['mnk'][1])
-            code_this += "    " + helper.var_idx("typename Gemm", i) + helper.var_idx("::Arguments arguments_", i) + "{\n"
-            code_this += "    " + "    " + helper.var_idx("problem_size_", i) + ",\n"
-            ldmA = str(self.fuse_gemm_info[i]['mnk'][2])
-            if i == 0:
-                ldmA = "K0"
-            ldmB = str(self.fuse_gemm_info[i]['mnk'][2])
-            if i == 0:
-                ldmB = "K0"
-            ldmC = str(self.fuse_gemm_info[i]['mnk'][1])
-            ldmBias = str(helper.get_epilogue_bias_ldm(self.fuse_gemm_info[i]))
-            if self.fuse_gemm_info[i]['A_format'] is 'Col':
-                ldmA = "M"
-            if self.fuse_gemm_info[i]['B_format'] is 'Row':
-                ldmB = str(self.fuse_gemm_info[i]['mnk'][1])
-            if self.fuse_gemm_info[i]['C_format'] is 'Col':
-                ldmC = "M"
-            if i == 0:
-                code_this += "    " + "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("Mat_A", i) + ".device_ptr), " + ldmA + "}, " + "M * " + ldmA + ",\n"
-            else:
-                code_this += "    " + "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("Mat_D_cutlass_ref", i - 1) + ".device_ptr), " + ldmA + "}, " + "M * " + ldmA + ",\n"
-            code_this += "    " + "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_tp']) + "*>(" + helper.var_idx("Mat_B", i) + ".device_ptr), " + ldmB + "}, " + N_str + " * " + ldmB + ",\n"
-            M_bias = str(helper.get_epilogue_bias_shape(self.fuse_gemm_info[i])[0])
-            code_this += "    " + "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("Mat_C", i) + ".device_ptr), " + ldmBias + "}, " + M_bias + " * " + N_str + ",\n"
-            code_this += "    " + "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("Mat_D_cutlass_ref", i) + ".device_ptr), " + ldmC + "}, " + "M * " + ldmC + ",\n"
-            code_this += "    " + "    " + "{ " + helper.var_idx("alpha", i) + ", " + helper.var_idx("beta", i)
-            for epilogue_arg in  helper.get_epilogue_args(self.fuse_gemm_info[i]):
-                arg_value = str(epilogue_arg[2])
-                code_this += ", " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(" + str(arg_value) + ")"
-            code_this += "    " + " },\n"
-            code_this += "    " + "    " + "B};\n"
-            code += code_this
-        code += "    " + "TI(UNFUSED_CUTLASS);\n"
-        code += "    " + "for(int i = 0; i < 100; i++){\n"
-        code += "    " + "    " + self.gen_class_name + "_verify(\n"
-        for i in range(self.b2b_num):
-            code += "    " + "    " + "    " + helper.var_idx("arguments_", i) + ",\n"
-        code += "    " + "    " + "    " + "NULL);\n"
-        code += "    " + "}\n"
-        code += "    " + "TO(UNFUSED_CUTLASS, \"UNFUSED_CUTLASS\", 100);\n"
-        code += "    " + helper.var_idx("Mat_D_cutlass_ref", self.b2b_num - 1) + ".d2h();\n"
-        code += "    " + helper.var_idx("Mat_D", self.b2b_num - 1) + ".d2h();\n"
-        code += "    " + helper.var_idx("check_result(Mat_D_cutlass_ref", self.b2b_num - 1) + helper.var_idx(".host_ptr, Mat_D", self.b2b_num - 1) \
-                       + helper.var_idx(".host_ptr, Mat_D", self.b2b_num - 1) + ".elements);\n"
-        code += "\n\n}\n"
-        with open(self.sample_dir + "sample.cu", "w+") as f:
-            f.write(code)

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py DELETED Viewed

@@ -1,1013 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import gen_ir
-import helper
-class gen_default_b2b_mma:
-    def __init__(self, template_param, gen_class_name, b2b_num,cutlass_deps_root, project_root):
-        self.gen_class_name = "DefaultB2bMma"
-        self.template_param = template_param
-        self.b2b_num = b2b_num
-        self.cutlass_deps_root = cutlass_deps_root
-        self.project_root = project_root
-    def gen_include_header(self):
-        code = '''
-/* Auto Generated code - Do not edit.*/
-#pragma once
-#include \"{cutlass_dir}cutlass/cutlass.h\"
-#include \"{cutlass_dir}cutlass/numeric_types.h\"
-#include \"{cutlass_dir}cutlass/arch/arch.h\"
-#include \"{cutlass_dir}cutlass/transform/threadblock/predicated_tile_iterator.h\"
-#include \"{cutlass_dir}cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h\"
-#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm70.h\"
-#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm75.h\"
-#include \"{cutlass_dir}cutlass/gemm/threadblock/default_mma_core_sm80.h\"
-#include \"../threadblock/b2b_mma_pipelined.h\"
-#include \"../../fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h\"
-#include \"../../fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h\"
-#include \"../../fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h\"
-'''.format(cutlass_dir=self.cutlass_deps_root)
-        return code
-    def gen_using_MmaCore(self, stage):
-        threadBlockShape = "ThreadblockShape"
-        warpShape = "WarpShape"
-        instrunctionShape = "InstructionShape"
-        Mma_typename = "typename cutlass::gemm::threadblock::DefaultMmaCore"
-        gen_code = ""
-        for i in range(self.b2b_num):
-            code_using = "using MmaCore" + str(i)
-            gen_code += code_using + " = " + gen_ir.gen_declare_template_struct(Mma_typename, \
-                                                helper.var_idx(threadBlockShape, i), helper.var_idx(warpShape, i), instrunctionShape, \
-                                                "ElementA", "LayoutA", \
-                                                helper.var_idx("ElementB", i), helper.var_idx("LayoutB", i), \
-                                                helper.var_idx("ElementAccumulator", i), "layout::RowMajor", \
-                                                "OperatorClass", str(stage), "Operator")
-        return gen_code
-    def gen_using_FusedAddBiasEpilogue(self):
-        gen_code = ""
-        for i in range(self.b2b_num - 1):
-            code_using = helper.var_idx("using FusedAddBiasEpilogue", i)
-            epilogue_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp"
-            template_args = helper.var_idx("<ThreadblockShape", i) + helper.var_idx(",typename MmaCore", i) + helper.var_idx("::MmaPolicy::Operator, 1, EpilogueOutputOp", i) + ", 2>::Epilogue"
-            gen_code += code_using + " = " + epilogue_name + template_args + ";\n"
-        return gen_code
-    def gen_using_Iterator(self):
-        code_using = "using IteratorA0"
-        iterator_typename = "cutlass::transform::threadblock::PredicatedTileIterator"
-        MmaCore = "MmaCore0"
-        matrix_shape = "cutlass::MatrixShape<" + MmaCore + "::Shape::kM, " + MmaCore + "::Shape::kK>"
-        iterator_map = "typename " + MmaCore + "::IteratorThreadMapA"
-        gen_code = code_using + " = " + gen_ir.gen_declare_template_struct(iterator_typename, \
-                                                matrix_shape, "ElementA", "LayoutA", "1", iterator_map, "AlignmentA_")
-        for i in range(self.b2b_num):
-            code_using = "using IteratorB" + str(i)
-            iterator_typename = "cutlass::transform::threadblock::PredicatedTileIterator"
-            MmaCore = "MmaCore" + str(i)
-            matrix_shape = "cutlass::MatrixShape<" + MmaCore + "::Shape::kK, " + MmaCore + "::Shape::kN>"
-            iterator_map = "typename " + MmaCore + "::IteratorThreadMapB"
-            gen_code += code_using + " = " + gen_ir.gen_declare_template_struct(iterator_typename, \
-                                                matrix_shape, helper.var_idx("ElementB", i), helper.var_idx("LayoutB", i), "0", iterator_map, "AlignmentB_")
-        return gen_code
-    def gen_fragment_iterator(self):
-        gen_code = "using AccumulatorLayout = cutlass::layout::ColumnMajor;\n"
-        for i in range(1, self.b2b_num):
-            code_using = "using FragmentIteratorA" + str(i)
-            iterator_typename = "cutlass::gemm::warp::MmaTensorOpPureFragmentIterator"
-            curr_MmaCore = "MmaCore" + str(i)
-            prev_MmaCore = "MmaCore" + str(i - 1)
-            Matrix_shape_curr = "cutlass::MatrixShape<" + curr_MmaCore + "::WarpShape::kM, " + curr_MmaCore + "::InstructionShape::kK>"
-            Matrix_shape_prev = "cutlass::MatrixShape<" + prev_MmaCore + "::WarpShape::kM, " + prev_MmaCore + "::WarpShape::kN>"
-            Curr_shape_kK = curr_MmaCore + "::Shape::kK"
-            gen_code += code_using + " = " + gen_ir.gen_declare_template_struct(iterator_typename, \
-                                                Matrix_shape_curr, Matrix_shape_prev, Curr_shape_kK, \
-                                                    helper.var_idx("ElementAccumulator", i-1), "ElementA", \
-                                                        "AccumulatorLayout", "InstructionShape_", "true")
-        return gen_code
-    def gen_threadblockmma(self):
-        code_using = "using ThreadblockB2bMma"
-        iterator_typename = "cutlass::gemm::threadblock::B2bMmaPipelined"
-        MmaPipelined_param_Mma0_shape = "typename MmaCore0::Shape"
-        MmaPipelined_param_Mma0_iteratorA = "IteratorA0"
-        MmaPipelined_param_Mma0_smemIteratorA = "typename MmaCore0::SmemIteratorA"
-        MmaPipelined_param_Mma0_iteratorB = "IteratorB0"
-        MmaPipelined_param_Mma0_smemIteratorB = "typename MmaCore0::SmemIteratorB"
-        MmaPipelined_param_list = MmaPipelined_param_Mma0_shape + ", " + MmaPipelined_param_Mma0_iteratorA + ", " + MmaPipelined_param_Mma0_smemIteratorA + ", " + MmaPipelined_param_Mma0_iteratorB + ", " + MmaPipelined_param_Mma0_smemIteratorB + ", "
-        for i in range(1, self.b2b_num):
-            MmaPipelined_param_Mma_shape = "typename MmaCore" + str(i) + "::Shape"
-            MmaPipelined_param_Mma_iteratorA = "FragmentIteratorA" + str(i)
-            MmaPipelined_param_Mma_iteratorB = "IteratorB" + str(i)
-            MmaPipelined_param_Mma_smemIteratorB = "typename MmaCore" + str(i) + "::SmemIteratorB"
-            MmaPipelined_param_list += MmaPipelined_param_Mma_shape + ", " + MmaPipelined_param_Mma_iteratorA + ", " + MmaPipelined_param_Mma_iteratorB + ", " + MmaPipelined_param_Mma_smemIteratorB + ", "
-        MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, "
-        for i in range(self.b2b_num - 1):
-            epilogue_name = "EpilogueOutputOp" + str(i)
-            MmaPipelined_param_list += epilogue_name + ", "
-        for i in range(self.b2b_num - 1):
-            epilogue_name = "FusedAddBiasEpilogue" + str(i)
-            MmaPipelined_param_list += epilogue_name + ", "
-        for i in range(self.b2b_num):
-            MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy"
-            MmaPipelined_param_list += MmaPolicy + ", "
-        cnt = 0
-        for i in range(self.b2b_num):
-            MmaStage = helper.var_idx("Stages", i)
-            final = ", "
-            if cnt == self.b2b_num - 1:
-                final = ""
-            MmaPipelined_param_list += MmaStage + final
-            cnt += 1
-        gen_code = code_using + " = " + gen_ir.gen_declare_template_struct(iterator_typename, MmaPipelined_param_list)
-        return gen_code
-    def gen_code(self):
-        gen_using = ''
-        # Generate default template struct
-        gen_code = gen_ir.gen_template_struct(self.gen_class_name, self.template_param, "", speicalized = None, set_default=False)
-        # Generate specialized template struct
-        mmacore_codebody = self.gen_using_MmaCore(2)
-        iterator_codebody = self.gen_using_Iterator()
-        fragment_iterator_codebody = self.gen_fragment_iterator()
-        epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilogue()
-        threadBlockMma = self.gen_threadblockmma()
-        specialized_code = mmacore_codebody + iterator_codebody + fragment_iterator_codebody + epilogue_iterator_codebody + threadBlockMma
-        # Specialize layout C -> cutlass::layout::RowMajor
-        rtn_template_args, speicalized_template_args = gen_ir.filtered_param(self.template_param, [ ('LayoutD', "cutlass::layout::RowMajor")], keep_= True)
-        gen_speical_code = gen_ir.gen_template_struct(self.gen_class_name, rtn_template_args, specialized_code, speicalized = speicalized_template_args, set_default=False)
-        code = gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", gen_code + gen_speical_code)))
-        return self.gen_include_header() + code
-class gen_b2b_mme_pipelined:
-    def __init__(self, template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root):
-        self.gen_class_name = "B2bMmaPipelined"
-        self.template_param = template_param
-        self.b2b_num = b2b_num
-        self.cutlass_deps_root = cutlass_deps_root
-        self.project_root = project_root
-    def gen_include_header(self):
-        code = '''
-#pragma once
-#include \"{cutlass_dir}cutlass/cutlass.h\"
-#include \"{cutlass_dir}cutlass/array.h\"
-#include \"{cutlass_dir}cutlass/aligned_buffer.h\"
-#include \"{cutlass_dir}cutlass/numeric_conversion.h\"
-#include \"{cutlass_dir}cutlass/numeric_types.h\"
-#include \"{cutlass_dir}cutlass/matrix_shape.h\"
-#include \"{cutlass_dir}cutlass/gemm/gemm.h\"
-#include \"{cutlass_dir}cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h\"
-#include \"../threadblock/b2b_mma_base.h\"\n'''.format(cutlass_dir = self.cutlass_deps_root)
-        return code
-    def gen_using(self):
-        code_using = "using FragmentA0 = typename IteratorA0::Fragment;\n"
-        code_using += "using Base = B2bMmaBase<"
-        for i in range(self.b2b_num):
-            code_using += helper.var_idx("Shape", i) + "_, "
-        for i in range(self.b2b_num):
-            code_using += helper.var_idx("Policy", i) + "_, "
-        for i in range(self.b2b_num):
-            code_using += helper.var_idx("Stage", i) + "_, "
-        code_using = code_using[: -2] + ">;\n"
-        for i in range(self.b2b_num):
-            code_using += helper.var_idx("using FragmentB", i) + helper.var_idx(" = typename IteratorB", i) + "::Fragment;\n"
-            code_using += helper.var_idx("using FragmentC", i) + helper.var_idx(" = typename Policy", i) + "::Operator::FragmentC;\n"
-            code_using += helper.var_idx("using Operator", i) + helper.var_idx(" = typename Policy", i) + "::Operator;\n"
-        for i in range(self.b2b_num - 1):
-            code_using += helper.var_idx("using IteratorC", i) + helper.var_idx(" = typename FusedAddBiasEpilogue", i) + "::OutputTileIterator;\n"
-        code_using += "using ArchTag = typename Policy0::Operator::ArchTag;\n"
-        code_using += "static ComplexTransform const kTransformA0 = Operator0::kTransformA;\n"
-        for i in range(self.b2b_num):
-            code_using += helper.var_idx("static ComplexTransform const kTransformB", i) + helper.var_idx(" = Operator", i) + "::kTransformB;\n"
-        code_using += "private:\n"
-        code_using += "using WarpFragmentA0 = typename Operator0::FragmentA;\n"
-        code_using += "using WarpFragmentB0 = typename Operator0::FragmentB;\n"
-        for i in range(1, self.b2b_num):
-            code_using += helper.var_idx("using WarpFragmentA", i) + helper.var_idx(" = typename FragmentIteratorA", i) + "::Fragment;\n"
-            code_using += helper.var_idx("using WarpFragmentB", i) + helper.var_idx(" = typename Operator", i) + "::FragmentB;\n"
-        code_using += "protected:\n"
-        code_using += "SmemIteratorA0 smem_iterator_A_;\n"
-        for i in range(self.b2b_num):
-            code_using += helper.var_idx("SmemIteratorB", i) +  helper.var_idx(" smem_iterator_B", i) + "_;\n"
-        return code_using
-    def gen_operator(self, first_use_1stage = False):
-        code = ""
-        def gen_operator_param(b2b_num):
-            param_code = ""
-            param_code += "int gemm_k_iterations_0,\n"
-            param_code += helper.var_idx("FragmentC", b2b_num-1) +  helper.var_idx(" &accum", b2b_num-1) + ",\n"
-            param_code += "IteratorA0 iterator_A,\n"
-            for i in range(b2b_num):
-                param_code += helper.var_idx("IteratorB", i) + " " + helper.var_idx("iterator_B", i) + ",\n"
-            param_code += "FragmentC0 const &src_accum, \n"
-            for i in range(b2b_num - 1):
-                param_code += helper.var_idx("OutputOp", i) + " " + helper.var_idx("output_op_", i) + ",\n"
-            for i in range(b2b_num - 1):
-                param_code += helper.var_idx("FusedAddBiasEpilogue", i) + " " + helper.var_idx("epilogue_", i) + ",\n"
-            for i in range(b2b_num - 1):
-                param_code += helper.var_idx("IteratorC", i) + " " + helper.var_idx("iterator_C", i) + ",\n"
-            param_code += "TransformA0 transform_A0 = TransformA0(), \n"
-            for i in range(b2b_num):
-                final = "(),\n"
-                if i == b2b_num - 1:
-                    final = "()\n"
-                param_code += helper.var_idx("TransformB", i) + " " + helper.var_idx("transform_B", i) + " = " +helper.var_idx("TransformB", i) + final
-            return param_code
-        def gen_first_gemm_1stage(b2b_num):
-            accu_code = "     FragmentC0 accum0 = src_accum;\n"
-            if b2b_num == 1:
-                accu_code = "    accum0 = src_accum;\n"
-            code ="\
-\n\
-    FragmentA0 tb_frag_A;\n\
-    FragmentB0 tb_frag_B0;\n\
-\n\
-    int smem_write_stage_idx = 1;\n\
-\n\
-    tb_frag_A.clear();\n\
-    tb_frag_B0.clear();\n\
-\n\
-    // The last kblock is loaded in the prolog\n\
-    iterator_A.load(tb_frag_A);\n\
-    iterator_B0.load(tb_frag_B0);\n\
-\n\
-    ++iterator_A;\n\
-    ++iterator_B0;\n\
-\n\
-    WarpFragmentA0 warp_frag_A0;\n\
-    WarpFragmentB0 warp_frag_B0;\n\
-\n\
-    Operator0 warp_mma0;\n\
-\n\
-    // Avoid reading out of bounds\n\
-    if (gemm_k_iterations_0 <= 1) {\n\
-      iterator_A.clear_mask();\n\
-      iterator_B0.clear_mask();\n\
-    }\n\
-\n\
-    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-    // shared memory loads (which have the tightest latency requirement).\n\
-\n\
-    //\n\
-    // Mainloop\n\
-    //\n\
-\n\
-    // Note: The main loop does not support Base::WarpGemmIterations == 2.\n\
-    CUTLASS_GEMM_LOOP\n\
-    for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {\n\
-\n\
-      this->smem_iterator_A_.store(tb_frag_A);\n\
-      this->smem_iterator_B0_.store(tb_frag_B0);\n\
-\n\
-      __syncthreads();\n\
-      //\n\
-      // Loop over GEMM K dimension\n\
-      //\n\
-\n\
-      CUTLASS_PRAGMA_UNROLL\n\
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {\n\
-\n\
-        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group\n\
-        // as the case may be.\n\
-\n\
-        this->warp_tile_iterator_A0_.set_kgroup_index(warp_mma_k % Base::kWarpGemmIterations0);\n\
-        this->warp_tile_iterator_B0_.set_kgroup_index(warp_mma_k % Base::kWarpGemmIterations0);\n\
-\n\
-        this->warp_tile_iterator_A0_.load(warp_frag_A0);\n\
-        this->warp_tile_iterator_B0_.load(warp_frag_B0);\n\
-\n\
-        ++this->warp_tile_iterator_A0_;\n\
-        ++this->warp_tile_iterator_B0_;\n\
-\n\
-        warp_mma0(accum0, warp_frag_A0, warp_frag_B0, accum0);\n\
-      }\n\
-      this->warp_tile_iterator_A0_.add_tile_offset({0, -Policy0::kPartitionsK * Base::kWarpGemmIterations0});\n\
-      this->warp_tile_iterator_B0_.add_tile_offset({-Policy0::kPartitionsK * Base::kWarpGemmIterations0, 0});\n\
-\n\
-      __syncthreads();\n\
-      iterator_A.load(tb_frag_A);\n\
-      iterator_B0.load(tb_frag_B0);\n\
-\n\
-      ++iterator_A;\n\
-      ++iterator_B0;\n\
-\n\
-      if(gemm_k_iterations_0 <= 2) {\n\
-        iterator_A.clear_mask();\n\
-        iterator_B0.clear_mask();\n\
-      }\n\
-    }\n"
-            return accu_code + code
-        def gen_first_gemm_2stage(b2b_num):
-            accu_code = "     FragmentC0 accum0 = src_accum;\n"
-            if b2b_num == 1:
-                accu_code = "    accum0 = src_accum;\n"
-            code ="\
-\n\
-    FragmentA0 tb_frag_A;\n\
-    FragmentB0 tb_frag_B0;\n\
-\n\
-    tb_frag_A.clear();\n\
-    tb_frag_B0.clear();\n\
-\n\
-    // The last kblock is loaded in the prolog\n\
-    iterator_A.load(tb_frag_A);\n\
-    iterator_B0.load(tb_frag_B0);\n\
-\n\
-    ++iterator_A;\n\
-    ++iterator_B0;\n\
-\n\
-    this->smem_iterator_A_.store(tb_frag_A);\n\
-    this->smem_iterator_B0_.store(tb_frag_B0);\n\
-\n\
-    ++this->smem_iterator_A_;\n\
-    ++this->smem_iterator_B0_;\n\
-\n\
-    __syncthreads();\n\
-\n\
-    // Pair of fragments used to overlap shared memory loads and math instructions\n\
-    WarpFragmentA0 warp_frag_A0[2];\n\
-    WarpFragmentB0 warp_frag_B0[2];\n\
-\n\
-    this->warp_tile_iterator_A0_.set_kgroup_index(0);\n\
-    this->warp_tile_iterator_B0_.set_kgroup_index(0);\n\
-\n\
-    this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);\n\
-    this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);\n\
-\n\
-    ++this->warp_tile_iterator_A0_;\n\
-    ++this->warp_tile_iterator_B0_;\n\
-\n\
-    Operator0 warp_mma0;\n\
-\n\
-    int smem_write_stage_idx = 1;\n\
-\n\
-    // Avoid reading out of bounds\n\
-    if (gemm_k_iterations_0 <= 1) {\n\
-      iterator_A.clear_mask();\n\
-      iterator_B0.clear_mask();\n\
-    }\n\
-\n\
-    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-    // shared memory loads (which have the tightest latency requirement).\n\
-    iterator_A.load(tb_frag_A);\n\
-\n\
-    //\n\
-    // Mainloop\n\
-    //\n\
-\n\
-    // Note: The main loop does not support Base::WarpGemmIterations == 2.\n\
-    CUTLASS_GEMM_LOOP\n\
-    for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {\n\
-\n\
-      //\n\
-      // Loop over GEMM K dimension\n\
-      //\n\
-\n\
-      CUTLASS_PRAGMA_UNROLL\n\
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {\n\
-\n\
-        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group\n\
-        // as the case may be.\n\
-\n\
-        if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {\n\
-\n\
-          // Write fragments to shared memory\n\
-          this->smem_iterator_A_.store(tb_frag_A);\n\
-\n\
-          this->smem_iterator_B0_.store(tb_frag_B0);\n\
-\n\
-          __syncthreads();\n\
-\n\
-          // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\
-          // shared memory loads (which have the tightest latency requirement).\n\
-          iterator_A.load(tb_frag_A);\n\
-          \n\
-          ++this->smem_iterator_B0_;\n\
-          ++this->smem_iterator_A_;\n\
-        \n\
-\n\
-          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory\n\
-          if (smem_write_stage_idx == 1) {\n\
-            this->smem_iterator_A_.add_tile_offset({0, -Base::Stage0});\n\
-            this->smem_iterator_B0_.add_tile_offset({-Base::Stage0, 0});\n\
-          }\n\
-          else {\n\
-            this->warp_tile_iterator_A0_.add_tile_offset(\n\
-                {0, -Base::Stage0 * Policy0::kPartitionsK * Base::kWarpGemmIterations0});\n\
-            this->warp_tile_iterator_B0_.add_tile_offset(\n\
-                {-Base::Stage0 * Policy0::kPartitionsK * Base::kWarpGemmIterations0,\n\
-                 0});\n\
-          }\n\
-\n\
-          smem_write_stage_idx ^= 1;\n\
-        }\n\
-\n\
-        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);\n\
-        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);\n\
-        \n\
-        this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);\n\
-        this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);\n\
-\n\
-        ++this->warp_tile_iterator_A0_;\n\
-        ++this->warp_tile_iterator_B0_;\n\
-\n\
-        if (warp_mma_k == 0) {\n\
-\n\
-          iterator_B0.load(tb_frag_B0);\n\
-\n\
-          ++iterator_A;\n\
-          ++iterator_B0;\n\
-\n\
-          // Avoid reading out of bounds if this was the last loop iteration\n\
-          if (gemm_k_iterations_0 <= 2) {\n\
-            iterator_A.clear_mask();\n\
-            iterator_B0.clear_mask();\n\
-          }\n\
-        }\n\
-\n\
-        warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], warp_frag_B0[warp_mma_k % 2], accum0);\n\
-      }\n\
-    }\n"
-            return accu_code + code
-        def gen_other_gemms_2stage(b2b_num):
-            code = ""
-            def gemm_teamplate(id):
-                code = "// " + str(id + 1) + " Gemm"
-                code += "    /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile\n"
-                code += "    " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilogue_accu", id - 1) + ";\n"
-                code += "    " + helper.var_idx("epilogue_", id - 1) + helper.var_idx("(output_op_", id - 1) + helper.var_idx(", accum", id - 1) \
-                               + helper.var_idx(", after_epilogue_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n"
-                #    FragmentIteratorA1 warp_tile_iterator_A1_(accum0);
-                code += "    " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilogue_accu", id - 1) + ");\n"
-                #    FragmentB1 tb_frag_B1;
-                code += "    " +  helper.var_idx("FragmentB", id) + " " + helper.var_idx("tb_frag_B", id) + ";\n"
-                #    tb_frag_B1.clear();
-                code += "    " +  helper.var_idx("tb_frag_B", id)  + ".clear();\n"
-                #    iterator_B1.load(tb_frag_B1);
-                code += "    " + helper.var_idx("iterator_B", id) + ".load(" + helper.var_idx("tb_frag_B", id) + ");\n"
-                #    ++iterator_B1;
-                code += "    " +  "++" +  helper.var_idx("iterator_B", id) + ";\n"
-                #    this->smem_iterator_B1_.store(tb_frag_B1);
-                code += "    " +  helper.var_idx("this->smem_iterator_B", id) + "_.store(" + helper.var_idx("tb_frag_B", id) + ");\n"
-                #    ++this->smem_iterator_B1_;
-                code += "    " +  helper.var_idx("++this->smem_iterator_B", id) + "_;\n"
-                #    __syncthreads();
-                code += "    " +  "__syncthreads();\n"
-                #    WarpFragmentA1 warp_frag_A1[2];
-                code += "    " + helper.var_idx("WarpFragmentA", id) + helper.var_idx(" warp_frag_A", id) + "[2];\n"
-                #    WarpFragmentB1 warp_frag_B1[2];
-                code += "    " + helper.var_idx("WarpFragmentB", id) + helper.var_idx(" warp_frag_B", id) + "[2];\n"
-                #    this->warp_tile_iterator_B1_.set_kgroup_index(0);
-                code += "    " + helper.var_idx("this->warp_tile_iterator_B", id) + "_.set_kgroup_index(0);\n"
-                #    warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0);
-                code += "    " + helper.var_idx("warp_tile_iterator_A", id) + helper.var_idx("_.load(warp_frag_A", id) + "[0]);\n"
-                #    this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
-                code += "    " + helper.var_idx("this->warp_tile_iterator_B", id) + helper.var_idx("_.load(warp_frag_B", id) + "[0]);\n"
-                #    ++warp_tile_iterator_A1_;
-                code +=  "    " + helper.var_idx("++warp_tile_iterator_A", id) + "_;\n"
-                #    ++this->warp_tile_iterator_B1_;
-                code +=  "    " + helper.var_idx("++this->warp_tile_iterator_B", id) + "_;\n"
-                #    Operator1 warp_mma1;
-                code +=  "    " + helper.var_idx("Operator", id) + " " + helper.var_idx("warp_mma", id) + ";\n"
-                #    smem_write_stage_idx = 1;
-                code +=  "    " + "smem_write_stage_idx = 1;\n"
-                #    int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
-                code += "    " + helper.var_idx("int gemm_k_iterations_", id) + " = " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx("::Policy::kIterations / Base::kWarpGemmIterations", id) +";\n"
-                #    if (gemm_k_iterations_1 <= 1) {
-                #      iterator_B1.clear_mask();
-                #    }
-                code += "    "  + "if ("  + helper.var_idx("gemm_k_iterations_", id) + " <= 1 ){\n" \
-                    + "    "  + "    " + helper.var_idx("iterator_B", id) + ".clear_mask();\n" \
-                    + "    "  +"}\n"
-                #    CUTLASS_PRAGMA_UNROLL
-                code += "    " + "CUTLASS_PRAGMA_UNROLL\n"
-                #    for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
-                code += "    " + helper.var_idx("for (; gemm_k_iterations_", id) + helper.var_idx(" > 0; --gemm_k_iterations_", id) + ") {\n"
-                #      CUTLASS_PRAGMA_UNROLL
-                code += "    " + "    " + "CUTLASS_PRAGMA_UNROLL\n"
-                #      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
-                code += "    " + "    " + helper.var_idx("for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations", id) + "; ++warp_mma_k) {\n"
-                #        if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
-                code += "    " + "    " + "    " + helper.var_idx("if (warp_mma_k == Base::kWarpGemmIterations", id) + " - 1) {\n"
-                #          this->smem_iterator_B1_.store(tb_frag_B1);
-                code += "    " + "    " + "    " + "    " + helper.var_idx(" this->smem_iterator_B", id) + helper.var_idx("_.store(tb_frag_B", id) + ");\n"
-                #          __syncthreads();
-                code += "    " + "    " + "    " + "    " + "__syncthreads();\n"
-                #          ++smem_iterator_B1_;
-                code += "    " + "    " + "    " + "    " + helper.var_idx(" ++smem_iterator_B", id)  + "_;\n"
-                #          if (smem_write_stage_idx == 1) {
-                #            smem_iterator_B1_.add_tile_offset({-Base::Stage, 0});
-                #          }
-                code += "    " + "    " + "    " + "    "  + "if ( smem_write_stage_idx == 1 ) {\n" \
-                    + "    " + "    " + "    " + "    " + "    " + helper.var_idx("smem_iterator_B", id) + helper.var_idx("_.add_tile_offset({-Base::Stage", i) + ", 0});\n" \
-                    + "    " + "    " + "    " + "    "  +"}\n"
-                #          else {
-                #            this->warp_tile_iterator_B1_.add_tile_offset(
-                #                {-Base::Stage * Policy1::kPartitionsK *
-                #                     Base::kWarpGemmIterations1,
-                #                 0});
-                #          }
-                code += "    " + "    " + "    " + "    "  + "else {\n" \
-                    + "    " + "    " + "    " + "    " + "    " + helper.var_idx("this->warp_tile_iterator_B", id) + "_.add_tile_offset(\n" \
-                    + "    " + "    " + "    " + "    " + "    " + helper.var_idx("{-Base::Stage", id) + helper.var_idx(" * Policy", id) + "::kPartitionsK *\n" \
-                    + "    " + "    " + "    " + "    " + "    " + helper.var_idx("Base::kWarpGemmIterations", id) + ",\n" \
-                    + "    " + "    " + "    " + "    " + "    " + "0});\n" \
-                    + "    " + "    " + "    " + "    "  + "}\n"
-                #          smem_write_stage_idx ^= 1;
-                #        }
-                code += "    " + "    " + "    " + "    "  + "smem_write_stage_idx ^= 1;\n" \
-                    + "    " + "    " + "    " + "}\n"
-                #        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
-                code += "    " + "    " + "    " + helper.var_idx("this->warp_tile_iterator_B", id) + helper.var_idx("_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations", id) + ");\n"
-                #        warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
-                code += "    " + "    " + "    " + helper.var_idx("warp_tile_iterator_A", id) + helper.var_idx("_.load(warp_frag_A", id) + "[(warp_mma_k + 1) % 2]);\n"
-                #        this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
-                code += "    " + "    " + "    " + helper.var_idx("this->warp_tile_iterator_B", id) + helper.var_idx("_.load(warp_frag_B", id) + "[(warp_mma_k + 1) % 2]);\n"
-                #        ++warp_tile_iterator_A1_;
-                code += "    " + "    " + "    " + helper.var_idx("++warp_tile_iterator_A", id) + "_;\n"
-                #        ++this->warp_tile_iterator_B1_;
-                code += "    " + "    " + "    " + helper.var_idx("++this->warp_tile_iterator_B", id) + "_;\n"
-                #        if (warp_mma_k == 0) {
-                #          iterator_B1.load(tb_frag_B1);
-                #          ++iterator_B1;
-                #          if (gemm_k_iterations_1 <= 2) {
-                #            iterator_B1.clear_mask();
-                #          }
-                #        }
-                code += "    " + "    " + "    " + " if (warp_mma_k == 0) {\n" \
-                    + "    " + "    " + "    " + "    " + helper.var_idx("iterator_B", id) + helper.var_idx(".load(tb_frag_B", id) + ");\n" \
-                    + "    " + "    " + "    " + "    " + helper.var_idx("++iterator_B", id) +";\n" \
-                    + "    " + "    " + "    " + "    " + helper.var_idx("if (gemm_k_iterations_", id) +" <= 2) {\n" \
-                    + "    " + "    " + "    " + "    " + "    " + helper.var_idx("iterator_B", id) + ".clear_mask();\n" \
-                    + "    " + "    " + "    " + "    " + "}\n" \
-                    + "    " + "    " + "    " + "}\n"
-                #        warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], warp_frag_B1[warp_mma_k % 2], accum);
-                #      }
-                #    }
-                code += "    " + "    " + "    " + helper.var_idx("warp_mma", id) + helper.var_idx("(accum", id) + helper.var_idx(", warp_frag_A", id) + helper.var_idx("[warp_mma_k % 2], warp_frag_B", id) + helper.var_idx("[warp_mma_k % 2], accum", id) + ");\n" \
-                    + "    " + "    " + "}\n" \
-                    + "    " + "}\n\n\n"
-                return code
-            for i in range (1, b2b_num):
-                clear_accu = ""
-                if i != b2b_num - 1:
-                    clear_accu = "    " + helper.var_idx("FragmentC", i) +  helper.var_idx(" accum", i) +";\n"
-                    clear_accu += "    " + helper.var_idx("accum", i) +".clear();\n"
-                code += clear_accu + gemm_teamplate(i)
-            return code
-        operator_code = " CUTLASS_DEVICE\n\
-  void operator()(\n " + gen_operator_param(self.b2b_num) + ") {\n"
-        if first_use_1stage:
-            operator_code += gen_first_gemm_1stage(self.b2b_num)
-        else:
-            operator_code += gen_first_gemm_2stage(self.b2b_num)
-        operator_code += gen_other_gemms_2stage(self.b2b_num) + "}\n"
-        return operator_code
-    def gen_construct_func(self):
-        name = self.gen_class_name
-        func_code = "CUTLASS_DEVICE\n"
-        func_code += name + "(\n" \
-                    + "    " + "typename Base::B2bMmaSharedStorage &shared_storage,\n" \
-                    + "    " + "int thread_idx,\n" \
-                    + "    " + "int warp_idx,\n" \
-                    + "    " + "int lane_idx\n" \
-                    + "):\n"
-        func_code +=  "    " + "Base(shared_storage, thread_idx, warp_idx, lane_idx),\n" \
-                    + "    " + "smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx),\n"
-        for i in range(self.b2b_num):
-            final = ",\n"
-            if i == self.b2b_num - 1:
-                final = " {\n"
-            func_code += helper.var_idx("smem_iterator_B", i) + helper.var_idx("_(shared_storage.sharedStorage", i) +".operand_B_ref(), thread_idx)" + final
-        func_code +=  "    " + "int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);\n"
-        func_code +=  "    " + "int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);\n"
-        func_code +=  "    " + "int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM;\n"
-        func_code +=  "    " + "int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM;\n"
-        for i in range(self.b2b_num):
-            func_code +=  "    " + helper.var_idx("int tile_offset_k", i) + helper.var_idx(" = Base::kWarpGemmIterations", i) + " * warp_idx_k;\n"
-        func_code +=  "    " + "this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, tile_offset_k0});\n"
-        for i in range(self.b2b_num):
-            func_code +=  "    " + helper.var_idx("this->warp_tile_iterator_B", i) + helper.var_idx("_.add_tile_offset({tile_offset_k", i) + ", warp_idx_n});\n"
-        func_code += "}\n"
-        return func_code
-    def gen_member_func(self, first_use_1stage):
-        code = "public:\n"
-        code += self.gen_operator(first_use_1stage)
-        code += self.gen_construct_func()
-        return code
-    def gen_code(self, first_use_1stage):
-        def gen_template_args(b2b_num):
-            template_param = []
-            template_param.append(("typename", "Shape0"))
-            template_param.append(("typename", "IteratorA0"))
-            template_param.append(("typename", "SmemIteratorA0"))
-            template_param.append(("typename", "IteratorB0"))
-            template_param.append(("typename", "SmemIteratorB0"))
-            for i in range(1, b2b_num):
-                template_param.append(("typename", helper.var_idx("Shape", i)))
-                template_param.append(("typename", helper.var_idx("FragmentIteratorA", i)))
-                template_param.append(("typename", helper.var_idx("IteratorB", i)))
-                template_param.append(("typename", helper.var_idx("SmemIteratorB", i)))
-            template_param.append(("typename", "ElementC"))
-            template_param.append(("typename", "LayoutC"))
-            for i in range(0, b2b_num - 1):
-                template_param.append(("typename", helper.var_idx("OutputOp", i)))
-            for i in range(0, b2b_num - 1):
-                template_param.append(("typename", helper.var_idx("FusedAddBiasEpilogue", i)))
-            for i in range(0, b2b_num):
-                template_param.append(("typename", helper.var_idx("Policy", i)))
-            for i in range(0, b2b_num):
-                template_param.append((int, helper.var_idx("Stage", i)))
-            template_param.append(("typename","TransformA0", "NumericArrayConverter<typename SmemIteratorA0_::Element, typename IteratorA0_::Element, IteratorA0_::Fragment::kElements>"))
-            for i in range(0, b2b_num):
-                cvtr = helper.var_idx("NumericArrayConverter<typename SmemIteratorB", i) + helper.var_idx("_::Element, typename IteratorB", i) + helper.var_idx("_::Element, IteratorB", i) + "_::Fragment::kElements>"
-                template_param.append(("typename", helper.var_idx("TransformB", i), cvtr))
-            template_param.append(("typename", "Enable", "bool"))
-            return template_param
-        template_param = gen_template_args(self.b2b_num)
-        inheritance_code = "public B2bMmaBase<"
-        for i in range(self.b2b_num):
-            inheritance_code += helper.var_idx("Shape", i) + "_, "
-        for i in range(self.b2b_num):
-            inheritance_code += helper.var_idx("Policy", i) + "_, "
-        for i in range(self.b2b_num - 1):
-            inheritance_code += helper.var_idx("Stage", i) + "_, "
-        inheritance_code += helper.var_idx("Stage", self.b2b_num - 1) + "_"
-        inheritance_code += ">"
-        code_body = ""
-        using_code= self.gen_using()
-        func_code = self.gen_member_func(first_use_1stage)
-        code_body = using_code + func_code
-        class_code = gen_ir.gen_template_class(self.gen_class_name, template_param, code_body, inheritance_code = inheritance_code)
-        code = self.gen_include_header()
-        code += gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", class_code)))
-        # print(code)
-        return code
-class gen_b2b_mma_base:
-    """Generates the C++ source text of the ``B2bMmaBase`` class template.
-
-    ``gen_code`` is the entry point; the other methods each return one piece
-    of the class body as a string.
-    """
-    def __init__(self, template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root):
-        """Store the generator configuration.
-
-        Args:
-            template_param: stored as-is; not read by any method of this class.
-            gen_class_name: stored as-is; not read by any method of this class.
-            b2b_num: number of back-to-back GEMMs to generate members for.
-            cutlass_deps_root: prefix placed in front of every ``cutlass/...`` include path.
-            project_root: stored as-is; not read by any method of this class.
-        """
-        self.gen_class_name = gen_class_name
-        self.template_param = template_param
-        self.b2b_num = b2b_num
-        self.cutlass_deps_root = cutlass_deps_root
-        self.project_root = project_root
-    def gen_include_header(self):
-        """Return ``#pragma once`` followed by the CUTLASS ``#include`` lines.
-
-        Every include path is prefixed with ``self.cutlass_deps_root``.
-        """
-        code = '''
-#pragma once
-#include \"{cutlass_dirs}cutlass/aligned_buffer.h\"
-#include \"{cutlass_dirs}cutlass/arch/memory.h\"
-#include \"{cutlass_dirs}cutlass/array.h\"
-#include \"{cutlass_dirs}cutlass/cutlass.h\"
-#include \"{cutlass_dirs}cutlass/gemm/gemm.h\"
-#include \"{cutlass_dirs}cutlass/matrix_shape.h\"
-#include \"{cutlass_dirs}cutlass/numeric_types.h\"\n'''.format(cutlass_dirs=self.cutlass_deps_root)
-        return code
-    def gen_shared_storage(self):
-        """Return the fixed C++ text of the ``SharedStorage`` class template.
-
-        The text does not depend on any instance state. Source lines that end
-        in a bare backslash without a preceding ``\\n`` are joined to the next
-        line in the emitted text.
-        """
-        code = \
-" template< \n\
-    typename Shape_,\n\
-    typename Policy_,\n\
-    int ThisStage_\n\
->\n\
-class SharedStorage {\n\
-public:\n\
-    using Shape = Shape_;\n\
-    using Policy = Policy_;\n\
-    static int const ThisStage = ThisStage_;\n\
-    using Operator = typename Policy::Operator;\n\
-    \
-    using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>;\n\
-    \
-    /// Tensor reference to the B operand \n\
-    using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>;\n\
-\n\
-    /// Shape of the A matrix operand in shared memory \n\
-    using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow,\n\
-                               Shape::kK * ThisStage +\n\
-                                   Policy::SmemPaddingA::kColumn>;\n\
-\n\
-    /// Shape of the B matrix operand in shared memory\n\
-    using ShapeB =\n\
-        MatrixShape<Shape::kK * ThisStage + Policy::SmemPaddingB::kRow,\n\
-                    Shape::kN + Policy::SmemPaddingB::kColumn>;\n\
-\n\
-   public:\n\
-\n\
-    /// Buffer for A operand\n\
-    AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A;\n\
-\n\
-    /// Buffer for B operand\n\
-    AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B;\n\
-\n\
-   public:\n\
-\n\
-    /// Returns a layout object for the A matrix\n\
-    CUTLASS_DEVICE\n\
-    static typename Operator::LayoutA LayoutA() {\n\
-      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});\n\
-    }\n\
-\n\
-    /// Returns a layout object for the B matrix\n\
-    CUTLASS_HOST_DEVICE\n\
-    static typename Operator::LayoutB LayoutB() {\n\
-      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});\n\
-    }\n\
-\n\
-    /// Returns a TensorRef to the A operand\n\
-    CUTLASS_HOST_DEVICE\n\
-    TensorRefA operand_A_ref() {\n\
-      return TensorRefA{operand_A.data(), LayoutA()};\n\
-    }\n\
-\n\
-    /// Returns a TensorRef to the B operand\n\
-    CUTLASS_HOST_DEVICE\n\
-    TensorRefB operand_B_ref() {\n\
-      return TensorRefB{operand_B.data(), LayoutB()};\n\
-    }\n\
-    CUTLASS_HOST_DEVICE\n\
-    void * get_B_Shared_ptr() {\n\
-      return operand_B.data();\n\
-    }\n\
-  };\n"
-        return code
-    def gen_using_and_misc(self, b2b_num):
-        """Return the alias, constant and shared-storage declarations of the class body.
-
-        In order, the returned text contains: the ``Operator<i>``, ``WarpGemm<i>``
-        and ``WarpCount<i>`` aliases, the ``kWarpGemmIterations<i>`` constants,
-        the ``SharedStorage`` template, one ``SharedStorage<i>`` alias per GEMM,
-        the ``B2bMmaSharedStorage`` union and one ``void * C<i>_smm_ptr`` member
-        for each of the first ``b2b_num - 1`` GEMMs.
-
-        Args:
-            b2b_num: number of GEMMs to emit declarations for.
-        """
-        # presumably helper.var_idx(text, i) returns text with the index appended; verify against helper.
-        code_using = ""
-        for i in range(b2b_num):
-            code_using += "using Operator" +str(i) + " = typename Policy" + str(i) +"::Operator;\n"
-        for i in range(b2b_num):
-            code_using += "using WarpGemm" +str(i) + " = typename Policy" + str(i) +"::Operator::Shape;\n"
-        for i in range(b2b_num):
-            code_using += "using WarpCount" +str(i) + " = GemmShape<"   + helper.var_idx("Shape", i) +"::kM / " + helper.var_idx("WarpGemm", i) +"::kM, "\
-                                                                        + helper.var_idx("Shape", i) +"::kN / " + helper.var_idx("WarpGemm", i) +"::kN, "\
-                                                                        + helper.var_idx("Shape", i) +"::kK / " + helper.var_idx("WarpGemm", i) +"::kK>;\n"
-        code_misc = ""
-        for i in range(b2b_num):
-            code_misc += "static int const " + helper.var_idx("kWarpGemmIterations", i) + " = (" + helper.var_idx("WarpGemm", i) + "::kK / " + helper.var_idx("Operator", i) +"::Policy::MmaShape::kK);\n"
-        code = code_using + code_misc + self.gen_shared_storage()
-        for i in range(b2b_num):
-            code += "using " + helper.var_idx("SharedStorage", i) + " = SharedStorage<" + helper.var_idx("Shape", i) + ", " + helper.var_idx("Policy", i) +", " +  helper.var_idx("Stage", i) + ">;\n"
-        def gen_union_shared_storage(b2b_num):
-            # One union member per GEMM: "SharedStorage<i> sharedStorage<i>;".
-            code = ""
-            for i in range(b2b_num):
-                code += "    " +helper.var_idx("SharedStorage", i) + " " + helper.var_idx("sharedStorage", i) +";\n"
-            return code
-        # NOTE(review): this call uses self.b2b_num while the rest of the method
-        # uses the b2b_num argument; the two agree when called from gen_code.
-        code += "union B2bMmaSharedStorage {\n" + gen_union_shared_storage(self.b2b_num) + "};\n"
-        for i in range(b2b_num - 1):
-            code += helper.var_idx("void * C", i) + "_smm_ptr;\n"
-        return code
-    def gen_protected(self):
-        """Return the ``protected:`` section declaring the warp tile iterators.
-
-        Declares ``warp_tile_iterator_A0_`` and one ``warp_tile_iterator_B<i>_``
-        per GEMM.
-        """
-        code = "\nprotected:\n"
-        code += "typename Operator0::IteratorA warp_tile_iterator_A0_;\n"
-        for i in range(self.b2b_num):
-            code += "typename Operator" +str(i) + "::IteratorB" +" warp_tile_iterator_B" + str(i) + "_;\n"
-        return code
-    def gen_public_member(self):
-        """Return the ``public:`` section containing the ``B2bMmaBase`` constructor.
-
-        The emitted constructor initializes every warp tile iterator from the
-        matching shared-storage operand reference and, in its body, sets each
-        ``C<i>_smm_ptr`` from ``sharedStorage<i>.get_B_Shared_ptr()``.
-        """
-        code = "\npublic:\n"
-        code += "CUTLASS_DEVICE\n"
-        code += \
-        "B2bMmaBase(\n" + \
-        "    B2bMmaSharedStorage & shared_storage,\n" + \
-        "    int thread_idx,\n" + \
-        "    int warp_idx,\n" + \
-        "    int lane_idx\n" + \
-        "):\n" + \
-        " warp_tile_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), lane_idx),\n"
-        for i in range(self.b2b_num):
-            # Every initializer ends with a comma except the last one.
-            final = ",\n"
-            if i == self.b2b_num-1:
-                final = "\n"
-            iterator = " warp_tile_iterator_B" + str(i) + "_"
-            shared_storage = "shared_storage.sharedStorage" + str(i) + ".operand_B_ref()"
-            code += iterator + "(" + shared_storage + ", lane_idx)" + final
-        code += "{\n"
-        for i in range(self.b2b_num - 1):
-            code += helper.var_idx("    C", i) +  helper.var_idx("_smm_ptr = shared_storage.sharedStorage", i) + ".get_B_Shared_ptr();\n"
-        code += "}\n"
-        return code
-    def gen_code(self):
-        """Return the complete header text for ``B2bMmaBase``.
-
-        The template parameters are ``Shape<i>``, ``Policy<i>`` and ``Stage<i>``
-        for every GEMM; the class is wrapped in the
-        ``cutlass::gemm::threadblock`` namespaces and preceded by the include header.
-        """
-        template_arg = []
-        for i in range(self.b2b_num):
-            template_arg.append(("typename", helper.var_idx("Shape", i)))
-        for i in range(self.b2b_num):
-            template_arg.append(("typename", helper.var_idx("Policy", i)))
-        for i in range(self.b2b_num):
-            # The kind here is the Python type object `int`, not a string.
-            template_arg.append((int, helper.var_idx("Stage", i)))
-        code_body = self.gen_using_and_misc(self.b2b_num)
-        code_body += self.gen_protected()
-        code_body += self.gen_public_member()
-        class_code = gen_ir.gen_template_class("B2bMmaBase", template_arg, code_body)
-        code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", class_code)))
-        return code
-class gen_threadblock:
-    def __init__(self, template_param, gen_class_name, b2b_num, output_dir, cutlass_deps_root, project_root):
-        self.gen_class_name = gen_class_name
-        self.template_param = template_param
-        self.b2b_num = b2b_num
-        self.file_dir = output_dir + "/threadblock/"
-        self.cutlass_deps_root = cutlass_deps_root
-        self.project_root = project_root
-        self.gen_b2b_mma_base = gen_b2b_mma_base(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
-        self.gen_b2b_mma_pipelined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
-        self.gen_default_b2b_mma = gen_default_b2b_mma(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root)
-    def gen_code(self, first_use_1stage):
-        base_code = self.gen_b2b_mma_base.gen_code()
-        print("[INFO]: Gen kernel code [b2b_mma_base.h]output Dir: is ", self.file_dir)
-        with open(self.file_dir + "b2b_mma_base.h", "w+") as f:
-            f.write(base_code)
-        pipeline_code = self.gen_b2b_mma_pipelined.gen_code(first_use_1stage = first_use_1stage)
-        print("[INFO]: Gen kernel code [b2b_mma_pipelined.h]output Dir: is ", self.file_dir)
-        with open(self.file_dir + "b2b_mma_pipelined.h", "w+") as f:
-            f.write(pipeline_code)
-        default_code = self.gen_default_b2b_mma.gen_code()
-        print("[INFO]: Gen kernel code [default_b2b_mma.h]output Dir: is ", self.file_dir)
-        with open(self.file_dir + "default_b2b_mma.h", "w+") as f:
-            f.write(default_code)

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py DELETED Viewed

@@ -1,456 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import helper
-import gen_ir as ir
-class gen_turing_impl:
-    def __init__(self,fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"):
-        self.fuse_gemm_info = fuse_gemm_info
-        self.class_name = gen_class_name
-        self.gen_class_name = gen_class_name + "_turing_impl"
-        self.user_header_file = ""
-        for header in user_header_file:
-            self.user_header_file += "#include \"" + header + "\"\n"
-        self.output_dir = output_dir
-        self.b2b_num = len(fuse_gemm_info)
-        self.gen_turing_unfused = gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
-    def gen_using(self):
-        code_using = "using b2b_gemm = typename cutlass::gemm::device::" + self.class_name + "<cutlass::half_t>;"
-        return code_using + "\n"
-    def gen_initialize(self):
-        code = ""
-        for i in range(self.b2b_num):
-            code_this = ""
-            code_this += helper.var_idx(helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + " alpha", i) + " = " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(1);\n"
-            beta = "(1)"
-            if helper.get_epilogue_add_bias_or_not(self.fuse_gemm_info[i]) is False:
-                beta = "(0)"
-            code_this += helper.var_idx(helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + " beta", i) + " = " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + beta + ";\n"
-            k_str = str(self.fuse_gemm_info[i]['mnk'][2])
-            if i == 0:
-                k_str = "K0"
-            code_this += helper.var_idx("cutlass::gemm::GemmCoord problem_size_", i) + "(M, " + str(self.fuse_gemm_info[i]['mnk'][1]) + ", " + k_str + ");\n"
-            code += code_this
-        code += "typename b2b_gemm::Arguments arguments{\n"
-        for i in range(self.b2b_num):
-            code += "    " + helper.var_idx("problem_size_", i) + ",\n"
-        code += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("A", 0) + "), " + helper.var_idx("problem_size_", 0) + ".k()},\n"
-        for i in range(self.b2b_num):
-            ldmB = str(self.fuse_gemm_info[i]['mnk'][2])
-            if i == 0:
-                ldmB = "K0"
-            if self.fuse_gemm_info[i]['B_format'] is 'Row':
-                ldmB = str(self.fuse_gemm_info[i]['mnk'][1])
-            ldmC = str(helper.get_epilogue_bias_ldm(self.fuse_gemm_info[i]))
-            code += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_tp']) + "*>(" + helper.var_idx("B", i) + "), " + ldmB + "},\n"
-            code += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("C", i) + "), " + ldmC + "},\n"
-        code += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("D", self.b2b_num -1) + "), " + helper.var_idx("problem_size_", self.b2b_num - 1) + ".n()},\n"
-        for i in range(self.b2b_num):
-            code += "    " + "{ " + helper.var_idx("alpha", i) + ", " + helper.var_idx("beta", i)
-            for epilogue_arg in  helper.get_epilogue_args(self.fuse_gemm_info[i]):
-                arg_name = helper.var_idx("Epilogue", i) + "_" +  epilogue_arg[1]
-                code += ", " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(" + str(arg_name) + ")"
-            code += "},\n"
-        code += "    " + "Batch};\n\n"
-        code += "    " "b2b_gemm gemm_op;\n"
-        code += "    " + "gemm_op.initialize(arguments);\n"
-        return code + "\n"
-    def gen_run(self):
-        code = "    " + "gemm_op(stream);\n"
-        return code
-    def gen_wrapper(self):
-        code_body = ""
-        arg_lists = []
-        arg_lists.append(["int", "M"])
-        arg_lists.append(["int", "K0"])
-        arg_lists.append(["int", "Batch"])
-        arg_lists.append(["void*", helper.var_idx("A", 0)])
-        for i in range(self.b2b_num):
-            arg_lists.append(["void*", helper.var_idx("B", i)])
-            arg_lists.append(["void*", helper.var_idx("C", i)])
-            arg_lists.append(["void*", helper.var_idx("D", i)])
-            epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i])
-            acc_tp = helper.get_epilogue_compute_tp(self.fuse_gemm_info[i])
-            for arg in epilogue_args:
-                arg_tp = arg[0]
-                arg_name = helper.var_idx("Epilogue", i) + "_" +  arg[1]
-                arg_lists.append([arg_tp, arg_name])
-        if self.b2b_num == 1:
-            code_body += self.gen_turing_unfused.gen_using(False)  #False -> Turing, True -> Volta
-            code_body += self.gen_turing_unfused.gen_initialize()
-            code_body += self.gen_turing_unfused.gen_run()
-        else:
-            code_body += self.gen_using()
-            code_body += self.gen_initialize()
-            code_body += self.gen_run()
-        code = ir.gen_func(self.gen_class_name, arg_lists, code_body)
-        return code
-    def gen_code(self):
-        code = self.gen_wrapper()
-        helper.write_2_headfile("turing_impl.h", self.output_dir, self.user_header_file + "\n" + code)
-class gen_volta_turing_fuse_act_impl:
-    def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"):
-        self.fuse_gemm_info = fuse_gemm_info
-        self.gen_class_name = gen_class_name + "_volta_impl"
-        self.user_header_file = ""
-        for header in user_header_file:
-            self.user_header_file +=  "#include \"" + header + "\"\n"
-        self.output_dir = output_dir
-        self.b2b_num = len(fuse_gemm_info)
-    def perf_tiling(self, layer_mnk):
-        mnk = layer_mnk[:]
-        block_tile = mnk[:]
-        block_tile[2] = 32 # force the K tile to be 32
-        # M tile gen
-        block_tile[0] = 32
-        # N tile gen
-        if mnk[1] > 128:
-            block_tile[1] = 256
-        elif mnk[1] > 64:
-            block_tile[1] = 128
-        elif mnk[1] > 32:
-            block_tile[1] = 64
-        else :
-            block_tile[1] = 32
-        warp_tile = block_tile[:]
-        if block_tile[1] == 256:
-            warp_tile[1] = 64
-        elif block_tile[1] == 128:
-            warp_tile[1] = 32
-        elif block_tile[1] == 64:
-            warp_tile[1] = 32
-        else :
-            warp_tile[1] = 32
-        warp_tile[0] = 32
-        return block_tile, warp_tile
-    def process_epilogue(self, epilogue_tp, n, C_tp, Acc_tp):
-        epilogue_setted_type = epilogue_tp
-        cutlass_epilogue_name = "LinearCombinationRelu"
-        if epilogue_setted_type.lower() == 'leakyrelu':
-            cutlass_epilogue_name = "LinearCombinationLeakyRelu"
-        elif epilogue_setted_type.lower() == 'identity':
-            cutlass_epilogue_name = "LinearCombination"
-        n_mod_8 = n % 4
-        N_align_elements = 1
-        if n_mod_8 == 0:
-            N_align_elements = 8
-        elif n_mod_8 == 4:
-            N_align_elements = 4
-        elif n_mod_8 == 2 or n_mod_8 == 6:
-            N_align_elements = 2
-        epilogue_str = "cutlass::epilogue::thread::" + cutlass_epilogue_name+ "<" + C_tp + ", " + str(N_align_elements) + ", " + Acc_tp + ", " + Acc_tp + ">"
-        return epilogue_str
-    def gen_using(self, volta = True):
-        code_using = ""
-        volta_arch = "cutlass::arch::Sm70"
-        volta_tc = "cutlass::gemm::GemmShape<8, 8, 4>"
-        turing_arch = "cutlass::arch::Sm75"
-        turing_tc = "cutlass::gemm::GemmShape<16, 8, 8>"
-        arch = ""
-        tc = ""
-        if volta:
-            arch = volta_arch
-            tc = volta_tc
-        else:
-            arch = turing_arch
-            tc = turing_tc
-        for i in range(self.b2b_num):
-            k = self.fuse_gemm_info[i]['mnk'][2]
-            k_mod_8 = k % 4
-            ab_ldm = 1
-            if k_mod_8 == 0:
-                ab_ldm = 8
-            elif k_mod_8 == 4:
-                ab_ldm = 4
-            elif k_mod_8 == 2 or k_mod_8 == 6:
-                ab_ldm = 2
-            block_tile, warp_tile = self.perf_tiling(self.fuse_gemm_info[i]['mnk'])
-            this_gemm_config =  helper.var_idx("using Gemm", i) + " = cutlass::gemm::device::GemmBatched<\n"
-            this_gemm_config += "    " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + ",\n"
-            this_gemm_config += "    " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_format']) + ",\n"
-            this_gemm_config += "    " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_tp']) + ",\n"
-            this_gemm_config += "    " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_format']) + ",\n"
-            this_gemm_config += "    " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + ",\n"
-            this_gemm_config += "    " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_format']) + ",\n"
-            this_gemm_config += "    " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + ",\n"
-            this_gemm_config += "    " + "cutlass::arch::OpClassTensorOp,\n"
-            this_gemm_config += "    " + arch + ",\n"
-            this_gemm_config += "    " + "cutlass::gemm::GemmShape<" + str(block_tile[0]) + ", " + str(block_tile[1]) + ", " + str(block_tile[2]) + ">,\n"
-            this_gemm_config += "    " + "cutlass::gemm::GemmShape<" + str(warp_tile[0]) + ", " + str(warp_tile[1]) + ", " + str(warp_tile[2]) + ">,\n"
-            this_gemm_config += "    " + tc + ",\n"
-            this_gemm_config += "    " + self.process_epilogue(helper.get_epilogue_tp(self.fuse_gemm_info[i]), self.fuse_gemm_info[i]['mnk'][1], helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']), helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp'])) + ",\n"
-            this_gemm_config += "    " + "cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,\n"
-            this_gemm_config += "    " + "2,\n"
-            this_gemm_config += "    " + str(ab_ldm) + ",\n"
-            this_gemm_config += "    " + str(ab_ldm) + ">;\n"
-            code_using += this_gemm_config + "\n"
-        return code_using + "\n"
-    def gen_initialize(self):
-        code = ""
-        for i in range(self.b2b_num):
-            code_this = ""
-            N_str = str(self.fuse_gemm_info[i]['mnk'][1])
-            code_this += helper.var_idx(helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + " alpha", i) + " = " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(1);\n"
-            beta = "(1)"
-            if helper.get_epilogue_add_bias_or_not( self.fuse_gemm_info[i]) is False:
-                beta = "(0)"
-            code_this += helper.var_idx(helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + " beta", i) + " = " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + beta + ";\n"
-            k_str = str(self.fuse_gemm_info[i]['mnk'][2])
-            if i == 0:
-                k_str = "K0"
-            code_this += helper.var_idx("cutlass::gemm::GemmCoord problem_size_", i) + "(M, " + str(self.fuse_gemm_info[i]['mnk'][1]) + ", " + k_str + ");\n"
-            code_this += helper.var_idx("typename Gemm", i) + helper.var_idx("::Arguments arguments_", i) + "{\n"
-            code_this += "    " + helper.var_idx("problem_size_", i) + ",\n"
-            ldmA = k_str
-            ldmB = k_str
-            ldmC = str(self.fuse_gemm_info[i]['mnk'][1])
-            ldmBias = str(helper.get_epilogue_bias_ldm(self.fuse_gemm_info[i]))
-            if self.fuse_gemm_info[i]['A_format'] is 'Col':
-                ldmA = "M"
-            if self.fuse_gemm_info[i]['B_format'] is 'Row':
-                ldmB = str(self.fuse_gemm_info[i]['mnk'][1])
-            if self.fuse_gemm_info[i]['C_format'] is 'Col':
-                ldmC = "M"
-            if i == 0:
-                code_this += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("A", i) + "), " + ldmA + "}, " + "M * " + ldmA + ",\n"
-            else:
-                code_this += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['A_tp']) + "*>(" + helper.var_idx("D", i - 1) + "), " + ldmA + "}, " + "M * " + ldmA + ",\n"
-            code_this += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['B_tp']) + "*>(" + helper.var_idx("B", i) + "), " + ldmB + "}, " + N_str + " * " + ldmB + ",\n"
-            M_bias = str(helper.get_epilogue_bias_shape(self.fuse_gemm_info[i])[0])
-            code_this += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("C", i) + "), " + ldmBias + "}, " + M_bias + " * " + N_str + ",\n"
-            code_this += "    " + "{reinterpret_cast<" + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['C_tp']) + "*>(" + helper.var_idx("D", i) + "), " + ldmC + "}, " + "M * " + ldmC + ",\n"
-            code_this += "    " + "{ " + helper.var_idx("alpha", i) + ", " + helper.var_idx("beta", i)
-            for epilogue_arg in  helper.get_epilogue_args(self.fuse_gemm_info[i]):
-                arg_name = helper.var_idx("Epilogue", i) + "_" +  epilogue_arg[1]
-                code_this += ", " + helper.type_2_cutlass_type(self.fuse_gemm_info[i]['Acc_tp']) + "(" + str(arg_name) + ")"
-            code_this += " },\n"
-            code_this += "    " + "Batch};\n"
-            code_this += "    " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n"
-            code_this += "    " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(arguments_", i) + ", nullptr);\n"
-            code += code_this + "\n"
-        return code + "\n"
-    def gen_run(self):
-        code = ""
-        for i in range(self.b2b_num):
-            code_this = ""
-            code_this += "    " + helper.var_idx("gemm_op_", i) + "(stream);\n"
-            code += code_this
-        return code
-    def gen_wrapper(self):
-        code_body = ""
-        arg_lists = []
-        arg_lists.append(["int", "M"])
-        arg_lists.append(["int", "K0"])
-        arg_lists.append(["int", "Batch"])
-        arg_lists.append(["void*", helper.var_idx("A", 0)])
-        for i in range(self.b2b_num):
-            arg_lists.append(["void*", helper.var_idx("B", i)])
-            arg_lists.append(["void*", helper.var_idx("C", i)])
-            arg_lists.append(["void*", helper.var_idx("D", i)])
-            epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i])
-            acc_tp = helper.get_epilogue_compute_tp(self.fuse_gemm_info[i])
-            for arg in epilogue_args:
-                arg_tp = arg[0]
-                arg_name = helper.var_idx("Epilogue", i) + "_" +  arg[1]
-                arg_lists.append([arg_tp, arg_name])
-        code_body += self.gen_using()
-        code_body += self.gen_initialize()
-        code_body += self.gen_run()
-        code = ir.gen_func(self.gen_class_name, arg_lists, code_body)
-        return code
-    def gen_code(self):
-        code = self.gen_wrapper()
-        helper.write_2_headfile("volta_impl.h", self.output_dir, self.user_header_file + "\n" +  code)
-class gen_one_API:
-    def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"):
-        self.fuse_gemm_info = fuse_gemm_info
-        self.gen_class_name = gen_class_name
-        self.user_header_file = ""
-        for header in user_header_file:
-            self.user_header_file += "#include \"" + header + "\"\n"
-        self.output_dir = output_dir
-        self.b2b_num = len(fuse_gemm_info)
-        self.gen_volta = gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
-        self.gen_turing = gen_turing_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
-    def gen_CUTLASS_irrelevant_API(self):
-        code = ""
-        code += "#include <cuda_runtime.h>\n"
-        code += "#include <cassert>\n"
-        param_name = "Fused" + str(self.b2b_num) + "xGemm_"
-        for i in range(self.b2b_num):
-            param_name += str(self.fuse_gemm_info[i]['mnk'][1]) + "_"
-        param_name += "Params"
-        params = ""
-        params += "    " + "int M;\n"
-        params += "    " + "int K0;\n"
-        params += "    " + "int Batch;\n"
-        params += "    " + "const void* A0;\n"
-        for i in range(self.b2b_num):
-            params += "    " + "const void* " + helper.var_idx("B", i) + ";\n"
-            params += "    " + "const void* " + helper.var_idx("C", i) + ";\n"
-            epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i])
-            acc_tp = helper.get_epilogue_compute_tp(self.fuse_gemm_info[i])
-            for arg in epilogue_args:
-                arg_tp = arg[0]
-                arg_name = helper.var_idx("Epilogue", i) + "_" +  arg[1]
-                params += "    " + arg_tp + " " + arg_name + ";\n"
-            params += "    " + "void* " + helper.var_idx("D", i) + ";\n"
-        code += ir.gen_struct(param_name, params)
-        code += "using Param = " + param_name + ";\n"
-        code += "void one_api( const  Param & param, int sm, cudaStream_t stream);\n"
-        return code
-    def gen_one_api(self):
-        code = ""
-        code += "/* Auto Generated code - Do not edit.*/\n"
-        code += "#include \"cutlass_irrelevant.h\"\n"
-        code += "#include \"api.h\"\n"
-        code += "void one_api( const  Param & param, int sm, cudaStream_t stream) {\n"
-        code += "    " + "if (sm == 70) \n"
-        code += "    " + "    " + self.gen_class_name + "_volta_impl(param.M, param.K0, param.Batch, const_cast<void*>(param.A0), "
-        for i in range(self.b2b_num):
-            code += helper.var_idx("const_cast<void*>(param.B", i) + "), "
-            code += helper.var_idx("const_cast<void*>(param.C", i) + "), "
-            code += helper.var_idx("param.D", i) + ", "
-            epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i])
-            for arg in epilogue_args:
-                arg_name = helper.var_idx("Epilogue", i) + "_" +  arg[1]
-                code += "param." + arg_name + ", "
-        code += "stream);\n"
-        code += "    " + "else if(sm >= 75) \n"
-        code += "    " + "    " + self.gen_class_name + "_turing_impl(param.M, param.K0, param.Batch, const_cast<void*>(param.A0), "
-        for i in range(self.b2b_num):
-            code += helper.var_idx("const_cast<void*>(param.B", i) + "), "
-            code += helper.var_idx("const_cast<void*>(param.C", i) + "), "
-            code += helper.var_idx("param.D", i) + ", "
-            epilogue_args = helper.get_epilogue_args(self.fuse_gemm_info[i])
-            for arg in epilogue_args:
-                arg_name = helper.var_idx("Epilogue", i) + "_" +  arg[1]
-                code += "param." + arg_name + ", "
-        code += "stream);\n"
-        code += "    " + "else assert(0);\n"
-        code += "}\n"
-        return code
-    def gen_code(self):
-        turing_code = self.gen_turing.gen_wrapper()
-        volta_code = self.gen_volta.gen_wrapper()
-        cutlass_irrelevant_code = self.gen_CUTLASS_irrelevant_API()
-        one_api_code = self.gen_one_api()
-        with open(self.output_dir + "one_api.cu", "w+") as f:
-            f.write(one_api_code)
-        helper.write_2_headfile("cutlass_irrelevant.h", self.output_dir, cutlass_irrelevant_code)
-        helper.write_2_headfile("api.h", self.output_dir, self.user_header_file + "\n" +  turing_code + volta_code)

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py DELETED Viewed

@@ -1,92 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import helper
-import gen_ir as ir
-import gen_turing_and_volta as gen_basic
-class gen_verify:
-    def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir = "../"):
-        self.fuse_gemm_info = fuse_gemm_info
-        self.name = gen_class_name + "_verify"
-        self.b2b_num = len(fuse_gemm_info)
-        self.params = []
-        self.user_header_file = ""
-        for header in user_header_file:
-            self.user_header_file += "#include \"" + header + "\"\n"
-        self.separate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir)
-        self.gen_params()
-        self.output_dir = output_dir
-    def gen_code(self):
-        code = ""
-        code += self.user_header_file
-        code += self.separate_cutlass.gen_using(False)  #False -> Turing, True -> Volta
-        code_body = ""
-        for i in range(self.b2b_num):
-            code_body += "    " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n"
-            code_body += "    " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(Arguments_", i) + ", nullptr);\n"
-        code_body += self.separate_cutlass.gen_run()
-        code += ir.gen_func(self.name, self.params, code_body)
-        helper.write_2_headfile("cutlass_verify.h", self.output_dir, code)
-    def gen_params(self):
-        for i in range(self.b2b_num):
-            self.params.append(
-                (
-                    helper.var_idx("typename Gemm", i)+ "::Arguments",
-                    helper.var_idx("Arguments_", i)
-                )
-            )
-    def get_params(self, declaration = True):
-        code = ""
-        if declaration:
-            for param in self.params:
-                code += param[0] + " " + param[1] + ";\n"
-        return code
-    def gen_initialize():
-        code = ""
-        initialize_code = self.separate_cutlass.gen_initialize()
-        code = ir.gen_func("initialize", [[]])

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py DELETED Viewed

@@ -1,135 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-def type_2_cutlass_type(input_type = "fp16"):
-    # float point type
-    if input_type == "fp32":
-        return "float"
-    if input_type == "bf16":
-        return "cutlass::bfloat16_t"
-    if input_type == "fp16":
-        return "cutlass::half_t"
-    # integer type
-    if(input_type == "int32"):
-        return "int32_t"
-    if(input_type == "int8"):
-        return "int8_t"
-    if input_type == 'Row':
-        return 'cutlass::layout::RowMajor'
-    if input_type == 'Col':
-        return 'cutlass::layout::ColumnMajor'
-def cvt_2_cutlass_shape(gemm_shape):
-    # gemm shape
-    if len(gemm_shape) == 3:
-        val = "cutlass::gemm::GemmShape<"  \
-                                        + str(gemm_shape[0]) + ", " \
-                                        + str(gemm_shape[1]) + ", " \
-                                        + str(gemm_shape[2]) + ">"
-        return val
-def write_2_headfile(filename, file_dir, string):
-    with open(file_dir + filename, 'w') as f:
-        f.write("/* Auto Generated code - Do not edit.*/\n\n\n#pragma once\n" + string)
-def var_idx(variable, index):
-    return variable + str(index)
-def list_2_string(input_list, ):
-    rtn_string = ""
-    cnt = 0
-    for element in input_list:
-        final = ", \n"
-        if cnt == len(input_list) - 1:
-            final = "\n"
-        cnt += 1
-        rtn_string += str(element) + final
-    return rtn_string
-def get_epilogue_info(layer_info):
-    return layer_info['epilogue']
-def get_epilogue_tp(layer_info):
-    epilogue_info = get_epilogue_info(layer_info)
-    return epilogue_info['tp']
-def get_epilogue_add_bias_or_not(layer_info):
-    epilogue_info = get_epilogue_info(layer_info)
-    return epilogue_info['bias']['addbias']
-def get_epilogue_add_bias_tp(layer_info):
-    epilogue_info = get_epilogue_info(layer_info)
-    return epilogue_info['bias']['bias_tp']
-def get_epilogue_args(layer_info):
-    epilogue_info = get_epilogue_info(layer_info)
-    return epilogue_info['args']
-def get_epilogue_bias_shape(layer_info):
-    bias_tp = get_epilogue_add_bias_tp(layer_info).lower()
-    mn_shape = layer_info['mnk'][:-1]
-    if bias_tp == 'mat':
-        mn_shape[0] = 'M'
-        return mn_shape
-    elif bias_tp == 'vec':
-        mn_shape[0] = 1
-        return mn_shape
-    else:
-        assert(0)
-def get_epilogue_bias_ldm(layer_info):
-    bias_tp = get_epilogue_add_bias_tp(layer_info).lower()
-    mn_shape = layer_info['mnk'][:-1]
-    c_layout = layer_info['C_format'].lower()
-    if c_layout != 'row':
-        assert(0)
-    if bias_tp == 'mat':
-        return mn_shape[1]
-    elif bias_tp == 'vec':
-        return 0
-    else:
-        assert(0)
-def get_epilogue_compute_tp(layer_info):
-    return layer_info['Acc_tp']

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py DELETED Viewed

@@ -1,67 +0,0 @@
-#################################################################################################
-#
-# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-#################################################################################################
-import os
-class replace_fix_impl:
-    def __init__(self, src_dir, dst_dir, cutlass_deps_root):
-        self.src_dir = src_dir
-        self.dst_dir = dst_dir
-        self.cutlass_deps_root = cutlass_deps_root
-    def gen_code(self):
-        for sub_dir in os.walk(self.src_dir):
-            files_in_sub_dir = sub_dir[2]
-            src_dirs = sub_dir[0]
-            output_dirs = self.dst_dir + sub_dir[0][len(self.src_dir):]
-            if not os.path.exists(output_dirs):
-                os.mkdir(output_dirs)
-            for f in files_in_sub_dir:
-                with open(src_dirs +"/" + f, 'r') as current_file:
-                    output_lines = []
-                    lines = current_file.readlines()
-                    for line in lines:
-                        if(len(line) >= len("#include \"cutlass") and line[:len("#include \"cutlass")] == "#include \"cutlass"):
-                            new_line = "#include \"" + self.cutlass_deps_root + line[len("#include \""):]
-                            # print(new_line)
-                            output_lines.append(new_line)
-                        else:
-                            output_lines.append(line)
-                    with open(output_dirs + "/"  + f, "w+") as dest_file:
-                        dest_file.writelines(output_lines)

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h DELETED Viewed

@@ -1,292 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <cuda_fp16.h>
-// Generic device-side sum; relies on T providing operator+.
-template <typename T>
-__device__
-T add(T const & lhs, T const & rhs){
-    T const sum = lhs + rhs;
-    return sum;
-}
-// half2 specialization: forwards to the __hadd2 intrinsic.
-template <>
-__device__
-half2 add(half2 const & lhs, half2 const & rhs){
-    half2 const sum = __hadd2(lhs, rhs);
-    return sum;
-}
-// ReLU functor with a scalar overload for T and a packed overload for half2.
-template <typename T>
-struct RELU{
-    // Scalar path: returns a when a > 0, otherwise T(0).
-    __device__
-    T operator()(T const & a){
-        return  a > T(0) ? a : T(0);
-    }
-    // Packed path: both lanes are converted to float, clamped at zero and
-    // rounded back to half2. A leftover debug printf used to sit here; its
-    // condition (a lane < 0 after clamping) could never be true, so it was removed.
-    __device__
-    half2 operator()(half2 const & a){
-        float2 a_fp32x2 = __half22float2(a);
-        a_fp32x2.x = a_fp32x2.x > 0.f ? a_fp32x2.x : 0.f;
-        a_fp32x2.y = a_fp32x2.y > 0.f ? a_fp32x2.y : 0.f;
-        return __float22half2_rn(a_fp32x2);
-    }
-};
-// Leaky-ReLU functor: positive inputs pass through, the rest are multiplied by `scale`.
-template <typename T>
-struct LEAKY_RELU{
-    // Scalar path: returns a when a > 0, otherwise scale * a.
-    __device__
-    T operator()(T const & a, T const & scale = half(1)){
-        return  a > T(0) ? a : scale * a;
-    }
-    // Packed path: builds a per-lane multiplier and applies it with one multiply.
-    // The positive mask uses a strict comparison (__hgt2) so that it and the
-    // non-positive mask (__hle2) never overlap. With the former __hge2 both
-    // masks fired for a lane equal to zero and the multiplier became scale + 1,
-    // which disagrees with the scalar path above.
-    __device__
-    half2 operator()(half2 const & a, half const & scale = half(1)){
-        half2 const zero = __half2half2(half(0));
-        half2 const positive = __hgt2(a, zero);
-        half2 const non_positive = __hle2(a, zero);
-        half2 const scale_f16x2 = __half2half2(scale);
-        // multiplier = non_positive * scale + positive
-        half2 const multiplier = __hfma2(non_positive, scale_f16x2, positive);
-        return __hmul2(a, multiplier);
-    }
-};
-// In-place bias add followed by leaky ReLU. blockIdx.x indexes a run of N
-// elements inside a batch and blockIdx.y indexes the batch. Elements are
-// accessed as half2 when N is even and as half otherwise. With mat_bias the
-// bias is indexed like `inout`; otherwise it is indexed per batch only.
-template <int N, int BLOCKDIM>
-__global__ void leaky_and_activation(half* inout, half* bias, half scale, bool mat_bias){
-    constexpr bool kEvenN = (N & 1) == 0;
-    using Vec = typename std::conditional<kEvenN, half2, half>::type;
-    constexpr int kLanes = sizeof(Vec) / sizeof(half);
-    constexpr int kSteps = (N + (BLOCKDIM * kLanes) - 1 ) / (BLOCKDIM * kLanes);
-    LEAKY_RELU<half> activation;
-    int batch_id = blockIdx.y;
-    int batch_offset = batch_id * gridDim.x * N;
-    for(int step = 0; step < kSteps; ++step){
-        int col = (step * BLOCKDIM + threadIdx.x) * kLanes;
-        if (col >= N)
-            continue;
-        Vec* cell = reinterpret_cast<Vec*>(inout + blockIdx.x * N + col + batch_offset);
-        Vec value = *cell;
-        Vec bias_value;
-        if (mat_bias)
-            bias_value = *reinterpret_cast<Vec*>(bias + blockIdx.x * N + col + batch_offset);
-        else
-            bias_value = *reinterpret_cast<Vec*>(bias + col + batch_id * N);
-        *cell = activation(add(value, bias_value), scale);
-    }
-}
-// In-place leaky ReLU without a bias term; same indexing as the bias overload.
-template <int N, int BLOCKDIM>
-__global__ void leaky_and_activation(half* inout, half scale){
-    constexpr bool kEvenN = (N & 1) == 0;
-    using Vec = typename std::conditional<kEvenN, half2, half>::type;
-    constexpr int kLanes = sizeof(Vec) / sizeof(half);
-    constexpr int kSteps = (N + (BLOCKDIM * kLanes) - 1 ) / (BLOCKDIM * kLanes);
-    int batch_id = blockIdx.y;
-    int batch_offset = batch_id * gridDim.x * N;
-    LEAKY_RELU<half> activation;
-    for(int step = 0; step < kSteps; ++step){
-        int col = (step * BLOCKDIM + threadIdx.x) * kLanes;
-        if (col >= N)
-            continue;
-        Vec* cell = reinterpret_cast<Vec*>(inout + blockIdx.x * N + col + batch_offset);
-        Vec value = *cell;
-        *cell = activation(value, scale);
-    }
-}
-// Host launcher: grid is (m, b) with BLOCKDIM threads per block. A null
-// `bias` selects the bias-free kernel.
-template <int N, int BLOCKDIM>
-void leaky_and_activation(half* inout, half* bias, int m, int b, half scale, bool mat_bias){
-    dim3 grid(m, b);
-    if (bias != nullptr){
-        leaky_and_activation<N, BLOCKDIM><<<grid , BLOCKDIM>>>(inout, bias, scale, mat_bias);
-        return;
-    }
-    leaky_and_activation<N, BLOCKDIM><<<grid , BLOCKDIM>>>(inout, scale);
-}
-// In-place bias add followed by ReLU. blockIdx.x indexes a run of N elements
-// inside a batch and blockIdx.y indexes the batch. Elements are accessed as
-// half2 when N is even and as half otherwise. With mat_bias the bias is
-// indexed like `inout`; otherwise it is indexed per batch only.
-template <int N, int BLOCKDIM>
-__global__ void relu_and_activation(half* inout, half* bias, bool mat_bias){
-    constexpr bool kEvenN = (N & 1) == 0;
-    using Vec = typename std::conditional<kEvenN, half2, half>::type;
-    constexpr int kLanes = sizeof(Vec) / sizeof(half);
-    constexpr int kSteps = (N + (BLOCKDIM * kLanes) - 1 ) / (BLOCKDIM * kLanes);
-    RELU<half> activation;
-    int batch_id = blockIdx.y;
-    int batch_offset = batch_id * gridDim.x * N;
-    for(int step = 0; step < kSteps; ++step){
-        int col = (step * BLOCKDIM + threadIdx.x) * kLanes;
-        if (col >= N)
-            continue;
-        Vec* cell = reinterpret_cast<Vec*>(inout + blockIdx.x * N + col + batch_offset);
-        Vec value = *cell;
-        Vec bias_value;
-        if (mat_bias)
-            bias_value = *reinterpret_cast<Vec*>(bias + blockIdx.x * N + col + batch_offset);
-        else
-            bias_value = *reinterpret_cast<Vec*>(bias + col + batch_id * N);
-        *cell = activation(add(value, bias_value));
-    }
-}
-// In-place ReLU without a bias term; same indexing as the bias overload.
-template <int N, int BLOCKDIM>
-__global__ void relu_and_activation(half* inout){
-    constexpr bool kEvenN = (N & 1) == 0;
-    using Vec = typename std::conditional<kEvenN, half2, half>::type;
-    constexpr int kLanes = sizeof(Vec) / sizeof(half);
-    constexpr int kSteps = (N + (BLOCKDIM * kLanes) - 1 ) / (BLOCKDIM * kLanes);
-    int batch_id = blockIdx.y;
-    int batch_offset = batch_id * gridDim.x * N;
-    RELU<half> activation;
-    for(int step = 0; step < kSteps; ++step){
-        int col = (step * BLOCKDIM + threadIdx.x) * kLanes;
-        if (col >= N)
-            continue;
-        Vec* cell = reinterpret_cast<Vec*>(inout + blockIdx.x * N + col + batch_offset);
-        Vec value = *cell;
-        *cell = activation(value);
-    }
-}
-// Host launcher: grid is (m, b) with BLOCKDIM threads per block. A null
-// `bias` selects the bias-free kernel.
-template <int N, int BLOCKDIM>
-void relu_and_activation(half* inout, half* bias, int m, int b, bool mat_bias){
-    dim3 grid(m, b);
-    if (bias != nullptr){
-        relu_and_activation<N, BLOCKDIM><<<grid , BLOCKDIM>>>(inout, bias, mat_bias);
-        return;
-    }
-    relu_and_activation<N, BLOCKDIM><<<grid , BLOCKDIM>>>(inout);
-}
-// In-place bias add with no activation. blockIdx.x indexes a run of N elements
-// inside a batch and blockIdx.y indexes the batch. Elements are accessed as
-// half2 when N is even and as half otherwise. With mat_bias the bias is
-// indexed like `inout`; otherwise it is indexed per batch only.
-template <int N, int BLOCKDIM>
-__global__ void identity_and_activation(half* inout, half* bias, bool mat_bias){
-    constexpr bool kEvenN = (N & 1) == 0;
-    using Vec = typename std::conditional<kEvenN, half2, half>::type;
-    constexpr int kLanes = sizeof(Vec) / sizeof(half);
-    constexpr int kSteps = (N + (BLOCKDIM * kLanes) - 1 ) / (BLOCKDIM * kLanes);
-    int batch_id = blockIdx.y;
-    int batch_offset = batch_id * gridDim.x * N;
-    for(int step = 0; step < kSteps; ++step){
-        int col = (step * BLOCKDIM + threadIdx.x) * kLanes;
-        if (col >= N)
-            continue;
-        Vec* cell = reinterpret_cast<Vec*>(inout + blockIdx.x * N + col + batch_offset);
-        Vec value = *cell;
-        Vec bias_value;
-        if (mat_bias)
-            bias_value = *reinterpret_cast<Vec*>(bias + blockIdx.x * N + col + batch_offset);
-        else
-            bias_value = *reinterpret_cast<Vec*>(bias + col + batch_id * N);
-        *cell = (add(value, bias_value));
-    }
-}
-// Identity without bias: each in-range element is read and written back unchanged.
-template <int N, int BLOCKDIM>
-__global__ void identity_and_activation(half* inout){
-    constexpr bool kEvenN = (N & 1) == 0;
-    using Vec = typename std::conditional<kEvenN, half2, half>::type;
-    constexpr int kLanes = sizeof(Vec) / sizeof(half);
-    constexpr int kSteps = (N + (BLOCKDIM * kLanes) - 1 ) / (BLOCKDIM * kLanes);
-    int batch_id = blockIdx.y;
-    int batch_offset = batch_id * gridDim.x * N;
-    for(int step = 0; step < kSteps; ++step){
-        int col = (step * BLOCKDIM + threadIdx.x) * kLanes;
-        if (col >= N)
-            continue;
-        Vec* cell = reinterpret_cast<Vec*>(inout + blockIdx.x * N + col + batch_offset);
-        Vec value = *cell;
-        *cell = (value);
-    }
-}
-// Host launcher: grid is (m, b) with BLOCKDIM threads per block. A null
-// `bias` selects the bias-free kernel.
-template <int N, int BLOCKDIM>
-void identity_and_activation(half* inout, half* bias, int m, int b, bool mat_bias){
-    dim3 grid(m, b);
-    if (bias != nullptr){
-        identity_and_activation<N, BLOCKDIM><<<grid , BLOCKDIM>>>(inout, bias, mat_bias);
-        return;
-    }
-    identity_and_activation<N, BLOCKDIM><<<grid , BLOCKDIM>>>(inout);
-}

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/44_multi_gemm_ir_and_codegen/utils.h DELETED Viewed

@@ -1,94 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#define TI(tag) \
-    cudaEvent_t _event_start_ ##tag; \
-    cudaEvent_t _event_end_ ##tag; \
-    float _event_time_ ##tag; \
-    cudaEventCreate(& _event_start_ ##tag); \
-    cudaEventCreate(& _event_end_ ##tag); \
-    cudaEventRecord(_event_start_ ##tag);
-#define TO(tag, str, times) \
-    cudaEventRecord(_event_end_ ##tag); \
-    cudaEventSynchronize(_event_end_ ##tag); \
-    cudaEventElapsedTime(&_event_time_ ##tag, _event_start_ ##tag, _event_end_ ##tag); \
-    float _event_time_once_ ##tag = _event_time_ ##tag / times; \
-    printf("%20s:\t %10.3fus\t", str, _event_time_once_ ##tag * 1000); \
-    cudaDeviceSynchronize(); \
-    printf("%20s string: %s\n",str, cudaGetErrorString(cudaGetLastError()));
-template<typename T>
-struct memory_unit{
-    T* host_ptr;
-    T* device_ptr;
-    int size_bytes;
-    int elements;
-    void h2d(){
-        cudaMemcpy(device_ptr, host_ptr, size_bytes, cudaMemcpyHostToDevice);
-    }
-    void d2h(){
-        cudaMemcpy(host_ptr, device_ptr, size_bytes, cudaMemcpyDeviceToHost);
-    }
-    void free_all(){
-        free(host_ptr);
-        cudaFree(device_ptr);
-    }
-    memory_unit(int elements_): size_bytes(elements_ * sizeof(T)), elements(elements_){
-        host_ptr = (T*) malloc(elements_ * sizeof(T));
-        cudaMalloc((void**)&device_ptr, elements_ * sizeof(T));
-    }
-    void init(int abs_range = 1){
-        for(int i = 0; i < elements; i++){
-            host_ptr[i] = T(rand() % 100 / float(100)  * 2 * abs_range - abs_range);
-        }
-        h2d();
-    }
-};
-template<typename T>
-int check_result(T * a, T * b, int N){
-    int cnt = 0;
-    for(int i = 0; i < N; i ++){
-        float std = float(a[i]);
-        float my = float(b[i]);
-        if(abs(std - my) / abs(std) > 1e-2)
-        {
-            // printf("my: %f , std: %f\n", my, std);
-            cnt++;
-        }
-    }
-    printf("total err: %d / %d\n", cnt, N);
-    return cnt;
-}

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/device/dual_gemm.h DELETED Viewed

@@ -1,499 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Performs a dual gemm in one fused kernel:
-```
-D0 = epilogue0(X @ B0, C0)
-D1 = epilogue1(X @ B1, C1)
-D2 = element_wise(D0, D1)
-```
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/device_kernel.h"
-#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
-#include "cutlass/gemm/device/default_gemm_configuration.h"
-#include "cutlass/gemm/threadblock/default_mma.h"
-#include "cutlass/epilogue/thread/linear_combination_relu.h"
-#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
-#include "../kernel/dual_gemm.h"
-#include "../dual_gemm_common.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace gemm {
-namespace device {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-    /// Element type for A matrix operand
-    typename ElementA_,
-    /// Layout type for A matrix operand
-    typename LayoutA_,
-    /// Element type for B matrix operand
-    typename ElementB_,
-    /// Layout type for B0 matrix operand
-    typename LayoutB0_,
-    /// Layout type for B1 matrix operand
-    typename LayoutB1_,
-    /// Element type for C and D matrix operands
-    typename ElementC_,
-    /// Layout type for C and D matrix operands
-    typename LayoutC_,
-    /// Element type for internal accumulation
-    typename ElementAccumulator_,
-    /// Operator class tag
-    typename OperatorClass_,
-    /// Tag indicating architecture to tune for
-    typename ArchTag_,
-    /// Threadblock-level tile size (concept: GemmShape)
-    typename ThreadblockShape_,
-    /// Warp-level tile size (concept: GemmShape)
-    typename WarpShape_,
-    /// Instruction-level tile size (concept: GemmShape)
-    typename InstructionShape_,
-    /// Epilogue output operator
-    typename EpilogueOutputOp0_,
-    typename EpilogueOutputOp1_,
-    typename EpilogueOutputOp2_,
-    /// Threadblock-level swizzling operator
-    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
-    /// Number of stages used in the pipelined mainloop
-    int Stages =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kStages,
-    bool StoreD0 = true,
-    bool StoreD1 = true,
-    /// If true, kernel supports split-K with serial reduction
-    bool SplitKSerial = false,
-    /// Access granularity of A matrix in units of elements
-    int AlignmentA =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentA,
-    /// Access granularity of B matrix in units of elements
-    int AlignmentB =
-        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
-                                 ElementC_, ElementAccumulator_>::kAlignmentB,
-    /// Operation performed by GEMM
-    typename Operator_ = typename DefaultGemmConfiguration<
-        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator>
-class DualGemm {
- public:
-  using ElementA = ElementA_;
-  using LayoutA = LayoutA_;
-  using TensorRefA = TensorRef<ElementA const, LayoutA>;
-  using ElementB = ElementB_;
-  using LayoutB0 = LayoutB0_;
-  using LayoutB1 = LayoutB1_;
-  using TensorRefB0 = TensorRef<ElementB const, LayoutB0>;
-  using TensorRefB1 = TensorRef<ElementB const, LayoutB1>;
-  using ElementC = ElementC_;
-  using LayoutC = LayoutC_;
-  using TensorRefC = TensorRef<ElementC const, LayoutC>;
-  using TensorRefD = TensorRef<ElementC, LayoutC>;
-  using ElementAccumulator = ElementAccumulator_;
-  using OperatorClass = OperatorClass_;
-  using ArchTag = ArchTag_;
-  using ThreadblockShape = ThreadblockShape_;
-  using WarpShape = WarpShape_;
-  using InstructionShape = InstructionShape_;
-  using EpilogueOutputOp0 = EpilogueOutputOp0_;
-  using EpilogueOutputOp1 = EpilogueOutputOp1_;
-  using EpilogueOutputOp2 = EpilogueOutputOp2_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  using Operator = Operator_;
-  static int const kStages = Stages;
-  static int const kAlignmentA = AlignmentA;
-  static int const kAlignmentB = AlignmentB;
-  static int const kAlignmentC = EpilogueOutputOp1::kCount;
-  static bool const kSplitKSerial = SplitKSerial;
-  static bool constexpr kStoreD0 = StoreD0;
-  static bool constexpr kStoreD1 = StoreD1;
-  static ComplexTransform const kTransformA = ComplexTransform::kNone;
-  static ComplexTransform const kTransformB = ComplexTransform::kNone;
-  using LayoutScaleBias = layout::RowMajor;
-  /// Define the kernel
-  /// Define the threadblock-scoped matrix multiply-accumulate
-  static_assert(ArchTag::kMinComputeCapability >= 80, "Only multistage is implemented");
-  static_assert(kStages >= 3, "Only multistage is implemented");
-  using Mma0 = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB0, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, ArchTag,
-      ThreadblockShape, WarpShape,
-      InstructionShape, Stages, Operator>::ThreadblockMma;
-  using Mma1 = typename cutlass::gemm::threadblock::DefaultMma<
-      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB1, kAlignmentB,
-      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, ArchTag,
-      ThreadblockShape, WarpShape,
-      InstructionShape, Stages, Operator>::ThreadblockMma;
-  using DualMma = threadblock::DualMmaMultistage<
-    typename Mma0::Shape,
-    typename Mma0::IteratorA,
-    typename Mma0::SmemIteratorA,
-    Mma0::kCacheOpA,
-    typename Mma0::IteratorB,
-    typename Mma0::SmemIteratorB,
-    Mma0::kCacheOpB,
-    typename Mma1::IteratorB,
-    typename Mma1::SmemIteratorB,
-    typename Mma0::ElementC,
-    typename Mma0::LayoutC,
-    typename Mma0::Policy,
-    typename Mma1::Policy,
-    Mma0::kStages,
-    SharedMemoryClearOption::kNone
-  >;
-  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
-  /// Define the epilogue
-  using Epilogue0 =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename DualMma::Operator0, kPartitionsK, EpilogueOutputOp0,
-          EpilogueOutputOp0::kCount>::Epilogue;
-  using Epilogue1 =
-      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
-          ThreadblockShape, typename DualMma::Operator1, kPartitionsK, EpilogueOutputOp1,
-          EpilogueOutputOp1::kCount>::Epilogue;
-  /// Define the kernel-level GEMM operator.
-  using DualGemmKernel = kernel::DualGemm<
-    DualMma,
-    Epilogue0, Epilogue1, EpilogueOutputOp2,
-    ThreadblockSwizzle, kSplitKSerial,
-    kStoreD0, kStoreD1>;
-  /// Argument structure
-  struct Arguments {
-    //
-    // Data members
-    //
-    DualGemmMode mode;
-    GemmCoord problem_size;
-    TensorRef<ElementA const, LayoutA> ref_A0;
-    TensorRef<ElementB const, LayoutB0> ref_B0;
-    TensorRef<ElementC const, LayoutC> ref_C0;
-    TensorRef<ElementC, LayoutC> ref_D0;
-    TensorRef<ElementB const, LayoutB1> ref_B1;
-    TensorRef<ElementC const, LayoutC> ref_C1;
-    TensorRef<ElementC, LayoutC> ref_D1;
-    TensorRef<ElementC, LayoutC> ref_D2;
-    typename EpilogueOutputOp0::Params epilogue0;
-    typename EpilogueOutputOp1::Params epilogue1;
-    typename EpilogueOutputOp2::Params epilogue2;
-    int split_k_slices;
-    int batch_count;
-    int64_t batch_stride_A;
-    int64_t batch_stride_B0;
-    int64_t batch_stride_B1;
-    int64_t batch_stride_C;
-    int64_t batch_stride_D;
-    //
-    // Methods
-    //
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Arguments(): problem_size(0, 0, 0), split_k_slices(1) {
-    }
-    /// Constructs an Arguments structure
-    CUTLASS_HOST_DEVICE
-    Arguments(
-      DualGemmMode mode,
-      GemmCoord problem_size_,
-      TensorRef<ElementA const, LayoutA> ref_A0_,
-      TensorRef<ElementB const, LayoutB0> ref_B0_,
-      TensorRef<ElementC const, LayoutC> ref_C0_,
-      TensorRef<ElementC, LayoutC> ref_D0_,
-      TensorRef<ElementB const, LayoutB1> ref_B1_,
-      TensorRef<ElementC const, LayoutC> ref_C1_,
-      TensorRef<ElementC, LayoutC> ref_D1_,
-      TensorRef<ElementC, LayoutC> ref_D2_,
-      typename EpilogueOutputOp0::Params epilogue0_ =
-        typename EpilogueOutputOp0::Params(),
-      typename EpilogueOutputOp1::Params epilogue1_ =
-        typename EpilogueOutputOp1::Params(),
-      typename EpilogueOutputOp2::Params epilogue2_ =
-        typename EpilogueOutputOp2::Params(),
-      int split_k_slices_ = 1,
-      int batch_count = 1,
-      int64_t batch_stride_A = 0,
-      int64_t batch_stride_B0 = 0,
-      int64_t batch_stride_B1 = 0,
-      int64_t batch_stride_C = 0,
-      int64_t batch_stride_D = 0
-    ):
-      mode(mode),
-      problem_size(problem_size_),
-      ref_A0(ref_A0_),
-      ref_B0(ref_B0_),
-      ref_C0(ref_C0_),
-      ref_D0(ref_D0_),
-      ref_B1(ref_B1_),
-      ref_C1(ref_C1_),
-      ref_D1(ref_D1_),
-      ref_D2(ref_D2_),
-      epilogue0(epilogue0_),
-      epilogue1(epilogue1_),
-      epilogue2(epilogue2_),
-      split_k_slices(split_k_slices_),
-      batch_count(batch_count),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B0(batch_stride_B0),
-      batch_stride_B1(batch_stride_B1),
-      batch_stride_C(batch_stride_C),
-      batch_stride_D(batch_stride_D) {
-    }
-  };
-private:
-  /// Kernel parameters object
-  typename DualGemmKernel::Params params_;
-public:
-  /// Constructs the GEMM.
-  DualGemm() = default;
-  /// Determines whether the GEMM can execute the given problem.
-  static Status can_implement(Arguments const &args) {
-    if (args.mode == DualGemmMode::kBatched && kSplitKSerial) {
-      return Status::kErrorInvalidProblem;
-    }
-    if (!kSplitKSerial && args.split_k_slices > 1) {
-      return Status::kErrorInvalidProblem;
-    }
-    if (kStoreD0 != (args.ref_D0.data() != nullptr)) {
-      return Status::kErrorInternal;
-    }
-    if (kStoreD1 != (args.ref_D1.data() != nullptr)) {
-      return Status::kErrorInternal;
-    }
-    Status status = DualGemmKernel::can_implement(
-      args.problem_size,
-      args.ref_A0.non_const_ref(),
-      args.ref_B0.non_const_ref(),
-      args.ref_C0.non_const_ref(),
-      args.ref_D0,
-      args.ref_B1.non_const_ref(),
-      args.ref_C1.non_const_ref(),
-      args.ref_D1,
-      args.ref_D2
-    );
-    if (status != Status::kSuccess) {
-      return status;
-    }
-    return Status::kSuccess;
-  }
-  /// Gets the workspace size
-  static size_t get_workspace_size(Arguments const &args) {
-    size_t bytes = 0;
-    if (kSplitKSerial && args.split_k_slices > 1) {
-      // Determine grid shape
-      ThreadblockSwizzle threadblock_swizzle;
-      cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
-        args.problem_size,
-        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-        args.split_k_slices);
-      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
-    }
-    return bytes;
-  }
-  /// Initializes GEMM state from arguments.
-  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
-    // Determine grid shape
-    ThreadblockSwizzle threadblock_swizzle;
-    cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
-      args.problem_size,
-      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
-      args.mode == DualGemmMode::kBatched ? args.batch_count : args.split_k_slices);
-    if (kSplitKSerial) {
-      if (args.split_k_slices > 1) {
-        if (!workspace) {
-          return Status::kErrorWorkspaceNull;
-        }
-        size_t bytes = get_workspace_size(args);
-        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
-        if (result != cudaSuccess) {
-          return Status::kErrorInternal;
-        }
-      }
-    }
-    else {
-      if (args.split_k_slices > 1) {
-        return Status::kErrorInvalidProblem;
-      }
-    }
-    // Initialize the Params structure
-    params_ = typename DualGemmKernel::Params{
-      args.mode,
-      args.problem_size,
-      grid_shape,
-      args.ref_A0.non_const_ref(),
-      args.ref_B0.non_const_ref(),
-      args.ref_C0.non_const_ref(),
-      args.ref_D0,
-      args.ref_B1.non_const_ref(),
-      args.ref_C1.non_const_ref(),
-      args.ref_D1,
-      args.ref_D2,
-      args.epilogue0,
-      args.epilogue1,
-      args.epilogue2,
-      reinterpret_cast<int *>(workspace),
-      args.batch_stride_A,
-      args.batch_stride_B0,
-      args.batch_stride_B1,
-      args.batch_stride_C,
-      args.batch_stride_D,
-    };
-    return Status::kSuccess;
-  }
-  /// Lightweight update given a subset of arguments
-  Status update(Arguments const &args, void *workspace = nullptr) {
-    if (kSplitKSerial && args.split_k_slices > 1) {
-      if (!workspace) {
-        return Status::kErrorWorkspaceNull;
-      }
-    }
-    params_.ref_A0.reset(args.ref_A0.non_const_ref().data());
-    params_.ref_B0.reset(args.ref_B0.non_const_ref().data());
-    params_.ref_C0.reset(args.ref_C0.non_const_ref().data());
-    params_.ref_D0.reset(args.ref_D0.data());
-    params_.ref_B1.reset(args.ref_B1.non_const_ref().data());
-    params_.ref_C1.reset(args.ref_C1.non_const_ref().data());
-    params_.ref_D1.reset(args.ref_D1.data());
-    params_.ref_D2.reset(args.ref_D2.data());
-    params_.output_op_0 = args.epilogue0;
-    params_.output_op_1 = args.epilogue1;
-    params_.output_op_2 = args.epilogue2;
-    params_.semaphore = reinterpret_cast<int *>(workspace);
-    return Status::kSuccess;
-  }
-  /// Runs the kernel using initialized state.
-  Status run(cudaStream_t stream = nullptr) {
-    ThreadblockSwizzle threadblock_swizzle;
-    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
-    dim3 block(DualGemmKernel::kThreadCount, 1, 1);
-    cudaError_t result;
-    int smem_size = int(sizeof(typename DualGemmKernel::SharedStorage));
-    if (smem_size >= (48 << 10)) {
-      result = cudaFuncSetAttribute(Kernel<DualGemmKernel>,
-                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
-                                    smem_size);
-      if (result != cudaSuccess) {
-        return Status::kErrorInternal;
-      }
-    }
-    cutlass::Kernel<DualGemmKernel><<<grid, block, smem_size, stream>>>(params_);
-    result = cudaGetLastError();
-    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
-  }
-  /// Runs the kernel using initialized state.
-  Status operator()(cudaStream_t stream = nullptr) {
-    return run(stream);
-  }
-  /// Runs the kernel using initialized state.
-  Status operator()(
-    Arguments const &args,
-    void *workspace = nullptr,
-    cudaStream_t stream = nullptr) {
-    Status status = initialize(args, workspace, stream);
-    if (status == Status::kSuccess) {
-      status = run(stream);
-    }
-    return status;
-  }
-};
-} // namespace device
-} // namespace gemm
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/dual_gemm_common.h DELETED Viewed

@@ -1,52 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Defines common types used for all DualGemm operators.
-*/
-#pragma once
-namespace cutlass {
-namespace gemm {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-enum class DualGemmMode {
-  kGemm,
-  kBatched,
-  kInvalid
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace gemm
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/dual_gemm_run.h DELETED Viewed

@@ -1,938 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <type_traits>
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/util/tensor_view_io.h"
-#include "cutlass/util/distribution.h"
-#include "cutlass/util/reference/host/tensor_fill.h"
-#include "cutlass/util/reference/host/tensor_copy.h"
-#include "cutlass/util/reference/host/tensor_compare.h"
-#include "cutlass/util/reference/host/tensor_norm.h"
-#include "cutlass/util/reference/device/gemm.h"
-#include "cutlass/util/reference/device/tensor_relu.h"
-#include "cutlass/platform/platform.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/device/gemm_universal.h"
-#include "dual_gemm_common.h"
-#include "helper.h"
-#define CHECK_GT(val1, val2) \
-    if((val1) <= (val2)) \
-        std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n";
-#define CHECK_TRUE(val) \
-    if(!(val)) \
-        std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n";
-template <
-  typename OutputOp,
-  typename Element,
-  typename Layout>
-struct TensorEpilogueForEachFunc {
-  /// View type
-  using TensorView = cutlass::TensorView<Element, Layout>;
-  /// Coordinate in tensor's index space
-  using TensorCoord = typename TensorView::TensorCoord;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    TensorView view_x0;
-    TensorView view_x1;
-    TensorView view_y;
-    OutputOp output_op;
-    //
-    // Methods
-    //
-    Params(
-      TensorView view_x0_ = TensorView(),
-      TensorView view_x1_ = TensorView(),
-      TensorView view_y_ = TensorView(),
-      OutputOp output_op_ = OutputOp(typename OutputOp::Params{})
-    ):
-      view_x0(view_x0_), view_x1(view_x1_), view_y(view_y_), output_op(output_op_) {
-    }
-  };
-  Params params;
-  CUTLASS_DEVICE
-  TensorEpilogueForEachFunc(Params const &params): params(params) {
-  }
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    Element const & x0 = params.view_x0.at(coord);
-    Element const & x1 = params.view_x1.at(coord);
-    Element& y = params.view_y.at(coord);
-    y = params.output_op(x0, x1);
-  }
-};
-template <
-  typename OutputOp,
-  typename Element,
-  typename Layout>
-void TensorEpilogueForEach(
-  cutlass::TensorView<Element, Layout> x0,
-  cutlass::TensorView<Element, Layout> x1,
-  cutlass::TensorView<Element, Layout> y) {
-  using Func = TensorEpilogueForEachFunc<OutputOp, Element, Layout>;
-  using Params = typename Func::Params;
-  cutlass::reference::device::TensorForEach<Func, Layout::kRank, Params>(
-    y.extent(),
-    Params(x0, x1, y)
-  );
-}
-////////////////////////////////////////////////////////////////////////////////
-template <typename Gemm0_, typename Gemm1_>
-struct NonFusedDualGemmRun
-{
-  using Gemm0 = Gemm0_;
-  using Gemm1 = Gemm1_;
-  using ElementAccumulator = typename Gemm0::ElementAccumulator;
-  using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute;
-  /// Initialization
-  cutlass::Distribution::Kind init_A;
-  cutlass::Distribution::Kind init_B;
-  cutlass::Distribution::Kind init_C;
-  cutlass::Distribution::Kind init_Bias;
-  uint64_t seed;
-  //
-  // Methods
-  //
-  NonFusedDualGemmRun(
-    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
-    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
-    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
-    cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
-    uint64_t seed_ = 2080
-  ):
-    init_A(init_A_), init_B(init_B_), init_C(init_C_), init_Bias(init_Bias_), seed(seed_) { }
-  /// Helper to initialize a tensor view
-  template <typename Element, typename Layout>
-  bool initialize_tensor(
-    cutlass::TensorView<Element, Layout> view,
-    cutlass::Distribution::Kind dist_kind,
-    uint64_t seed) {
-    if (dist_kind == cutlass::Distribution::Uniform) {
-      cutlass::reference::host::TensorFillRandomUniform(
-        view, seed, 2, -2, 0);
-    }
-    else if (dist_kind == cutlass::Distribution::Identity) {
-      cutlass::reference::host::TensorFillIdentity(view);
-    }
-    else if (dist_kind == cutlass::Distribution::Gaussian) {
-      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
-    }
-    else if (dist_kind == cutlass::Distribution::Sequential) {
-      cutlass::reference::host::BlockFillSequential(
-        view.data(), view.capacity());
-    }
-    else if (dist_kind == cutlass::Distribution::AllZeros) {
-      cutlass::reference::host::TensorFill(view, Element(0));
-    }
-    else if (dist_kind == cutlass::Distribution::AllOnes) {
-      cutlass::reference::host::TensorFill(view, Element(1));
-    }
-    else {
-      std::cerr << "Not implemented\n";
-      return false;
-    }
-    return true;
-  }
-  /// Runs both GEMMs as two separate (non-fused) kernels and validates them.
-  ///
-  /// Gemm0 is launched on (A0, B0, Bias0) and writes D0; Gemm1 is launched on
-  /// (A0, B1, Bias1) and writes D1. Both share the A operand. Each output is
-  /// compared with cutlass::reference::host::TensorEquals against a
-  /// cutlass::reference::device::Gemm evaluation of the same operands, with a
-  /// device-side ReLU applied to the reference when `relu` is set.
-  ///
-  /// \param problem_size  GEMM extents; the same extents are used for both GEMMs.
-  /// \param alpha0        Epilogue scalar passed to Gemm0 and its reference.
-  /// \param beta0         Epilogue scalar passed to Gemm0 and its reference.
-  /// \param alpha1        Epilogue scalar passed to Gemm1 and its reference.
-  /// \param beta1         Epilogue scalar passed to Gemm1 and its reference.
-  /// \param is_profiling  When true, times `runs` launches of each GEMM with
-  ///                      CUDA events and prints the per-launch averages.
-  /// \param relu          When true, applies ReLU to both reference outputs
-  ///                      before they are compared.
-  /// \param warm_ups      Number of untimed launches of each GEMM.
-  /// \param runs          Number of timed launches of each GEMM.
-  /// \return true only when D0 and D1 both equal their references. On a
-  ///         mismatch the operands and results are written to
-  ///         "error_DualGemm_device_nonfused.txt".
-  bool run(
-    cutlass::gemm::GemmCoord problem_size,
-    ElementCompute alpha0 = ElementCompute(1),
-    ElementCompute beta0 = ElementCompute(0),
-    ElementCompute alpha1 = ElementCompute(1),
-    ElementCompute beta1 = ElementCompute(0),
-    bool is_profiling = true,
-    bool relu = false,
-    int warm_ups = 1,
-    int runs = 100) {
-    //
-    // Allocate the GEMM workspace
-    //
-    cutlass::HostTensor<
-      typename Gemm0::ElementA,
-      typename Gemm0::LayoutA> tensor_A0(problem_size.mk());
-    cutlass::HostTensor<
-      typename Gemm0::ElementB,
-      typename Gemm0::LayoutB> tensor_B0(problem_size.kn());
-    cutlass::HostTensor<
-      typename Gemm0::ElementC,
-      typename Gemm0::LayoutC> tensor_C0(problem_size.mn());
-    // Bias0 feeds Gemm0 only, so it is declared with Gemm0's element type
-    // (the original spelled this Gemm1::ElementC).
-    cutlass::HostTensor<
-      typename Gemm0::ElementC,
-      typename Gemm0::LayoutC> tensor_Bias0({1, problem_size.n()});
-    cutlass::HostTensor<
-      typename Gemm0::ElementC,
-      typename Gemm0::LayoutC> tensor_D0(problem_size.mn());
-    cutlass::HostTensor<
-      typename Gemm0::ElementC,
-      typename Gemm0::LayoutC> reference_D0(problem_size.mn());
-    cutlass::HostTensor<
-      typename Gemm1::ElementB,
-      typename Gemm1::LayoutB> tensor_B1(problem_size.kn());
-    cutlass::HostTensor<
-      typename Gemm1::ElementC,
-      typename Gemm1::LayoutC> tensor_C1(problem_size.mn());
-    cutlass::HostTensor<
-      typename Gemm1::ElementC,
-      typename Gemm1::LayoutC> tensor_Bias1({1, problem_size.n()});
-    cutlass::HostTensor<
-      typename Gemm1::ElementC,
-      typename Gemm1::LayoutC> tensor_D1(problem_size.mn());
-    cutlass::HostTensor<
-      typename Gemm1::ElementC,
-      typename Gemm1::LayoutC> reference_D1(problem_size.mn());
-    // Each operand gets its own seed offset so the fills are independent.
-    CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
-    CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018));
-    CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
-    CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2014));
-    CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016));
-    CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
-    CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2013));
-    // Reset every output buffer with TensorFill's default value.
-    cutlass::reference::host::TensorFill(
-      tensor_D0.host_view());
-    cutlass::reference::host::TensorFill(
-      tensor_D1.host_view());
-    cutlass::reference::host::TensorFill(
-      reference_D0.host_view());
-    cutlass::reference::host::TensorFill(
-      reference_D1.host_view());
-    tensor_A0.sync_device();
-    tensor_B0.sync_device();
-    tensor_C0.sync_device();
-    tensor_Bias0.sync_device();
-    tensor_D0.sync_device();
-    reference_D0.sync_device();
-    tensor_B1.sync_device();
-    tensor_C1.sync_device();
-    tensor_Bias1.sync_device();
-    tensor_D1.sync_device();
-    reference_D1.sync_device();
-    //
-    // Initialize the GEMM operator
-    //
-    int split_k_slices = Gemm0::kSplitKSerial ? 2 : 1;
-    // The bias is passed as the C operand with a zero stride, so the same
-    // bias row is read for every row of the output.
-    typename Gemm0::Arguments arguments_0{
-      problem_size,
-      tensor_A0.device_ref(),
-      tensor_B0.device_ref(),
-      {tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)},
-      tensor_D0.device_ref(),
-      {alpha0, beta0},
-      split_k_slices
-    };
-    split_k_slices = Gemm1::kSplitKSerial ? 2 : 1;
-    typename Gemm1::Arguments arguments_1{
-      problem_size,
-      tensor_A0.device_ref(),
-      tensor_B1.device_ref(),
-      {tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)},
-      tensor_D1.device_ref(),
-      {alpha1, beta1},
-      split_k_slices
-    };
-    Gemm0 gemm_op_0;
-    Gemm1 gemm_op_1;
-    // Allocate workspace memory
-    cutlass::device_memory::allocation<uint8_t> workspace0(gemm_op_0.get_workspace_size(arguments_0));
-    cutlass::device_memory::allocation<uint8_t> workspace1(gemm_op_1.get_workspace_size(arguments_1));
-    cutlass::Status status = gemm_op_0.initialize(arguments_0, workspace0.get());
-    CUTLASS_CHECK(status);
-    status = gemm_op_1.initialize(arguments_1, workspace1.get());
-    CUTLASS_CHECK(status);
-    for(int i = 0; i < warm_ups; i++) {
-        status = gemm_op_0();
-        CUTLASS_CHECK(status);
-        status = gemm_op_1();
-        CUTLASS_CHECK(status);
-    }
-    if (is_profiling) {
-      //
-      // Profile the GEMM
-      //
-      cudaEvent_t start, stop1, stop2;
-      cudaEventCreate(&start);
-      cudaEventCreate(&stop1);
-      cudaEventCreate(&stop2);
-      cudaEventRecord(start);
-      for(int i = 0; i < runs; i++) {
-          status = gemm_op_0();
-          CUTLASS_CHECK(status);
-      }
-      cudaEventRecord(stop1);
-      for(int i = 0; i < runs; i++) {
-          status = gemm_op_1();
-          CUTLASS_CHECK(status);
-      }
-      cudaEventRecord(stop2);
-      cudaDeviceSynchronize();
-      float gemm0Time, gemm1Time, totalTime;
-      cudaEventElapsedTime(&gemm0Time, start, stop1);
-      cudaEventElapsedTime(&gemm1Time, stop1, stop2);
-      cudaEventElapsedTime(&totalTime, start, stop2);
-      // Release the timing events once the elapsed times have been read;
-      // otherwise every profiled call leaks three events.
-      cudaEventDestroy(start);
-      cudaEventDestroy(stop1);
-      cudaEventDestroy(stop2);
-      std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n";
-      std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n";
-      std::cout << "Non-fusion GEMM only time " << totalTime / (float)runs << " ms\n";
-    }
-    tensor_D0.sync_host();
-    tensor_D1.sync_host();
-    //
-    // Verify
-    //
-    cutlass::reference::device::Gemm<
-        typename Gemm0::ElementA, typename Gemm0::LayoutA,
-        typename Gemm0::ElementB, typename Gemm0::LayoutB,
-        typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute,
-        ElementAccumulator, typename Gemm0::Operator>
-        reference_gemm_0;
-    cutlass::reference::device::Gemm<
-        typename Gemm1::ElementA, typename Gemm1::LayoutA,
-        typename Gemm1::ElementB, typename Gemm1::LayoutB,
-        typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute,
-        ElementAccumulator, typename Gemm1::Operator>
-        reference_gemm_1;
-    reference_gemm_0(
-      problem_size,
-      alpha0,
-      tensor_A0.device_ref(),
-      tensor_B0.device_ref(),
-      beta0,
-      {tensor_Bias0.device_data(), typename Gemm0::LayoutC::Stride(0)},
-      reference_D0.device_ref()
-    );
-    if(relu) {
-       cutlass::reference::device::TensorReLu(reference_D0.device_view());
-    }
-    reference_gemm_1(
-      problem_size,
-      alpha1,
-      tensor_A0.device_ref(),
-      tensor_B1.device_ref(),
-      beta1,
-      {tensor_Bias1.device_data(), typename Gemm1::LayoutC::Stride(0)},
-      reference_D1.device_ref()
-    );
-    if(relu) {
-       cutlass::reference::device::TensorReLu(reference_D1.device_view());
-    }
-    // Wait for kernels to finish
-    cudaDeviceSynchronize();
-    reference_D0.sync_host();
-    reference_D1.sync_host();
-    // A zero norm would mean an output was never written, which would make
-    // the equality checks below pass vacuously.
-    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0);
-    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
-    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
-    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
-    // Validate D0 against its own reference. The original compared D1 twice,
-    // so a wrong D0 was never detected.
-    bool passed0 = cutlass::reference::host::TensorEquals(
-      reference_D0.host_view(),
-      tensor_D0.host_view());
-    CHECK_TRUE(passed0);
-    bool passed1 = cutlass::reference::host::TensorEquals(
-      reference_D1.host_view(),
-      tensor_D1.host_view());
-    CHECK_TRUE(passed1);
-    if (!passed0 || !passed1) {
-      std::stringstream fname;
-      fname << "error_DualGemm_device_nonfused.txt";
-      std::cerr << "Dumping results in " << fname.str() << "\n";
-      std::ofstream file(fname.str());
-      // Dump reference and computed values for both outputs so that a
-      // failure in either GEMM can be diagnosed from the file alone.
-      file
-        << "A0 =\n" << tensor_A0.host_view()
-        << "\nB0 =\n" << tensor_B0.host_view()
-        << "\nC0 =\n" << tensor_C0.host_view()
-        << "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
-        << "\nB1 =\n" << tensor_B1.host_view()
-        << "\nC1 =\n" << tensor_C1.host_view()
-        << "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
-        << "\n\nReference0 =\n" << reference_D0.host_view()
-        << "\nComputed0 =\n" << tensor_D0.host_view()
-        << "\n\nReference1 =\n" << reference_D1.host_view()
-        << "\nComputed1 =\n" << tensor_D1.host_view();
-    }
-    return passed0 && passed1;
-  }
-};
-/// Test/benchmark harness for a fused dual-GEMM device operator.
-///
-/// `DualGemm_` is the device-level operator under test. One launch of it is
-/// given the operands (A0, B0, Bias0, B1, Bias1) and the output buffers
-/// D0, D1 and D2; D0 and D1 are passed only when DualGemm::kStoreD0 /
-/// DualGemm::kStoreD1 are set. The results are checked against two
-/// cutlass::gemm::device::GemmUniversal launches followed by
-/// TensorEpilogueForEach<EpilogueOutputOp2>.
-template <typename DualGemm_>
-struct DualFusedGemmRun
-{
-  using DualGemm = DualGemm_;
-  using ElementAccumulator = typename DualGemm::ElementAccumulator;
-  using ElementCompute = typename DualGemm::DualGemmKernel::Epilogue0::OutputOp::ElementCompute;
-  using EpilogueOutputOp2 = typename DualGemm::EpilogueOutputOp2;
-  /// Initialization
-  cutlass::Distribution::Kind init_A;
-  cutlass::Distribution::Kind init_B;
-  cutlass::Distribution::Kind init_C;
-  cutlass::Distribution::Kind init_Scale;
-  cutlass::Distribution::Kind init_Bias;
-  uint64_t seed;
-  //
-  // Methods
-  //
-  /// Stores the fill distribution for each operand and the base seed.
-  DualFusedGemmRun(
-    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
-    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
-    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
-    cutlass::Distribution::Kind init_Scale_ = cutlass::Distribution::Uniform,
-    cutlass::Distribution::Kind init_Bias_ = cutlass::Distribution::Uniform,
-    uint64_t seed_ = 2080
-  ):
-    init_A(init_A_), init_B(init_B_), init_C(init_C_),
-    init_Scale(init_Scale_), init_Bias(init_Bias_), seed(seed_) { }
-  /// Helper to initialize a tensor view
-  ///
-  /// Fills `view` on the host according to `dist_kind`.
-  /// \return false (after printing "Not implemented") for a distribution
-  ///         kind that has no branch below, true otherwise.
-  template <typename Element, typename Layout>
-  bool initialize_tensor(
-    cutlass::TensorView<Element, Layout> view,
-    cutlass::Distribution::Kind dist_kind,
-    uint64_t seed) {
-    if (dist_kind == cutlass::Distribution::Uniform) {
-      cutlass::reference::host::TensorFillRandomUniform(
-        view, seed, 2, -2, 0);
-    }
-    else if (dist_kind == cutlass::Distribution::Identity) {
-      cutlass::reference::host::TensorFillIdentity(view);
-    }
-    else if (dist_kind == cutlass::Distribution::Gaussian) {
-      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
-    }
-    else if (dist_kind == cutlass::Distribution::Sequential) {
-      cutlass::reference::host::BlockFillSequential(
-        view.data(), view.capacity());
-    }
-    else if (dist_kind == cutlass::Distribution::AllZeros) {
-      cutlass::reference::host::TensorFill(view, Element(0));
-    }
-    else if (dist_kind == cutlass::Distribution::AllOnes) {
-      cutlass::reference::host::TensorFill(view, Element(1));
-    }
-    else {
-      std::cerr << "Not implemented\n";
-      return false;
-    }
-    return true;
-  }
-  /// Executes one test
-  ///
-  /// \param problem_size  GEMM extents of a single batch entry.
-  /// \param alpha0        Epilogue scalar for the first GEMM and its reference.
-  /// \param beta0         Epilogue scalar for the first GEMM and its reference;
-  ///                      when it equals 0 no bias reference is passed to the
-  ///                      fused operator for that GEMM.
-  /// \param alpha1        Epilogue scalar for the second GEMM and its reference.
-  /// \param beta1         Epilogue scalar for the second GEMM and its reference;
-  ///                      when it equals 0 no bias reference is passed to the
-  ///                      fused operator for that GEMM.
-  /// \param batch_count   Number of batch entries; values above 1 select the
-  ///                      kBatched modes.
-  /// \param broadcast_b1  When true, B1 is resized to (k, batch_count) and
-  ///                      passed with a zero stride.
-  /// \param is_profiling  When true, times `runs` launches with CUDA events
-  ///                      and prints the per-launch average.
-  /// \param relu          When true, applies ReLU to the D0/D1 references
-  ///                      before the D2 reference is computed.
-  /// \param warm_ups      Number of untimed launches.
-  /// \param runs          Number of timed launches.
-  /// \return true when every checked output equals its reference. On a
-  ///         mismatch the operands and results are written to
-  ///         "error_DualGemm_device_fused.txt".
-  bool run(
-    cutlass::gemm::GemmCoord problem_size,
-    ElementCompute alpha0 = ElementCompute(1),
-    ElementCompute beta0 = ElementCompute(1),
-    ElementCompute alpha1 = ElementCompute(1),
-    ElementCompute beta1 = ElementCompute(1),
-    int batch_count = 1,
-    bool broadcast_b1 = false,
-    bool is_profiling = true,
-    bool relu = false,
-    int warm_ups = 1,
-    int runs = 100) {
-    //
-    // Allocate the GEMM workspace
-    //
-    // Batches are stacked inside one 2-D tensor: along the row extent for
-    // RowMajor layouts and along the column extent otherwise.
-    cutlass::HostTensor<
-      typename DualGemm::ElementA,
-      typename DualGemm::LayoutA> tensor_A0(
-        cutlass::platform::is_same<typename DualGemm::LayoutA, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.k()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.k()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementB,
-      typename DualGemm::LayoutB0> tensor_B0(
-        cutlass::platform::is_same<typename DualGemm::LayoutB0, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.k(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.k(), batch_count * problem_size.n()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutC> tensor_C0(
-        cutlass::platform::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutScaleBias> tensor_Bias0({batch_count, problem_size.n()});
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutC> tensor_D0(
-        cutlass::platform::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutC> reference_D0(
-        cutlass::platform::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementB,
-      typename DualGemm::LayoutB1> tensor_B1(
-        cutlass::platform::is_same<typename DualGemm::LayoutB1, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.k(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.k(), batch_count * problem_size.n()));
-    if (broadcast_b1) {
-      // One k-length vector per batch entry instead of a full (k, n) matrix.
-      tensor_B1.resize({problem_size.k(), batch_count});
-    }
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutC> tensor_C1(
-        cutlass::platform::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutScaleBias> tensor_Bias1({batch_count, problem_size.n()});
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutC> tensor_D1(
-        cutlass::platform::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutC> tensor_D2(
-        cutlass::platform::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutC> reference_D1(
-        cutlass::platform::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
-    cutlass::HostTensor<
-      typename DualGemm::ElementC,
-      typename DualGemm::LayoutC> reference_D2(
-        cutlass::platform::is_same<typename DualGemm::LayoutC, cutlass::layout::RowMajor>::value ?
-          cutlass::MatrixCoord(batch_count * problem_size.m(), problem_size.n()) :
-          cutlass::MatrixCoord(problem_size.m(), batch_count * problem_size.n()));
-    // Each operand gets its own seed offset so the fills are independent.
-    CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019));
-    CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2118));
-    CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017));
-    CHECK_TRUE(initialize_tensor(tensor_Bias0.host_view(), init_Bias, seed + 2011));
-    CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2113));
-    CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015));
-    CHECK_TRUE(initialize_tensor(tensor_Bias1.host_view(), init_Bias, seed + 2012));
-    // Reset every output buffer with TensorFill's default value.
-    cutlass::reference::host::TensorFill(
-      tensor_D0.host_view());
-    cutlass::reference::host::TensorFill(
-      tensor_D1.host_view());
-    cutlass::reference::host::TensorFill(
-      tensor_D2.host_view());
-    cutlass::reference::host::TensorFill(
-      reference_D0.host_view());
-    cutlass::reference::host::TensorFill(
-      reference_D1.host_view());
-    cutlass::reference::host::TensorFill(
-      reference_D2.host_view());
-    tensor_A0.sync_device();
-    tensor_B0.sync_device();
-    tensor_C0.sync_device();
-    tensor_Bias0.sync_device();
-    tensor_B1.sync_device();
-    tensor_C1.sync_device();
-    tensor_Bias1.sync_device();
-    tensor_D0.sync_device();
-    tensor_D1.sync_device();
-    tensor_D2.sync_device();
-    reference_D0.sync_device();
-    reference_D1.sync_device();
-    reference_D2.sync_device();
-    //
-    // Batch strides (irrelevant when batch_count == 1)
-    //
-    int64_t batch_stride_A = problem_size.m() * problem_size.k();
-    int64_t batch_stride_B0 = problem_size.k() * problem_size.n();
-    int64_t batch_stride_B1 = problem_size.k() * problem_size.n();
-    if (broadcast_b1) {
-      // B1 is a (column) vector
-      batch_stride_B1 = problem_size.k();
-    }
-    int64_t batch_stride_Bias = problem_size.n();
-    int64_t batch_stride_D = problem_size.m() * problem_size.n();
-    //
-    // Initialize the GEMM operator
-    //
-    int split_k_slices = DualGemm::kSplitKSerial ? 2 : 1;
-    typename cutlass::TensorRef<typename DualGemm::ElementC, typename DualGemm::LayoutC> nullptr_ref{};
-    // Bias references stay default-constructed (null) unless the matching
-    // beta is non-zero; the zero stride reuses one bias row for all rows.
-    decltype(nullptr_ref) ref_B0, ref_B1;
-    if (beta0 != ElementCompute(0)) {
-      ref_B0 = {tensor_Bias0.device_data(), typename DualGemm::LayoutC::Stride(0)};
-    }
-    if (beta1 != ElementCompute(0)) {
-      ref_B1 = {tensor_Bias1.device_data(), typename DualGemm::LayoutC::Stride(0)};
-    }
-    typename DualGemm::Arguments arguments{
-      (batch_count > 1 ?
-        cutlass::gemm::DualGemmMode::kBatched :
-        cutlass::gemm::DualGemmMode::kGemm),
-      problem_size,
-      tensor_A0.device_ref(),
-      tensor_B0.device_ref(),
-      ref_B0,
-      DualGemm::kStoreD0 ? tensor_D0.device_ref() : nullptr_ref,
-      (broadcast_b1 ?
-        typename DualGemm::TensorRefB1(tensor_B1.device_data(), 0) :
-        tensor_B1.device_ref()),
-      ref_B1,
-      DualGemm::kStoreD1 ? tensor_D1.device_ref() : nullptr_ref,
-      tensor_D2.device_ref(),
-      {alpha0, beta0},
-      {alpha1, beta1},
-      {},
-      split_k_slices,
-      batch_count,
-      batch_stride_A,
-      batch_stride_B0,
-      batch_stride_B1,
-      batch_stride_Bias,
-      batch_stride_D,
-    };
-    //
-    // Run the GEMM
-    //
-    DualGemm b2b_gemm_op;
-    cutlass::device_memory::allocation<uint8_t> workspace(b2b_gemm_op.get_workspace_size(arguments));
-    cutlass::Status status = b2b_gemm_op.can_implement(arguments);
-    CUTLASS_CHECK(status);
-    status = b2b_gemm_op.initialize(arguments, workspace.get());
-    CUTLASS_CHECK(status);
-    for(int i = 0; i < warm_ups; i++) {
-        status = b2b_gemm_op();
-        CUTLASS_CHECK(status);
-    }
-    if (is_profiling) {
-      //
-      // Profile the GEMM
-      //
-      cudaEvent_t start, stop;
-      cudaEventCreate(&start);
-      cudaEventCreate(&stop);
-      cudaEventRecord(start);
-      for(int i = 0; i < runs; i++) {
-          status = b2b_gemm_op();
-          CUTLASS_CHECK(status);
-      }
-      cudaEventRecord(stop);
-      cudaDeviceSynchronize();
-      float gemmTime;
-      cudaEventElapsedTime(&gemmTime, start, stop);
-      // Release the timing events once the elapsed time has been read;
-      // otherwise every profiled call leaks two events.
-      cudaEventDestroy(start);
-      cudaEventDestroy(stop);
-      std::cout << "Fusion time " << gemmTime / (float)runs << " ms\n";
-    }
-    tensor_D0.sync_host();
-    tensor_D1.sync_host();
-    tensor_D2.sync_host();
-    //
-    // Verify
-    //
-    using GemmUniversal0 = cutlass::gemm::device::GemmUniversal<
-      typename DualGemm::ElementA, typename DualGemm::LayoutA,
-      typename DualGemm::ElementB, typename DualGemm::LayoutB0,
-      typename DualGemm::ElementC, typename DualGemm::LayoutC,
-      ElementAccumulator
-    >;
-    GemmUniversal0 reference_gemm0;
-    typename GemmUniversal0::Arguments args0 {
-      (batch_count > 1 ?
-        cutlass::gemm::GemmUniversalMode::kBatched :
-        cutlass::gemm::GemmUniversalMode::kGemm),
-      problem_size,
-      batch_count,
-      {alpha0, beta0},
-      tensor_A0.device_data(),
-      tensor_B0.device_data(),
-      tensor_Bias0.device_data(),
-      reference_D0.device_data(),
-      batch_stride_A,
-      batch_stride_B0,
-      batch_stride_Bias,
-      batch_stride_D,
-      tensor_A0.stride(0),
-      tensor_B0.stride(0),
-      0,  // zero stride for the bias vector
-      reference_D0.stride(0),
-    };
-    status = reference_gemm0.can_implement(args0);
-    CUTLASS_CHECK(status);
-    status = reference_gemm0(args0);
-    CUTLASS_CHECK(status);
-    using GemmUniversal1 = cutlass::gemm::device::GemmUniversal<
-      typename DualGemm::ElementA, typename DualGemm::LayoutA,
-      typename DualGemm::ElementB, typename DualGemm::LayoutB1,
-      typename DualGemm::ElementC, typename DualGemm::LayoutC,
-      ElementAccumulator
-    >;
-    GemmUniversal1 reference_gemm1;
-    typename GemmUniversal1::Arguments args1 {
-      (batch_count > 1 ?
-        cutlass::gemm::GemmUniversalMode::kBatched :
-        cutlass::gemm::GemmUniversalMode::kGemm),
-      problem_size,
-      batch_count,
-      {alpha1, beta1},
-      tensor_A0.device_data(),
-      tensor_B1.device_data(),
-      tensor_Bias1.device_data(),
-      reference_D1.device_data(),
-      batch_stride_A,
-      batch_stride_B1,
-      batch_stride_Bias,
-      batch_stride_D,
-      tensor_A0.stride(0),
-      (broadcast_b1 ? 0 : tensor_B1.stride(0)),
-      0,  // zero stride for the bias vector
-      reference_D1.stride(0),
-    };
-    status = reference_gemm1.can_implement(args1);
-    CUTLASS_CHECK(status);
-    status = reference_gemm1(args1);
-    CUTLASS_CHECK(status);
-    if(relu) {
-       cutlass::reference::device::TensorReLu(reference_D0.device_view());
-       cutlass::reference::device::TensorReLu(reference_D1.device_view());
-    }
-    // Builds the D2 reference from the D0/D1 references; presumably this
-    // applies EpilogueOutputOp2 element-wise (helper is defined elsewhere).
-    TensorEpilogueForEach<EpilogueOutputOp2>(reference_D0.device_view(), reference_D1.device_view(), reference_D2.device_view());
-    cudaDeviceSynchronize();
-    reference_D0.sync_host();
-    reference_D1.sync_host();
-    reference_D2.sync_host();
-    // A zero norm would mean an output was never written, which would make
-    // the equality checks below pass vacuously.
-    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0);
-    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
-    CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D2.host_view()), 0);
-    CHECK_GT(cutlass::reference::host::TensorNorm(reference_D2.host_view()), 0);
-    // D0 and D1 are compared only when the operator was asked to store them.
-    bool passed_out0 = true;
-    if (DualGemm::kStoreD0) {
-      CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0);
-      passed_out0 = cutlass::reference::host::TensorEquals(
-        reference_D0.host_view(),
-        tensor_D0.host_view());
-    }
-    CHECK_TRUE(passed_out0);
-    bool passed_out1 = true;
-    if (DualGemm::kStoreD1) {
-      CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
-      passed_out1 = cutlass::reference::host::TensorEquals(
-        reference_D1.host_view(),
-        tensor_D1.host_view());
-    }
-    CHECK_TRUE(passed_out1);
-    bool passed_out2 = cutlass::reference::host::TensorEquals(
-      reference_D2.host_view(),
-      tensor_D2.host_view());
-    CHECK_TRUE(passed_out2);
-    bool passed = passed_out0 && passed_out1 && passed_out2;
-    if (!passed)
-    {
-      std::stringstream fname;
-      fname << "error_DualGemm_device_fused.txt";
-      std::cerr << "Dumping results in " << fname.str() << "\n";
-      std::ofstream file(fname.str());
-      file
-        << "A0 =\n" << tensor_A0.host_view()
-        << "\nB0 =\n" << tensor_B0.host_view()
-        << "\nC0 =\n" << tensor_C0.host_view()
-        << "\nBias0:\n" << tensor_Bias0.host_view() << "\n"
-        << "\nB1 =\n" << tensor_B1.host_view()
-        << "\nC1 =\n" << tensor_C1.host_view()
-        << "\nBias1:\n" << tensor_Bias1.host_view() << "\n"
-        << "\n\nReference0 =\n" << reference_D0.host_view()
-        << "\nComputed0 =\n" << tensor_D0.host_view()
-        << "\n\nReference1 =\n" << reference_D1.host_view()
-        << "\nComputed1 =\n" << tensor_D1.host_view()
-        << "\n\nReference2 =\n" << reference_D2.host_view()
-        << "\nComputed2 =\n" << tensor_D2.host_view();
-    }
-    return passed;
-  }
-};
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/kernel/dual_gemm.h DELETED Viewed

@@ -1,545 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
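-    \brief Template for a pipelined dual-GEMM kernel: two GEMMs that share operand A
-           (D0 = A @ B0 + C0, D1 = A @ B1 + C1). Params carry per-operand batch strides,
-           and the SplitKSerial template flag enables split-K via serial reduction.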
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/semaphore.h"
-#include "../threadblock/dual_mma_multistage.h"
-#include "../threadblock/dual_epilogue.h"
-#include "../dual_gemm_common.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace gemm {
-namespace kernel {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename DualMma_,               ///! Threadblock-scoped matrix multiply-accumulate
-  typename Epilogue0_,             ///! Epilogue
-  typename Epilogue1_,             ///! Epilogue
-  typename OutputOp2_,             ///! Epilogue
-  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
-  bool SplitKSerial,              ///! If true, code supporting split-K via serial reduction is enabled.
-  bool StoreD0,
-  bool StoreD1
->
-struct DualGemm {
-  using DualMma = DualMma_;
-  using Epilogue0 = Epilogue0_;
-  using Epilogue1 = Epilogue1_;
-  using OutputOp0 = typename Epilogue0::OutputOp;
-  using OutputOp1 = typename Epilogue1::OutputOp;
-  using OutputOp2 = OutputOp2_;
-  using ThreadblockSwizzle = ThreadblockSwizzle_;
-  static constexpr bool kStoreD0 = StoreD0;
-  static constexpr bool kStoreD1 = StoreD1;
-  using DualEpilogue = cutlass::epilogue::threadblock::DualEpilogue<
-      typename Epilogue0::Shape,
-      typename Epilogue0::WarpMmaOperator,
-      Epilogue0::kPartitionsK,
-      typename Epilogue0::OutputTileIterator,
-      typename Epilogue0::AccumulatorFragmentIterator,
-      typename Epilogue0::WarpTileIterator,
-      typename Epilogue0::SharedLoadIterator,
-      OutputOp0,
-      OutputOp1,
-      OutputOp2,
-      typename Epilogue0::Padding,
-      kStoreD0,
-      kStoreD1,
-      Epilogue0::kFragmentsPerIteration,
-      true // IterationsUnroll
-  >;
-  using ElementA = typename DualMma::IteratorA::Element;
-  using ElementB = typename DualMma::IteratorB0::Element;
-  using ElementC = typename DualEpilogue::OutputTileIterator::Element;
-  static bool const kSplitKSerial = SplitKSerial;
-  static_assert(!kSplitKSerial || (kStoreD0 && kStoreD1),
-    "Split-K serial requires buffers for D0/D1 for reduction");
-  /// Warp count (concept: GemmShape)
-  using WarpCount0 = typename DualMma::WarpCount;
-  static int const kThreadCount = 32 * WarpCount0::kCount;
-  /// Parameters structure
-  struct Params {
-    DualGemmMode mode;
-    cutlass::gemm::GemmCoord problem_size;
-    cutlass::gemm::GemmCoord grid_tiled_shape;
-    int swizzle_log_tile;
-    // Mma0
-    typename DualMma::IteratorA::Params params_A0;
-    typename DualMma::IteratorA::TensorRef ref_A0;
-    typename DualMma::IteratorB0::Params params_B0;
-    typename DualMma::IteratorB0::TensorRef ref_B0;
-    typename Epilogue0::OutputTileIterator::Params params_C0;
-    typename Epilogue0::OutputTileIterator::TensorRef ref_C0;
-    typename Epilogue0::OutputTileIterator::Params params_D0;
-    typename Epilogue0::OutputTileIterator::TensorRef ref_D0;
-    typename OutputOp0::Params output_op_0;
-    // Mma1
-    typename DualMma::IteratorB1::Params params_B1;
-    typename DualMma::IteratorB1::TensorRef ref_B1;
-    typename Epilogue1::OutputTileIterator::Params params_C1;
-    typename Epilogue1::OutputTileIterator::TensorRef ref_C1;
-    typename Epilogue1::OutputTileIterator::Params params_D1;
-    typename Epilogue1::OutputTileIterator::TensorRef ref_D1;
-    typename OutputOp1::Params output_op_1;
-    typename Epilogue1::OutputTileIterator::Params params_D2;
-    typename Epilogue1::OutputTileIterator::TensorRef ref_D2;
-    typename OutputOp2::Params output_op_2;
-    int *semaphore;
-    int gemm_k_size;
-    int64_t batch_stride_A;
-    int64_t batch_stride_B0;
-    int64_t batch_stride_B1;
-    int64_t batch_stride_C;
-    int64_t batch_stride_D;
-    //
-    // Methods
-    //
-    CUTLASS_HOST_DEVICE
-    Params(): swizzle_log_tile(0), semaphore(0), gemm_k_size(0) { }
-    CUTLASS_HOST_DEVICE  // ctor below: stores the operand refs, derives iterator params from their layouts, and computes the per-slice K extent
-    Params(
-      DualGemmMode mode,  // compared against kGemm / kBatched by the kernel's operator()
-      cutlass::gemm::GemmCoord const & problem_size,  // GEMM extents; m(), n() and k() are read by the kernel
-      cutlass::gemm::GemmCoord const & grid_tiled_shape,  // threadblock tile counts; k() divides the K iterations in the body below
-      // Mma0: D0 = A @ B0 + C0
-      typename DualMma::IteratorA::TensorRef ref_A0,
-      typename DualMma::IteratorB0::TensorRef ref_B0,
-      typename Epilogue0::OutputTileIterator::TensorRef ref_C0,
-      typename Epilogue0::OutputTileIterator::TensorRef ref_D0,
-      // Mma1: D1 = A @ B1 + C1 (A is shared with Mma0; there is no separate ref_A1)
-      typename DualMma::IteratorB1::TensorRef ref_B1,
-      typename Epilogue1::OutputTileIterator::TensorRef ref_C1,
-      typename Epilogue1::OutputTileIterator::TensorRef ref_D1,
-      typename Epilogue1::OutputTileIterator::TensorRef ref_D2,  // third output tensor; handed to the epilogue as its third destination
-      typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(),
-      typename OutputOp1::Params output_op_1 = typename OutputOp1::Params(),
-      typename OutputOp2::Params output_op_2 = typename OutputOp2::Params(),
-      int *workspace = nullptr,  // stored as `semaphore`; the kernel indexes it per output tile for serial split-K
-      int64_t batch_stride_A = 1,  // batch strides: the kernel multiplies them by the k tile index in kBatched mode
-      int64_t batch_stride_B0 = 1,
-      int64_t batch_stride_B1 = 1,
-      int64_t batch_stride_C = 1,  // applied to both C0 and C1 by the kernel
-      int64_t batch_stride_D = 1   // applied to D0, D1 and D2 by the kernel
-    ):
-      mode(mode),
-      problem_size(problem_size),
-      grid_tiled_shape(grid_tiled_shape),
-      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
-      // Mma0: iterator params are built from each tensor ref's layout
-      params_A0(ref_A0.layout()),
-      ref_A0(ref_A0),
-      params_B0(ref_B0.layout()),
-      ref_B0(ref_B0),
-      params_C0(ref_C0.layout()),
-      ref_C0(ref_C0),
-      params_D0(ref_D0.layout()),
-      ref_D0(ref_D0),
-      // Mma1
-      params_B1(ref_B1.layout()),
-      ref_B1(ref_B1),
-      params_C1(ref_C1.layout()),
-      ref_C1(ref_C1),
-      params_D1(ref_D1.layout()),
-      ref_D1(ref_D1),
-      params_D2(ref_D2.layout()),
-      ref_D2(ref_D2),
-      output_op_0(output_op_0),
-      output_op_1(output_op_1),
-      output_op_2(output_op_2),
-      batch_stride_A(batch_stride_A),
-      batch_stride_B0(batch_stride_B0),
-      batch_stride_B1(batch_stride_B1),
-      batch_stride_C(batch_stride_C),
-      batch_stride_D(batch_stride_D) {
-      int total_gemm_k_iterations = (problem_size.k() + DualMma::Shape::kK - 1) / DualMma::Shape::kK;  // ceil(K / tile K)
-      int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();  // ceil(iterations / k tiles)
-      gemm_k_size = gemm_k_iterations * DualMma::Shape::kK;  // K extent covered by one k tile, in elements
-      semaphore = workspace;
-    }
-  };
-  /// Shared memory storage structure: a union, so the main loop and the epilogue occupy the same bytes
-  union SharedStorage {
-    typename DualMma::SharedStorage main_loop;  // handed to DualMma in operator()
-    typename DualEpilogue::SharedStorage epilogue;  // handed to DualEpilogue in operator()
-  };
-  //
-  // Methods
-  //
-  CUTLASS_HOST_DEVICE
-  DualGemm() { }  // default constructor; the body is empty
-  /// Reports whether every operand tensor reference meets the alignment implied by
-  /// the vector access width of the iterator that will touch it.
-  ///
-  /// Operands are checked in parameter order (A0, B0, C0, D0, B1, C1, D1, D2) and the
-  /// check stops at the first misaligned one. Returns Status::kErrorMisalignedOperand
-  /// in that case and Status::kSuccess otherwise. `problem_size` is not inspected.
-    static Status can_implement(
-      cutlass::gemm::GemmCoord const & problem_size,
-      typename DualMma::IteratorA::TensorRef ref_A0,
-      typename DualMma::IteratorB0::TensorRef ref_B0,
-      typename Epilogue0::OutputTileIterator::TensorRef ref_C0,
-      typename Epilogue0::OutputTileIterator::TensorRef ref_D0,
-      typename DualMma::IteratorB1::TensorRef ref_B1,
-      typename Epilogue1::OutputTileIterator::TensorRef ref_C1,
-      typename Epilogue1::OutputTileIterator::TensorRef ref_D1,
-      typename Epilogue1::OutputTileIterator::TensorRef ref_D2) {
-    // Elements per vectorized access. B1 is held to IteratorB0's width and all
-    // C/D tensors (including Epilogue1's) are held to Epilogue0's width.
-    constexpr int kAlignmentA = DualMma::IteratorA::AccessType::kElements;
-    constexpr int kAlignmentB = DualMma::IteratorB0::AccessType::kElements;
-    constexpr int kAlignmentC = Epilogue0::OutputTileIterator::kElementsPerAccess;
-    // Short-circuit evaluation keeps the original check order.
-    bool const all_aligned =
-        TensorRef_aligned(ref_A0, kAlignmentA) &&
-        TensorRef_aligned(ref_B0, kAlignmentB) &&
-        TensorRef_aligned(ref_C0, kAlignmentC) &&
-        TensorRef_aligned(ref_D0, kAlignmentC) &&
-        TensorRef_aligned(ref_B1, kAlignmentB) &&
-        TensorRef_aligned(ref_C1, kAlignmentC) &&
-        TensorRef_aligned(ref_D1, kAlignmentC) &&
-        TensorRef_aligned(ref_D2, kAlignmentC);
-    return all_aligned ? Status::kSuccess : Status::kErrorMisalignedOperand;
-  }
-  /// Executes one threadblock's share of the dual GEMM: a main loop producing accum0/accum1, then the fused three-output epilogue
-  CUTLASS_DEVICE
-  void operator()(Params const &params, SharedStorage &shared_storage) {
-    // Compute threadblock location
-    ThreadblockSwizzle threadblock_swizzle;
-    cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-    // Early exit if CTA is out of range
-    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
-      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
-      return;
-    }
-    int offset_k = 0;  // first K index handled by this CTA; stays 0 unless mode is kGemm
-    int problem_size_k = params.problem_size.k();  // exclusive K upper bound for this CTA
-    ElementA *ptr_A0 = static_cast<ElementA *>(params.ref_A0.data());
-    ElementB *ptr_B0 = static_cast<ElementB *>(params.ref_B0.data());
-    ElementB *ptr_B1 = static_cast<ElementB *>(params.ref_B1.data());
-    //
-    // Fetch pointers based on mode: kGemm restricts K to this CTA's slice, kBatched offsets the operand pointers by the k tile index.
-    //
-    if (params.mode == DualGemmMode::kGemm) {
-      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {  // every slice but the last ends on a slice boundary
-        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
-      }
-      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
-    }
-    else if (params.mode == DualGemmMode::kBatched) {
-      ptr_A0 += threadblock_tile_offset.k() * params.batch_stride_A;
-      ptr_B0 += threadblock_tile_offset.k() * params.batch_stride_B0;
-      ptr_B1 += threadblock_tile_offset.k() * params.batch_stride_B1;
-    }
-    // Compute initial location in logical coordinates
-    cutlass::MatrixCoord tb_offset_A0{
-      threadblock_tile_offset.m() * DualMma::Shape::kM,
-      offset_k,
-    };
-    cutlass::MatrixCoord tb_offset_B0{
-      offset_k,
-      threadblock_tile_offset.n() * DualMma::Shape::kN
-    };
-    cutlass::MatrixCoord tb_offset_B1{
-      offset_k,
-      threadblock_tile_offset.n() * DualMma::Shape::kN
-    };
-    // Compute position within threadblock
-    int thread_idx = threadIdx.x;
-    // Construct iterators to A and B operands (one shared A iterator, one B iterator per GEMM)
-    typename DualMma::IteratorA iterator_A0(
-      params.params_A0,
-      ptr_A0,
-      {params.problem_size.m(), problem_size_k},
-      thread_idx,
-      tb_offset_A0);
-    typename DualMma::IteratorB0 iterator_B0(
-      params.params_B0,
-      ptr_B0,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B0);
-    typename DualMma::IteratorB1 iterator_B1(
-      params.params_B1,
-      ptr_B1,
-      {problem_size_k, params.problem_size.n()},
-      thread_idx,
-      tb_offset_B1);
-    // Broadcast the warp_id computed by lane 0 to ensure dependent code
-    // is compiled as warp-uniform.
-    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
-    int lane_idx = threadIdx.x % 32;
-    //
-    // Main loop
-    //
-    // Construct thread-scoped matrix multiply
-    typename DualMma::FragmentC accum0;
-    typename DualMma::FragmentC accum1;
-    accum0.clear();
-    accum1.clear();
-    // Compute threadblock-scoped matrix multiply-add
-    int gemm_k_iterations = (problem_size_k - offset_k + DualMma::Shape::kK - 1) / DualMma::Shape::kK;  // ceil(this CTA's K extent / tile K)
-    DualMma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
-    if (!kSplitKSerial || gemm_k_iterations > 0) {  // with serial split-K, a slice with no K iterations skips the main loop
-      // Compute threadblock-scoped matrix multiply-add
-      mma(gemm_k_iterations,
-        accum0, accum1,
-        iterator_A0, iterator_B0, iterator_B1,
-        accum0, accum1);  // NOTE(review): accumulators are passed twice, presumably as destination and as initial source - confirm against DualMma
-    }
-    //
-    // Epilogue
-    //
-    OutputOp0 output_op_0(params.output_op_0);
-    OutputOp1 output_op_1(params.output_op_1);
-    OutputOp2 output_op_2(params.output_op_2);
-    //
-    // Masked tile iterators constructed from members
-    //
-    threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
-    //assume identity swizzle
-    MatrixCoord threadblock_offset(
-      threadblock_tile_offset.m() * DualMma::Shape::kM,
-      threadblock_tile_offset.n() * DualMma::Shape::kN
-    );
-    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();  // linear (m, n) tile index; selects this tile's semaphore slot
-    ElementC *ptr_C0 = static_cast<ElementC *>(params.ref_C0.data());
-    ElementC *ptr_C1 = static_cast<ElementC *>(params.ref_C1.data());
-    ElementC *ptr_D0 = static_cast<ElementC *>(params.ref_D0.data());
-    ElementC *ptr_D1 = static_cast<ElementC *>(params.ref_D1.data());
-    ElementC *ptr_D2 = static_cast<ElementC *>(params.ref_D2.data());
-    // Construct the semaphore.
-    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
-    if (params.mode == DualGemmMode::kGemm) {
-      // If performing a reduction via split-K, fetch the initial synchronization
-      if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-        // Fetch the synchronization lock initially but do not block.
-        semaphore.fetch();
-        // Indicate which position in a serial reduction the output operator is currently updating
-        // (only output_op_0 and output_op_1 are told; output_op_2 is not)
-        output_op_0.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-        output_op_1.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
-      }
-    }
-    else if (params.mode == DualGemmMode::kBatched) {
-      ptr_C0 += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_C1 += threadblock_tile_offset.k() * params.batch_stride_C;
-      ptr_D0 += threadblock_tile_offset.k() * params.batch_stride_D;
-      ptr_D1 += threadblock_tile_offset.k() * params.batch_stride_D;
-      ptr_D2 += threadblock_tile_offset.k() * params.batch_stride_D;
-    }
-    // Tile iterator loading from source tensor.
-    typename Epilogue0::OutputTileIterator iterator_C0(
-      params.params_C0,
-      ptr_C0,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-    typename Epilogue1::OutputTileIterator iterator_C1(
-      params.params_C1,
-      ptr_C1,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-    // Tile iterator writing to destination tensor.
-    typename Epilogue0::OutputTileIterator iterator_D0(
-      params.params_D0,
-      ptr_D0,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-    typename Epilogue1::OutputTileIterator iterator_D1(
-      params.params_D1,
-      ptr_D1,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-    typename Epilogue1::OutputTileIterator iterator_D2(
-      params.params_D2,
-      ptr_D2,
-      params.problem_size.mn(),
-      thread_idx,
-      threadblock_offset
-    );
-    DualEpilogue epilogue(
-      shared_storage.epilogue,
-      thread_idx,
-      warp_idx,
-      lane_idx);
-    // Wait on the semaphore - this latency may have been covered by iterator construction
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
-      if (threadblock_tile_offset.k()) {
-        iterator_C0 = iterator_D0;
-        iterator_C1 = iterator_D1;
-      }
-      semaphore.wait(threadblock_tile_offset.k());  // waits on this slice's own index, the value the previous slice passes to release() below
-      __threadfence();
-    }
-    // Execute the epilogue operator to update the destination tensor.
-    typename Epilogue0::OutputTileIterator source_iters[] = {
-      iterator_C0, iterator_C1
-    };
-    const bool writeToD2 = (!kSplitKSerial || params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1);  // with serial split-K only the last k slice stores D2
-    epilogue(
-      output_op_0, output_op_1, output_op_2,
-      iterator_D0, iterator_D1, iterator_D2,
-      accum0, accum1,
-      source_iters,
-      writeToD2
-    );
-    //
-    // Release the semaphore
-    //
-    if (kSplitKSerial && params.grid_tiled_shape.k() > 1) {
-      int lock = 0;
-      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
-        // The final threadblock resets the semaphore for subsequent grids.
-        lock = 0;
-      }
-      else {
-        // Otherwise, the semaphore is incremented
-        lock = threadblock_tile_offset.k() + 1;
-      }
-      __threadfence();
-      semaphore.release(lock);
-    }
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace kernel
-} // namespace gemm
-} // namespace cutlass

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/test_run.h DELETED Viewed

@@ -1,95 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#include <iostream>
-// Runs every test in `test_funcs` when both the CUDA toolkit this file is compiled
-// with and device 0 support SM`arch`.
-//
-// Returns 0 when all tests pass, and also when the example is unsupported (the run
-// is then a no-op pass). Returns -1 when any test fails or the device query fails.
-// Every test is executed even after an earlier one has failed.
-//
-// NOTE(review): this header includes only <iostream>; std::vector, std::string and
-// the CUDA runtime API have to be provided by the including file.
-int testRun(int arch, std::vector<bool (*)()> & test_funcs, const std::string & test_name) {
-  // `arch` is split into tens (major) and units (minor), e.g. 80 -> 8 and 0.
-  int const required_major = arch / 10;
-  int const required_minor = arch % 10;
-  // Toolkit gate: targets with major >= 8 need CUDA 11.0 or newer, targets with
-  // major 7 need CUDA 10.2 or newer, and anything older is never supported.
-  bool toolkit_ok = false;
-  if (required_major >= 8) {
-    toolkit_ok = (__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0));
-  }
-  else if (required_major >= 7) {
-    toolkit_ok = (__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2));
-  }
-  // Device gate: device 0 must have at least the requested compute capability.
-  cudaDeviceProp props;
-  cudaError_t const status = cudaGetDeviceProperties(&props, 0);
-  if (status != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(status) << std::endl;
-    return -1;
-  }
-  bool const device_too_old =
-      props.major < required_major ||
-      (props.major == required_major && props.minor < required_minor);
-  if (!toolkit_ok || device_too_old) {
-    // Report success so the test passes on older toolkits and devices; nothing is run.
-    std::cout << "This example isn't supported on current architecture" << std::endl;
-    return 0;
-  }
-  std::cout << "Device: " << props.name << std::endl;
-  std::cout << "Arch: SM" << arch << std::endl;
-  std::cout << "Test: " << test_name << std::endl;
-  // Accumulate with &= (not &&) so that every test still runs after a failure.
-  bool all_passed = true;
-  for (bool (*test)() : test_funcs) {
-    all_passed &= test();
-  }
-  return all_passed ? 0 : -1;
-}

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/thread/left_silu_and_mul.h DELETED Viewed

@@ -1,150 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue functor computing D = SiLU(lhs) * rhs.
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/array.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/scale_type.h"
-#include "cutlass/epilogue/thread/linear_combination_params.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace epilogue {
-namespace thread {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Applies SiLU to the left operand and multiplies the result by the right operand.
-///
-/// D = SiLU(lhs) * rhs
-///
-/// Both operands are converted to ElementCompute first; the product is converted to
-/// ElementOutput on return.
-template <
-  typename ElementOutput_,                             ///< Data type of the returned elements
-  int Count,                                           ///< Number of elements processed per fragment call
-  typename ElementAccumulator_ = ElementOutput_,       ///< Data type of the two incoming operands
-  typename ElementCompute_ = ElementOutput_,           ///< Data type in which SiLU and the product are evaluated
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest  ///< Rounding mode given to the array converters
->
-class LeftSiLUAndMul {
-public:
-  using ElementOutput = ElementOutput_;
-  using ElementAccumulator = ElementAccumulator_;
-  using ElementCompute = ElementCompute_;
-  static int const kCount = Count;
-  static FloatRoundStyle const kRound = Round;
-  using FragmentOutput = Array<ElementOutput, kCount>;
-  using FragmentAccumulator = Array<ElementAccumulator, kCount>;
-  using ComputeFragment = Array<ElementCompute, kCount>;
-  /// The functor takes no runtime parameters.
-  struct Params {};
-private:
-  //
-  // Data members
-  //
-  // Neither member is initialized by the constructor or read by any method below.
-  ElementCompute alpha_;
-  ElementCompute beta_;
-public:
-  /// Constructs the functor; the parameter object is ignored.
-  CUTLASS_HOST_DEVICE
-  LeftSiLUAndMul(Params const &/*params*/) {}
-  /// Always reports that a source operand is needed.
-  CUTLASS_HOST_DEVICE
-  bool is_source_needed() const {
-    return true;
-  }
-  /// Present so the functor satisfies the serial-reduction interface; any call
-  /// trips assert(false).
-  CUTLASS_HOST_DEVICE
-  void set_k_partition(int k_partition, int k_partition_count) {
-    assert(false);
-  }
-  /// Fragment form: returns SiLU(lhs) * rhs for kCount elements.
-  CUTLASS_HOST_DEVICE
-  FragmentOutput operator()(
-    FragmentAccumulator const &lhs,
-    FragmentAccumulator const &rhs) const {
-    // Converters into and out of the internal compute type
-    NumericArrayConverter<ElementCompute, ElementAccumulator, kCount, Round> to_compute;
-    NumericArrayConverter<ElementOutput, ElementCompute, kCount, Round> to_output;
-    cutlass::epilogue::thread::SiLu<ComputeFragment> activation;
-    cutlass::multiplies<ComputeFragment> product;
-    ComputeFragment const gate = to_compute(lhs);
-    ComputeFragment const value = to_compute(rhs);
-    return to_output(product(activation(gate), value));
-  }
-  /// Scalar form: returns SiLU(lhs) * rhs for a single element.
-  CUTLASS_HOST_DEVICE
-  ElementOutput operator()(
-      ElementAccumulator const& lhs,
-      ElementAccumulator const& rhs
-  ) const {
-      cutlass::epilogue::thread::SiLu<ElementCompute> activation;
-      cutlass::multiplies<ElementCompute> product;
-      ElementCompute const gate(lhs);
-      ElementCompute const value(rhs);
-      return ElementOutput(product(activation(gate), value));
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/threadblock/dual_epilogue.h DELETED Viewed

@@ -1,424 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
-  The epilogue rearranges the result of a matrix product through shared memory to match canonical
-  tensor layouts in global memory. Epilogues support conversion and reduction operations.
-*/
-#pragma once
-#include "cutlass/array.h"
-#include CUDA_STD_HEADER(cassert)
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/functional.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/transform/pitch_linear_thread_map.h"
-#include "cutlass/transform/threadblock/regular_tile_iterator.h"
-#include "cutlass/epilogue/threadblock/epilogue_base.h"
-#include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/numeric_types.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace epilogue {
-namespace threadblock {
-////////////////////////////////////////////////////////////////////////////////
-/// Epilogue operator
-template <
-  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
-  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
-  int PartitionsK,                          ///< Number of partitions of the K dimension
-  typename OutputTileIterator_,             ///< Tile iterator reading and writing output tensors
-  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
-  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
-  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
-  ///< Output operator
-  typename OutputOp0_,
-  typename OutputOp1_,
-  typename OutputOp2_,
-  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
-  bool StoreD0 = true,
-  bool StoreD1 = true,
-  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
-  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
-    (!IsEpilogueFunctorHeavy<OutputOp0_>::value)
->
-class DualEpilogue {
-public:
-  using Base = EpilogueBase<
-    Shape_,
-    typename WarpMmaOperator_::Shape,
-    PartitionsK,
-    AccumulatorFragmentIterator_,
-    WarpTileIterator_,
-    Padding_,
-    FragmentsPerPartition>;
-  using Shape = Shape_;
-  using WarpMmaOperator = WarpMmaOperator_;
-  static int const kPartitionsK = PartitionsK;
-  static bool constexpr kStoreD0 = StoreD0;
-  static bool constexpr kStoreD1 = StoreD1;
-  using OutputTileIterator = OutputTileIterator_;
-  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
-  using WarpTileIterator = WarpTileIterator_;
-  using SharedLoadIterator = SharedLoadIterator_;
-  using OutputOp0 = OutputOp0_;
-  using OutputOp1 = OutputOp1_;
-  using OutputOp2 = OutputOp2_;
-  using Padding = Padding_;
-  using Layout = layout::RowMajor;
-  using LongIndex = typename Layout::LongIndex;
-  /// The complete warp-level accumulator tile
-  using AccumulatorTile = typename Base::AccumulatorTile;
-  /// Accumulator element
-  using ElementAccumulator = typename WarpTileIterator::Element;
-  /// Output element
-  using ElementOutput = typename OutputTileIterator::Element;
-  /// Output access size
-  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
-  /// Tensor reference to destination tensor
-  using TensorRef = typename OutputTileIterator::TensorRef;
-  /// Tensor reference to sync tensor
-  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
-  /// Const tensor reference to source tensor
-  using ConstTensorRef = typename OutputTileIterator::ConstTensorRef;
-  /// Array type used to output
-  using OutputAccessType = Array<
-    typename OutputTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-  /// Array type used by output functor
-  using AccumulatorAccessType = Array<typename WarpTileIterator::Element, OutputTileIterator::kElementsPerAccess>;
-  /// Number of warps
-  using WarpCount = typename Base::WarpCount;
-  struct SharedStorage {
-    using Element = typename WarpTileIterator::Element;
-    /// Tensor reference to shared memory allocation
-    using TensorRef = typename WarpTileIterator::TensorRef;
-    /// Logical shape of the shared memory tile written to by all warps.
-    using Shape = typename Base::Shape;
-    /// Shape of the shared memory allocation for the epilogue
-    using StorageShape = typename Base::SharedStorage::StorageShape;
-    //
-    // Data members
-    //
-    AlignedBuffer<Element, StorageShape::kCount> storage[2];
-    //
-    // Methods
-    //
-    /// Returns a tensor reference to the shared memory buffer
-    CUTLASS_DEVICE
-    TensorRef reference(int i) {
-      return TensorRef(
-        storage[i].data(),
-        Layout::packed({StorageShape::kRow, StorageShape::kColumn}));
-    }
-  };
-  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
-  static int constexpr kSmemPointerOffset = SharedStorage::StorageShape::kCount / kSmemTiles;
-public:
-  static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
-    "Mismatch between shared load iterator and output tile iterator.");
-  static_assert(OutputTileIterator::kElementsPerAccess, "OutputTileIterator::kElementsPerAccess must not be zero.");
-  static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess),
-    "Divisibility");
-private:
-  /// Loads fragment from shared memory aligned with output tensor
-  SharedLoadIterator shared_load_iterator0_;
-  SharedLoadIterator shared_load_iterator1_;
-  /// Stores a warp's fragment of accumulators to SMEM
-  WarpTileIterator warp_tile_iterator0_;
-  WarpTileIterator warp_tile_iterator1_;
-public:
-  /// Constructor
-  CUTLASS_DEVICE
-  DualEpilogue(
-    SharedStorage &shared_storage,    ///< Shared storage object
-    int thread_idx,                   ///< ID of a thread within the threadblock
-    int warp_idx,                     ///< ID of warp within threadblock
-    int lane_idx                     ///< Id of thread within warp
-  ):
-    shared_load_iterator0_(shared_storage.reference(0), thread_idx),
-    shared_load_iterator1_(shared_storage.reference(1), thread_idx),
-    warp_tile_iterator0_(shared_storage.reference(0), lane_idx),
-    warp_tile_iterator1_(shared_storage.reference(1), lane_idx)
-  {
-    int warp_k = warp_idx / (WarpCount::kM * WarpCount::kN);
-    int warp_mn = warp_idx % (WarpCount::kM * WarpCount::kN);
-    int warp_m = warp_mn % WarpCount::kM;
-    int warp_n = warp_mn / WarpCount::kM;
-    MatrixCoord warp_offset{warp_k * WarpCount::kM + warp_m, warp_n};
-    warp_tile_iterator0_.add_tile_offset(warp_offset);
-    warp_tile_iterator1_.add_tile_offset(warp_offset);
-  }
-  /// Streams the results to global memory: both accumulator tiles are staged through shared memory, passed to the output operators, and stored to dest0/dest1/dest2
-  CUTLASS_DEVICE
-  void operator()(
-    OutputOp0 const &output_op0,
-    OutputOp1 const &output_op1,
-    OutputOp2 const &output_op2,
-    OutputTileIterator dest0,
-    OutputTileIterator dest1,
-    OutputTileIterator dest2,
-    AccumulatorTile const &accumulator0,
-    AccumulatorTile const &accumulator1,
-    OutputTileIterator source_iterator[2],
-    bool writeToD2 // true if it's the final split-k
-  ) {
-    // TODO: Implement when no source is needed
-    typename OutputTileIterator::Fragment source_fragment[2];
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < 2; ++i) {
-      source_fragment[i].clear();
-    }
-    //
-    // One accumulator-fragment iterator per input tile: [0] walks accumulator0, [1] walks accumulator1
-    //
-    AccumulatorFragmentIterator accum_fragment_iterator[2] = {accumulator0, accumulator1};
-    //
-    // Iterate over accumulator tile (fully unrolled only when IterationsUnroll is set, otherwise unroll factor 1)
-    //
-    #pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1)
-    for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) {
-      //
-      // Load both source fragments for this iteration and advance the source iterators
-      //
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < 2; ++i) {
-        source_iterator[i].load(source_fragment[i]);
-        ++source_iterator[i];
-      }
-      //
-      // Convert and store fragment: the iter-th fragment of each accumulator is written out through its warp tile
-      // iterator; the barriers before and after presumably keep these stores apart from the shared loads (TODO confirm)
-      __syncthreads();
-      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
-          iter, accum_fragment_iterator[0], this->warp_tile_iterator0_);
-      acc2smem_source_needed<cutlass::make_index_sequence<OutputTileIterator::kIterations>>::push(
-          iter, accum_fragment_iterator[1], this->warp_tile_iterator1_);
-      __syncthreads();
-      //
-      // Load fragments from shared memory (element [0] of each array ends up holding the value used below)
-      //
-      typename SharedLoadIterator::Fragment aligned_accum_fragment0[kPartitionsK];
-      typename SharedLoadIterator::Fragment aligned_accum_fragment1[kPartitionsK];
-      shared_load_iterator0_.load(aligned_accum_fragment0[0]);
-      shared_load_iterator1_.load(aligned_accum_fragment1[0]);
-      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices, then rewind both load iterators
-      if (kPartitionsK > 1) {
-        plus <typename SharedLoadIterator::Fragment> add_fragments;
-        CUTLASS_PRAGMA_UNROLL
-        for ( int i = 1; i < kPartitionsK; ++i) {
-          shared_load_iterator0_.add_pointer_offset(kSmemPointerOffset);
-          shared_load_iterator1_.add_pointer_offset(kSmemPointerOffset);
-          shared_load_iterator0_.load(aligned_accum_fragment0[i]);
-          shared_load_iterator1_.load(aligned_accum_fragment1[i]);
-          aligned_accum_fragment0[0] = add_fragments(aligned_accum_fragment0[0], aligned_accum_fragment0[i]);
-          aligned_accum_fragment1[0] = add_fragments(aligned_accum_fragment1[0], aligned_accum_fragment1[i]);
-        }
-        shared_load_iterator0_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-        shared_load_iterator1_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
-      }
-      //
-      // Compute the output result: fills all three output fragments from the two reduced accumulators and the sources
-      //
-      typename OutputTileIterator::Fragment output_fragment[3];
-      apply_output_operator_(output_fragment,
-        output_op0, output_op1, output_op2,
-        aligned_accum_fragment0[0], aligned_accum_fragment1[0],
-        source_fragment);
-      //
-      // Store the final result: D0 and D1 are gated by kStoreD0 / kStoreD1, D2 by the runtime flag writeToD2
-      //
-      if (kStoreD0) {
-        dest0.store(output_fragment[0]);
-        ++dest0;
-      }
-      if (kStoreD1) {
-        dest1.store(output_fragment[1]);
-        ++dest1;
-      }
-      if (writeToD2) {
-        dest2.store(output_fragment[2]);
-        ++dest2;
-      }
-    }
-  }
-private:
-  static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1.");
-  template<class Seq>
-  struct acc2smem_source_needed;  // primary template is declared only; the index_sequence specialization below provides the body
-  template <size_t... Seq>
-  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
-    template<int Advance>
-    CUTLASS_DEVICE
-    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,  // taken by value: the caller's iterator is not advanced
-                       WarpTileIterator &warp_tile_iterator) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < Advance; i++) {  // step the local copy forward Advance times
-        ++accum_fragment_iterator;
-      }
-      typename AccumulatorFragmentIterator::Fragment accum_fragment;
-      accum_fragment_iterator.load(accum_fragment);
-      warp_tile_iterator.store(accum_fragment);  // write the selected fragment out through the warp tile iterator
-    }
-    CUTLASS_DEVICE
-    static void push(size_t pos,  // runtime index of the fragment to store
-                     AccumulatorFragmentIterator const &iterator_begin,
-                     WarpTileIterator &warp_tile_iterator) {
-      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};  // pack expansion: short-circuit && invokes helper<Seq> only for the Seq equal to pos; the array itself is never read
-    }
-  };
-  /// Helper to invoke the output functors over each vector of output: op0 and op1 each take (accumulator, source), op2 takes the two results
-  CUTLASS_DEVICE
-  void apply_output_operator_(
-    typename OutputTileIterator::Fragment (&output_fragment)[3],
-    OutputOp0 const &output_op0,
-    OutputOp1 const &output_op1,
-    OutputOp2 const &output_op2,
-    typename SharedLoadIterator::Fragment const& aligned_accum_fragment0,
-    typename SharedLoadIterator::Fragment const& aligned_accum_fragment1,
-    typename OutputTileIterator::Fragment const (&source_fragment)[2]) {
-    OutputAccessType* output_frag_ptr[3] = {  // the three output fragments viewed as arrays of OutputAccessType
-      reinterpret_cast<OutputAccessType *>(&output_fragment[0]),
-      reinterpret_cast<OutputAccessType *>(&output_fragment[1]),
-      reinterpret_cast<OutputAccessType *>(&output_fragment[2])
-    };
-    AccumulatorAccessType const *compute_frag_ptr[2] = {  // the two accumulator fragments viewed as arrays of AccumulatorAccessType
-      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment0),
-      reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment1)
-    };
-    OutputAccessType const *source_frag_ptr[2] = {  // the two source fragments viewed as arrays of OutputAccessType
-      reinterpret_cast<OutputAccessType const *>(&source_fragment[0]),
-      reinterpret_cast<OutputAccessType const *>(&source_fragment[1])
-    };
-    int const kOutputOpIterations =
-      OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess;  // fragment element count divided by elements per access
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kOutputOpIterations; ++i) {
-      // Call the output operators; op2 consumes the values op0 and op1 have just written for the same index
-      output_frag_ptr[0][i] = output_op0(compute_frag_ptr[0][i], source_frag_ptr[0][i]);
-      output_frag_ptr[1][i] = output_op1(compute_frag_ptr[1][i], source_frag_ptr[1][i]);
-      output_frag_ptr[2][i] = output_op2(output_frag_ptr[0][i], output_frag_ptr[1][i]);
-    }
-  }
-};
-////////////////////////////////////////////////////////////////////////////////
-} // namespace threadblock
-} // namespace epilogue
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/threadblock/dual_mma_base.h DELETED Viewed

@@ -1,232 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Base template for threadblock-scoped dual GEMM kernels (one A operand, two B operands).
-*/
-#pragma once
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-////////////////////////////////////////////////////////////////////////////////
-/// Base class for threadblock-scoped dual GEMMs: defines the shared-memory storage for one A operand and
-/// two B operands (B0, B1) and holds the warp-level iterators over that storage.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy0_,
-    /// B1-specific version of the policy (concept: MmaPolicy)
-    typename Policy1_,
-    /// Number of stages,
-    int Stages,
-    /// Used for partial specialization
-    typename Enable = bool>
-class DualMmaBase {
- public:
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Policies describing tuning details (Policy0 for A/B0, Policy1 for B1)
-  using Policy0 = Policy0_;
-  using Policy1 = Policy1_;
-  //
-  // Dependent types
-  //
-  /// Warp-level Mma operators, one per policy
-  using Operator0 = typename Policy0::Operator;
-  using Operator1 = typename Policy1::Operator;
-  /// Shape describing the overall GEMM computed from shared memory
-  /// by each warp (taken from Policy0's operator).
-  using WarpGemm = typename Policy0::Operator::Shape;
-  /// Shape describing the number of warps filling the CTA
-  using WarpCount = GemmShape<Shape::kM / WarpGemm::kM,
-                              Shape::kN / WarpGemm::kN,
-                              Shape::kK / WarpGemm::kK>;
-  /// Number of warp-level GEMM operations (computed from Operator0's MmaShape)
-  static int const kWarpGemmIterations =
-      (WarpGemm::kK / Operator0::Policy::MmaShape::kK);
-  /// Number of stages
-  static int const kStages = Stages;
-  /// Tensor reference to the A operand
-  using TensorRefA = TensorRef<typename Operator0::ElementA, typename Operator0::LayoutA>;
-  /// Tensor references to the B0 and B1 operands
-  using TensorRefB0 = TensorRef<typename Operator0::ElementB, typename Operator0::LayoutB>;
-  using TensorRefB1 = TensorRef<typename Operator1::ElementB, typename Operator1::LayoutB>;
-  static_assert(kWarpGemmIterations > 1,
-                "The pipelined structure requires at least two warp-level "
-                "GEMM operations.");
-  static_assert((kWarpGemmIterations % 2) == 0,
-                "Inner loop iteration must be an even number.");
-  //
-  // Nested structs
-  //
-  /// Shared storage object needed by threadblock-scoped GEMM
-  class SharedStorage {
-   public:
-    //
-    // Type definitions
-    //
-    /// Shape of the A matrix operand in shared memory (kStages tiles along K plus Policy0's padding)
-    using ShapeA = MatrixShape<Shape::kM + Policy0::SmemPaddingA::kRow,
-                               Shape::kK * kStages +
-                                   Policy0::SmemPaddingA::kColumn>;
-    /// Shapes of the B0 and B1 matrix operands in shared memory (each uses its own policy's padding)
-    using ShapeB0 =
-        MatrixShape<Shape::kK * kStages + Policy0::SmemPaddingB::kRow,
-                    Shape::kN + Policy0::SmemPaddingB::kColumn>;
-    using ShapeB1 =
-        MatrixShape<Shape::kK * kStages + Policy1::SmemPaddingB::kRow,
-                    Shape::kN + Policy1::SmemPaddingB::kColumn>;
-   public:
-    //
-    // Data members
-    //
-    /// Buffer for A operand
-    AlignedBuffer<typename Operator0::ElementA, ShapeA::kCount> operand_A;
-    /// Buffers for the B0 and B1 operands
-    AlignedBuffer<typename Operator0::ElementB, ShapeB0::kCount> operand_B0;
-    AlignedBuffer<typename Operator1::ElementB, ShapeB1::kCount> operand_B1;
-   public:
-    //
-    // Methods
-    //
-    /// Returns a layout object for the A matrix. NOTE(review): this is CUTLASS_DEVICE while the B layouts and operand_A_ref() are CUTLASS_HOST_DEVICE - confirm intent
-    CUTLASS_DEVICE
-    static typename Operator0::LayoutA LayoutA() {
-      return Operator0::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
-    }
-    /// Returns a layout object for the B0 matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator0::LayoutB LayoutB0() {
-      return Operator0::LayoutB::packed({ShapeB0::kRow, ShapeB0::kColumn});
-    }
-    /// Returns a layout object for the B1 matrix
-    CUTLASS_HOST_DEVICE
-    static typename Operator1::LayoutB LayoutB1() {
-      return Operator1::LayoutB::packed({ShapeB1::kRow, ShapeB1::kColumn});
-    }
-    /// Returns a TensorRef to the A operand
-    CUTLASS_HOST_DEVICE
-    TensorRefA operand_A_ref() {
-      return TensorRefA{operand_A.data(), LayoutA()};
-    }
-    /// Returns a TensorRef to the B0 operand (operand_B1_ref() below does the same for B1)
-    CUTLASS_HOST_DEVICE
-    TensorRefB0 operand_B0_ref() {
-      return TensorRefB0{operand_B0.data(), LayoutB0()};
-    }
-    CUTLASS_HOST_DEVICE
-    TensorRefB1 operand_B1_ref() {
-      return TensorRefB1{operand_B1.data(), LayoutB1()};
-    }
-  };
- protected:
-  //
-  // Data members
-  //
-  /// Iterator to load a warp-scoped tile of A operand from shared memory
-  typename Operator0::IteratorA warp_tile_iterator_A_;
-  /// Iterators to load warp-scoped tiles of the B0 and B1 operands from shared memory
-  typename Operator0::IteratorB warp_tile_iterator_B0_;
-  typename Operator1::IteratorB warp_tile_iterator_B1_;
-public:
-  /// Construct the three warp-level iterators from shared storage; only lane_idx is consumed here
-  CUTLASS_DEVICE
-  DualMmaBase(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      SharedStorage &shared_storage,
-      ///< ID within the threadblock (not used by this constructor)
-      int thread_idx,
-      ///< ID of warp (not used by this constructor)
-      int warp_idx,
-      ///< ID of each thread within a warp
-      int lane_idx
-    ):
-      warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
-      warp_tile_iterator_B0_(shared_storage.operand_B0_ref(), lane_idx),
-      warp_tile_iterator_B1_(shared_storage.operand_B1_ref(), lane_idx) {
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/45_dual_gemm/threadblock/dual_mma_multistage.h DELETED Viewed

@@ -1,775 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Template for a multistage (cp.async-pipelined) threadblock-scoped dual GEMM kernel.
-*/
-#pragma once
-#include "cutlass/aligned_buffer.h"
-#include "cutlass/arch/memory.h"
-#include "cutlass/array.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/gemm/threadblock/mma_base.h"
-#include "dual_mma_base.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace gemm {
-namespace threadblock {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Structure to compute two threadblock-scoped matrix products that use one A operand and two B operands
-/// (B0, B1), staging operand tiles into shared memory with cp.async.
-template <
-    /// Size of the Gemm problem - concept: gemm::GemmShape<>
-    typename Shape_,
-    /// Iterates over tiles of A operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorA_,
-    /// Iterates over tiles of A operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorA_,
-    /// Cache operation for operand A
-    cutlass::arch::CacheOperation::Kind CacheOpA,
-    /// Iterates over tiles of B0 operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB0_,
-    /// Iterates over tiles of B0 operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB0_,
-    /// Cache operation for operand B
-    cutlass::arch::CacheOperation::Kind CacheOpB,
-    /// Iterates over tiles of B1 operand in global memory
-    //  (concept: ReadableTileIterator | ForwardTileIterator |
-    //  MaskedTileIterator)
-    typename IteratorB1_,
-    /// Iterates over tiles of B1 operand in shared memory
-    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
-    typename SmemIteratorB1_,
-    /// Data type of accumulator matrix
-    typename ElementC_,
-    /// Layout of accumulator matrix
-    typename LayoutC_,
-    /// Policy describing tuning details (concept: MmaPolicy)
-    typename Policy0_,
-    /// B1-specific version of the policy (concept: MmaPolicy)
-    typename Policy1_,
-    /// Number of stages,
-    int Stages,
-    /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
-    /// Used for partial specialization
-    typename Enable = bool>
-class DualMmaMultistage :
-  public DualMmaBase<Shape_, Policy0_, Policy1_, Stages> {
-public:
-  ///< Base class
-  using Base = DualMmaBase<Shape_, Policy0_, Policy1_, Stages>;
-  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
-  using Shape = Shape_;
-  ///< Iterates over tiles of A operand in global memory
-  using IteratorA = IteratorA_;
-  ///< Iterates over tiles of B0 operand in global memory
-  using IteratorB0 = IteratorB0_;
-  ///< Iterates over tiles of B1 operand in global memory
-  using IteratorB1 = IteratorB1_;
-  ///< Data type of accumulator matrix
-  using ElementC = ElementC_;
-  ///< Layout of accumulator matrix
-  using LayoutC = LayoutC_;
-  ///< Policy describing tuning details
-  using Policy0 = Policy0_;
-  using Policy1 = Policy1_;
-  using SmemIteratorA = SmemIteratorA_;
-  using SmemIteratorB0 = SmemIteratorB0_;
-  using SmemIteratorB1 = SmemIteratorB1_;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
-  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
-  //
-  // Dependent types
-  //
-  /// Fragment of accumulator tile
-  using FragmentC = typename Policy0::Operator::FragmentC;
-  /// Warp-level Mma
-  using Operator0 = typename Policy0::Operator;
-  using Operator1 = typename Policy1::Operator;
-  /// Minimum architecture is Sm80 to support cp.async
-  using ArchTag = arch::Sm80;
-  /// Complex transform on A operand
-  static ComplexTransform const kTransformA = Operator0::kTransformA;
-  /// Complex transform on B operand
-  static ComplexTransform const kTransformB0 = Operator0::kTransformB;
-  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
-  /// Internal structure exposed for introspection: per-stage and per-group cp.async counts.
-  struct Detail {
-    /// Number of cp.async instructions to load one stage of operand A
-    static int const AsyncCopyIterationsPerStageA =
-        IteratorA::ThreadMap::Iterations::kCount;
-    /// Number of cp.async instructions to load one stage of operand B (taken from IteratorB0; this class uses it for B1 as well)
-    static int const AsyncCopyIterationsPerStageB =
-        IteratorB0::ThreadMap::Iterations::kCount;
-    /// Number of stages
-    static int const kStages = Stages;
-    /// Number of cp.async instructions to load one group of operand A (per-stage count divided by kWarpGemmIterations, rounded up)
-    static int const kAccessesPerGroupA =
-        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-    /// Number of cp.async instructions to load one group of operand B (per-stage count divided by kWarpGemmIterations, rounded up)
-    static int const kAccessesPerGroupB =
-        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
-  };
- private:
-  using WarpLoadedFragmentA = typename Operator0::FragmentA;
-  using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
-  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
-  using WarpTransformedFragmentA = typename Operator0::TransformedFragmentA;
-  using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
-  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
- private:
-  //
-  // Data members
-  //
-  /// Iterator to write threadblock-scoped tile of A operand to shared memory
-  SmemIteratorA smem_iterator_A_;
-  /// Iterator to write threadblock-scoped tile of B operand to shared memory
-  SmemIteratorB0 smem_iterator_B0_;
-  SmemIteratorB1 smem_iterator_B1_;
-public:
-  /// Constructs the shared-memory write iterators for A, B0 and B1 and offsets the inherited warp-level iterators to this warp's tile
-  CUTLASS_DEVICE
-  DualMmaMultistage(
-      ///< Shared storage needed for internal use by threadblock-scoped GEMM
-      typename Base::SharedStorage &shared_storage,
-      ///< ID within the threadblock (seeds the three shared-memory write iterators)
-      int thread_idx,
-      ///< ID of warp (decomposed into m/n/k coordinates below)
-      int warp_idx,
-      ///< ID of each thread within a warp (forwarded to the base class)
-      int lane_idx
-    ):
-      Base(shared_storage, thread_idx, warp_idx, lane_idx),
-      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
-      smem_iterator_B0_(shared_storage.operand_B0_ref(), thread_idx),
-      smem_iterator_B1_(shared_storage.operand_B1_ref(), thread_idx)
-  {
-    // Compute warp location within threadblock tile by mapping the warp_id to
-    // three coordinates:
-    //   _m: the warp's position within the threadblock along the M dimension
-    //   _n: the warp's position within the threadblock along the N dimension
-    //   _k: the warp's position within the threadblock along the K dimension
-    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
-    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
-    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
-    // Add per-warp offsets in units of warp-level tiles (B0 and B1 receive the same offset)
-    this->warp_tile_iterator_A_.add_tile_offset(
-        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
-    this->warp_tile_iterator_B0_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-    this->warp_tile_iterator_B1_.add_tile_offset(
-        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
-  }
-  CUTLASS_DEVICE  // Issues one group of cp.async copies for A, B0 and B1, starting at the given group indices, and advances the iterators it uses
-  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB0 &iterator_B0, IteratorB1 &iterator_B1,
-                              int group_start_A = 0, int group_start_B = 0) {
-    iterator_A.set_iteration_index(group_start_A *
-                                   IteratorA::kAccessesPerVector);
-    this->smem_iterator_A_.set_iteration_index(group_start_A);
-    // Async Copy for operand A: at most kAccessesPerGroupA accesses, skipping indices past the per-stage count
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
-      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *  // bytes moved by each cp.async call below
-                              IteratorA::ThreadMap::kElementsPerAccess /
-                              IteratorA::kAccessesPerVector / 8;
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_A.get();
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {  // compile-time choice between the zfill and plain cp.async variants
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpA>(
-                dst_ptr + v, gmem_ptr, iterator_A.valid());
-          }
-          ++iterator_A;
-        }
-        ++this->smem_iterator_A_;
-      }
-    }
-    iterator_B0.set_iteration_index(group_start_B *
-                                   IteratorB0::kAccessesPerVector);
-    iterator_B1.set_iteration_index(group_start_B *
-                                   IteratorB1::kAccessesPerVector);
-    this->smem_iterator_B0_.set_iteration_index(group_start_B);
-    this->smem_iterator_B1_.set_iteration_index(group_start_B);
-    // Async Copy for operand B0: same structure as the A loop, using kCacheOpB
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB0::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB0::AccessType *>(
-                this->smem_iterator_B0_.get());
-        int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
-                              IteratorB0::ThreadMap::kElementsPerAccess /
-                              IteratorB0::kAccessesPerVector / 8;
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B0.get();
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B0.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B0.valid());
-          }
-          ++iterator_B0;
-        }
-        ++this->smem_iterator_B0_;
-      }
-    }
-    // Async Copy for operand B1: loop bounds reuse the B counts in Detail, which are derived from IteratorB0
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
-      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
-        typename IteratorB1::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB1::AccessType *>(
-                this->smem_iterator_B1_.get());
-        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
-                              IteratorB1::ThreadMap::kElementsPerAccess /
-                              IteratorB1::kAccessesPerVector / 8;
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
-          auto gmem_ptr = iterator_B1.get();
-          if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
-            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B1.valid());
-          } else {
-            cutlass::arch::cp_async<kSrcBytes, kCacheOpB>(
-                dst_ptr + v, gmem_ptr, iterator_B1.valid());
-          }
-          ++iterator_B1;
-        }
-        ++this->smem_iterator_B1_;
-      }
-    }
-  }
-  /// Perform a threadblock-scoped matrix multiply-accumulate
-  CUTLASS_DEVICE
-  void operator()(
-      ///< problem size of GEMM
-      int gemm_k_iterations,
-      ///< destination accumulator tile
-      FragmentC &accum0,
-      FragmentC &accum1,
-      ///< iterator over A operand in global memory
-      IteratorA iterator_A,
-      ///< iterator over B operand in global memory
-      IteratorB0 iterator_B0,
-      IteratorB1 iterator_B1,
-      ///< initial value of accumulator
-      FragmentC const &src_accum0,
-      FragmentC const &src_accum1
-    ) {
-    //
-    // Prologue
-    //
-    // Issue several complete stages
-    CUTLASS_PRAGMA_UNROLL
-    for (int stage = 0; stage < Base::kStages - 1;
-         ++stage, --gemm_k_iterations) {
-      iterator_A.clear_mask(gemm_k_iterations == 0);
-      iterator_B0.clear_mask(gemm_k_iterations == 0);
-      iterator_B1.clear_mask(gemm_k_iterations == 0);
-      iterator_A.set_iteration_index(0);
-      this->smem_iterator_A_.set_iteration_index(0);
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                this->smem_iterator_A_.get());
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorA::Element>::value *
-              IteratorA::ThreadMap::kElementsPerAccess /
-              IteratorA::kAccessesPerVector / 8;
-          int src_bytes = (iterator_A.valid() ? kSrcBytes : 0);
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
-              dst_ptr + v, iterator_A.get(), iterator_A.valid());
-          ++iterator_A;
-        }
-        ++this->smem_iterator_A_;
-      }
-      iterator_B0.set_iteration_index(0);
-      iterator_B1.set_iteration_index(0);
-      this->smem_iterator_B0_.set_iteration_index(0);
-      this->smem_iterator_B1_.set_iteration_index(0);
-      // Async Copy for operand B0
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB0::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB0::AccessType *>(
-                this->smem_iterator_B0_.get());
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB0::Element>::value *
-              IteratorB0::ThreadMap::kElementsPerAccess /
-              IteratorB0::kAccessesPerVector / 8;
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
-          ++iterator_B0;
-        }
-        ++this->smem_iterator_B0_;
-      }
-      // Async Copy for operand B1
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB1::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB1::AccessType *>(
-                this->smem_iterator_B1_.get());
-        CUTLASS_PRAGMA_UNROLL
-        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
-          int const kSrcBytes =
-              sizeof_bits<typename IteratorB1::Element>::value *
-              IteratorB1::ThreadMap::kElementsPerAccess /
-              IteratorB1::kAccessesPerVector / 8;
-          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
-              dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
-          ++iterator_B1;
-        }
-        ++this->smem_iterator_B1_;
-      }
-      // Move to the next stage
-      iterator_A.add_tile_offset({0, 1});
-      iterator_B0.add_tile_offset({1, 0});
-      iterator_B1.add_tile_offset({1, 0});
-      this->smem_iterator_A_.add_tile_offset({0, 1});
-      this->smem_iterator_B0_.add_tile_offset({1, 0});
-      this->smem_iterator_B1_.add_tile_offset({1, 0});
-      // Defines the boundary of a stage of cp.async.
-      cutlass::arch::cp_async_fence();
-    }
-    // Perform accumulation in the 'd' output operand
-    accum0 = src_accum0;
-    accum1 = src_accum1;
-    //
-    // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels
-    // so that all accumulator elements outside the GEMM footprint are zero.
-    //
-    if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) {
-      /// Iterator to write threadblock-scoped tile of A operand to shared memory
-      SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_);
-      typename IteratorA::AccessType zero_A;
-      zero_A.clear();
-      last_smem_iterator_A.set_iteration_index(0);
-      // Async Copy for operand A
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
-        typename IteratorA::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorA::AccessType *>(
-                last_smem_iterator_A.get());
-        *dst_ptr = zero_A;
-        ++last_smem_iterator_A;
-      }
-      typename IteratorB0::AccessType zero_B;
-      zero_B.clear();
-      /// Iterator to write threadblock-scoped tile of B0 operand to shared memory
-      SmemIteratorB0 last_smem_iterator_B0(this->smem_iterator_B0_);
-      last_smem_iterator_B0.set_iteration_index(0);
-      // Async Copy for operand B0
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB0::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB0::AccessType *>(
-                last_smem_iterator_B0.get());
-        *dst_ptr = zero_B;
-        ++last_smem_iterator_B0;
-      }
-      /// Iterator to write threadblock-scoped tile of B1 operand to shared memory
-      SmemIteratorB1 last_smem_iterator_B1(this->smem_iterator_B1_);
-      last_smem_iterator_B1.set_iteration_index(0);
-      // Async Copy for operand B1
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
-        typename IteratorB1::AccessType *dst_ptr =
-            reinterpret_cast<typename IteratorB1::AccessType *>(
-                last_smem_iterator_B1.get());
-        *dst_ptr = zero_B;
-        ++last_smem_iterator_B1;
-      }
-    }
-    // Waits until stages up to the previous (kStages-2)th stage have committed.
-    cutlass::arch::cp_async_wait<Base::kStages - 2>();
-    __syncthreads();
-    // Pair of fragments used to overlap shared memory loads and math
-    // instructions
-    WarpLoadedFragmentA warp_loaded_frag_A[2];
-    WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
-    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
-    WarpTransformedFragmentA warp_transformed_frag_A[2];
-    WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
-    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
-    Operator0 warp_mma0;
-    Operator1 warp_mma1;
-    this->warp_tile_iterator_A_.set_kgroup_index(0);
-    this->warp_tile_iterator_B0_.set_kgroup_index(0);
-    this->warp_tile_iterator_B1_.set_kgroup_index(0);
-    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
-    this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
-    this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
-    ++this->warp_tile_iterator_A_;
-    ++this->warp_tile_iterator_B0_;
-    ++this->warp_tile_iterator_B1_;
-    iterator_A.clear_mask(gemm_k_iterations == 0);
-    iterator_B0.clear_mask(gemm_k_iterations == 0);
-    iterator_B1.clear_mask(gemm_k_iterations == 0);
-    int smem_write_stage_idx = Base::kStages - 1;
-    int smem_read_stage_idx = 0;
-    warp_mma0.transform(warp_transformed_frag_A[0], warp_transformed_frag_B0[0],
-                        warp_loaded_frag_A[0], warp_loaded_frag_B0[0]);
-    warp_mma1.transform(warp_transformed_frag_A[0], warp_transformed_frag_B1[0],
-                        warp_loaded_frag_A[0], warp_loaded_frag_B1[0]);
-    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
-    // accumulator and this temporary accumulator is added to the final
-    // accumulator once in every mainloop iteration.
-    plus<FragmentC> plus_accum;
-    FragmentC tmp_accum0, tmp_accum1;
-    if (platform::is_same<typename Operator0::MathOperator,
-                          arch::OpMultiplyAddFastF32>::value
-      || platform::is_same<typename Operator0::MathOperator,
-                           arch::OpMultiplyAddComplexFastF32>::value) {
-      tmp_accum0.clear();
-      tmp_accum1.clear();
-    }
-    //
-    // Mainloop
-    //
-    CUTLASS_GEMM_LOOP
-    for (; gemm_k_iterations > (-Base::kStages + 1);) {
-      //
-      // Loop over GEMM K dimension
-      //
-      // Computes a warp-level GEMM on data held in shared memory
-      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
-      CUTLASS_PRAGMA_UNROLL
-      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
-           ++warp_mma_k) {
-        // Load warp-level tiles from shared memory, wrapping to k offset if
-        // this is the last group as the case may be.
-        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
-        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
-        this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
-        ++this->warp_tile_iterator_A_;
-        ++this->warp_tile_iterator_B0_;
-        ++this->warp_tile_iterator_B1_;
-        if (warp_mma_k > 0) {
-          warp_mma0.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                              warp_transformed_frag_B0[warp_mma_k % 2],
-                              warp_loaded_frag_A[warp_mma_k % 2],
-                              warp_loaded_frag_B0[warp_mma_k % 2]);
-          warp_mma1.transform(warp_transformed_frag_A[warp_mma_k % 2],
-                              warp_transformed_frag_B1[warp_mma_k % 2],
-                              warp_loaded_frag_A[warp_mma_k % 2],
-                              warp_loaded_frag_B1[warp_mma_k % 2]);
-        }
-        if (platform::is_same<typename Operator0::MathOperator,
-                              arch::OpMultiplyAddFastF32>::value
-          || platform::is_same<typename Operator0::MathOperator,
-                               arch::OpMultiplyAddComplexFastF32>::value) {
-          warp_mma0(
-            tmp_accum0,
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B0[warp_mma_k % 2],
-            tmp_accum0
-          );
-          warp_mma1(
-            tmp_accum1,
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B1[warp_mma_k % 2],
-            tmp_accum1
-          );
-          if (warp_mma_k == 0) {
-            accum0 = plus_accum(accum0, tmp_accum0);
-            accum1 = plus_accum(accum1, tmp_accum1);
-            tmp_accum0.clear();
-            tmp_accum1.clear();
-          }
-        } else {
-          warp_mma0(
-            accum0,
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B0[warp_mma_k % 2],
-            accum0
-          );
-          warp_mma1(
-            accum1,
-            warp_transformed_frag_A[warp_mma_k % 2],
-            warp_transformed_frag_B1[warp_mma_k % 2],
-            accum1
-          );
-        }
-        // Issue global->shared copies for this stage
-        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
-          int group_start_iteration_A, group_start_iteration_B;
-          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
-          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
-          copy_tiles_and_advance(iterator_A, iterator_B0, iterator_B1, group_start_iteration_A,
-                               group_start_iteration_B);
-        }
-        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
-          int group_start_iteration_A, group_start_iteration_B;
-          group_start_iteration_A =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
-          group_start_iteration_B =
-              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
-          copy_tiles_and_advance(iterator_A, iterator_B0, iterator_B1, group_start_iteration_A,
-                               group_start_iteration_B);
-          // Inserts a memory fence between stages of cp.async instructions.
-          cutlass::arch::cp_async_fence();
-          // Waits until stages up to the previous (kStages-2)th stage have committed.
-          arch::cp_async_wait<Base::kStages - 2>();
-          __syncthreads();
-          // Move to the next stage
-          iterator_A.add_tile_offset({0, 1});
-          iterator_B0.add_tile_offset({1, 0});
-          iterator_B1.add_tile_offset({1, 0});
-          this->smem_iterator_A_.add_tile_offset({0, 1});
-          this->smem_iterator_B0_.add_tile_offset({1, 0});
-          this->smem_iterator_B1_.add_tile_offset({1, 0});
-          // Add negative offsets to return iterators to the 'start' of the
-          // circular buffer in shared memory
-          if (smem_write_stage_idx == (Base::kStages - 1)) {
-            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
-            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
-            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
-            smem_write_stage_idx = 0;
-          } else {
-            ++smem_write_stage_idx;
-          }
-          if (smem_read_stage_idx == (Base::kStages - 1)) {
-            this->warp_tile_iterator_A_.add_tile_offset(
-                {0, -Base::kStages * Policy0::kPartitionsK *
-                        Base::kWarpGemmIterations});
-            this->warp_tile_iterator_B0_.add_tile_offset(
-                {-Base::kStages * Policy0::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            this->warp_tile_iterator_B1_.add_tile_offset(
-                {-Base::kStages * Policy1::kPartitionsK *
-                     Base::kWarpGemmIterations,
-                 0});
-            smem_read_stage_idx = 0;
-          } else {
-            ++smem_read_stage_idx;
-          }
-          --gemm_k_iterations;
-          iterator_A.clear_mask(gemm_k_iterations == 0);
-          iterator_B0.clear_mask(gemm_k_iterations == 0);
-          iterator_B1.clear_mask(gemm_k_iterations == 0);
-        }
-        // Do any conversions feeding the first stage at the end of the loop so
-        // we can start right away on mma instructions
-        if (warp_mma_k + 1 == Base::kWarpGemmIterations) {
-          warp_mma0.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                              warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
-                              warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                              warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
-          warp_mma1.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
-                              warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
-                              warp_loaded_frag_A[(warp_mma_k + 1) % 2],
-                              warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
-        }
-      }
-    }
-    if (platform::is_same<typename Operator0::MathOperator,
-                          arch::OpMultiplyAddFastF32>::value
-      || platform::is_same<typename Operator0::MathOperator,
-                           arch::OpMultiplyAddComplexFastF32>::value) {
-      accum0 = plus_accum(accum0, tmp_accum0);
-      accum1 = plus_accum(accum1, tmp_accum1);
-    }
-    // commit and drain all pending and predicated cp.async pnz from the GEMM mainloop
-    cutlass::arch::cp_async_fence();
-    cutlass::arch::cp_async_wait<0>();
-    __syncthreads();
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace threadblock
-}  // namespace gemm
-}  // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/51_hopper_gett/gett_kernel.cuh DELETED Viewed

@@ -1,139 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cute/tensor.hpp"
-#include "cutlass/arch/arch.h"
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-#include "cutlass/gemm/collective/collective_builder.hpp"
-#include "cutlass/epilogue/collective/collective_epilogue.hpp"
-#include "cutlass/epilogue/thread/linear_combination.h"
-namespace example {
-//
-// GETT entry point
-//
-// Composes an Sm90 tensor-op collective mainloop with the default epilogue into a
-// GemmUniversal kernel, wraps it in GemmUniversalAdapter, and launches it in batched mode.
-//
-//   problem_shape_mnkl   problem shape; modes 0..3 are traced as M, N, K, L
-//   ptr_A, stride_a_mkl  operand A and its stride
-//   ptr_B, stride_b_nkl  operand B and its stride
-//   _                    never read; presumably present only so ElementAccumulator is deduced
-//   ptr_C, stride_c_mnl  source operand C and its stride
-//   ptr_D, stride_d_mnl  destination operand D and its stride
-//   alpha, beta          scalars forwarded to the LinearCombination epilogue
-//   stream               CUDA stream the kernel is launched on
-//
-// Returns the cutlass::Status reported by the adapter.
-//
-template <
-  class ProblemShapeMNKL,
-  class ElementA,
-  class StrideA,
-  class ElementB,
-  class StrideB,
-  class ElementAccumulator,
-  class ElementC,
-  class StrideC,
-  class ElementD,
-  class StrideD,
-  class ElementEpilogue>
-cutlass::Status
-gett_kernel(
-    ProblemShapeMNKL problem_shape_mnkl,
-    ElementA const* ptr_A, StrideA stride_a_mkl,
-    ElementB const* ptr_B, StrideB stride_b_nkl,
-    ElementAccumulator _,
-    ElementC const* ptr_C, StrideC stride_c_mnl,
-    ElementD      * ptr_D, StrideD stride_d_mnl,
-    ElementEpilogue alpha, ElementEpilogue beta,
-    cudaStream_t stream = 0) {
-  using namespace cute;
-  // TileShape -- GETT configuration
-  // Specify the number of elements to take from each mode
-  // BLK_M = (M0,M1,...)  BLK_N = (N0,N1,...)  BLK_K = (K0,K1,...)
-  // Take 128 from m0, 128 from n0, 64 from k0
-  using TileShape = Shape<Shape<_128>, Shape<_128>, Shape<_64>>;
-  /* Other examples:
-   * Take 32 elements from m0 and 4 elements from m1
-   * Take 64 elements from n0 and 2 elements from n1
-   * Take  8 elements from k0 and 8 elements from k1
-  **/
-  // using TileShape = Shape<Shape<_32,_4>, Shape<_64,_2>, Shape<_8,_8>>;
-  using EpilogueThreadOp = cutlass::epilogue::thread::LinearCombination<
-      ElementD, 1, ElementAccumulator, ElementEpilogue, cutlass::epilogue::thread::ScaleType::Default,
-      cutlass::FloatRoundStyle::round_to_nearest, ElementC>;
-  // No changes are required to the default epilogue
-  using CollectiveEpilogue = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
-    cutlass::epilogue::collective::DefaultEpilogue<
-      ElementC,
-      StrideC,
-      StrideD,
-      EpilogueThreadOp,
-      cutlass::gemm::EpilogueDefault>>;
-  // CollectiveMma for GETTs can be built using the CollectiveBuilders
-  // (operand alignment is expressed in elements: 128 bits divided by the element width)
-  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
-      cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
-      ElementA, StrideA, 128 / cutlass::sizeof_bits<ElementA>::value,
-      ElementB, StrideB, 128 / cutlass::sizeof_bits<ElementB>::value,
-      ElementAccumulator,
-      TileShape, Shape<_1,_2,_1>,
-      cutlass::gemm::collective::StageCountAutoCarveout<
-        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
-      cutlass::gemm::collective::KernelScheduleAuto
-    >::CollectiveOp;
-  // The GETT kernel is a composition of a collective mainloop and epilogue, just like any 3.x GEMM
-  using GettKernel = cutlass::gemm::kernel::GemmUniversal<
-      ProblemShapeMNKL,
-      CollectiveMainloop,
-      CollectiveEpilogue>;
-  using GettOperator = cutlass::gemm::device::GemmUniversalAdapter<GettKernel>;
-  typename GettOperator::Arguments args {
-    cutlass::gemm::GemmUniversalMode::kBatched,
-    problem_shape_mnkl,
-    { ptr_A, stride_a_mkl, ptr_B, stride_b_nkl },
-    { {alpha, beta}, ptr_C, stride_c_mnl, ptr_D, stride_d_mnl }
-  };
-#if CUTLASS_DEBUG_TRACE_LEVEL > 0
-  print("Problem shape:");
-  print("\tM: "); print(cute::get<0>(problem_shape_mnkl)); print("\n");
-  print("\tN: "); print(cute::get<1>(problem_shape_mnkl)); print("\n");
-  print("\tK: "); print(cute::get<2>(problem_shape_mnkl)); print("\n");
-  print("\tL: "); print(cute::get<3>(problem_shape_mnkl)); print("\n");
-  print("TileShape:"); print(TileShape{}); print("\n");
-#endif
-  GettOperator op;
-  // The adapter's call operator takes (args, workspace, stream). Fill the workspace slot
-  // explicitly so that `stream` is not implicitly converted to the void* workspace argument,
-  // which would leave the launch on the default stream.
-  return op(args, /*workspace=*/nullptr, stream);
-}
-} // namespace example

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp DELETED Viewed

@@ -1,421 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cute/tensor.hpp"
-#include "gather_tensor.hpp"
-namespace cutlass {
-  ///Forward declaration
-  struct CudaHostAdapter;
-}
-namespace cutlass::gemm::kernel {
-///////////////////////////////////////////////////////////////////////////////
-template <
-  class ProblemShape_,        // rank-3 <M,N,K> or rank-4 <M,N,K,L>; enforced by the static_assert below
-  class CollectiveMainloop_,  // supplies tile shape, tiled MMA, A/B types, pipeline and load()/mma()
-  class CollectiveEpilogue_,  // supplies C/D types, load/store pipelines and load()/store()
-  class TileScheduler_,       // must be void or PersistentScheduler; enforced by a static_assert below
-  class GatherA_,             // type of gather_A, handed to make_gather_tensor for operand A
-  class GatherB_              // type of gather_B, handed to make_gather_tensor for operand B
->
-class GemmGather  // Kernel pairing a collective mainloop with a collective epilogue; A and B are viewed through gather_A / gather_B.
-{
-public:
-  //
-  // Type Aliases
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-  static_assert(cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>,
-    "Non-persistent warp-specialized kernel does not support specializing the tile scheduler.");
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  using GatherA = GatherA_;
-  using GatherB = GatherB_;
-  // Kernel level shared memory storage: tensor storage plus pipeline storage
-  struct SharedStorage {
-    union TensorStorage {  // union: mainloop and epilogue tensor storage occupy the same bytes
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-      MainloopTensorStorage mainloop;
-      EpilogueTensorStorage epilogue;
-    } tensors;
-    struct PipelineStorage : cute::aligned_struct<16, _2> {  // struct: both pipeline storages are live side by side
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-      alignas(16) MainloopPipelineStorage mainloop;
-      alignas(16) EpiLoadPipelineStorage epi_load;
-    } pipelines;
-  };
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-  using GmemTiledCopyA = typename CollectiveMainloop::GmemTiledCopyA;
-  using GmemTiledCopyB = typename CollectiveMainloop::GmemTiledCopyB;
-  static_assert(cute::size(GmemTiledCopyA{}) == cute::size(GmemTiledCopyB{}), "Number of threads in A/B tiled copies must be the same.");
-  static constexpr uint32_t NumLoadWarpGroups = cute::size(GmemTiledCopyA{}) / NumThreadsPerWarpGroup;  // derived from the A copy's thread count
-  static constexpr uint32_t NumMmaWarpGroups = CUTE_STATIC_V(cute::size(TiledMma{})) / NumThreadsPerWarpGroup;  // derived from the tiled MMA's thread count
-  static constexpr uint32_t NumWarpGroups = NumLoadWarpGroups + NumMmaWarpGroups;
-  static_assert(NumWarpGroups == 2 || NumWarpGroups == 3, "Number of warp groups must be 2 or 3 for good performance.");
-  static constexpr uint32_t MaxThreadsPerBlock = NumWarpGroups * NumThreadsPerWarpGroup;
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-  // Device side arguments
-  struct Arguments {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    KernelHardwareInfo hw_info{};        // not read by any method of this class
-    TileSchedulerArguments scheduler{};  // not read by any method of this class
-    GatherA gather_A{};
-    GatherB gather_B{};
-  };
-  // Kernel entry point API: the Params struct consumed by operator()
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    GatherA gather_A{};
-    GatherB gather_B{};
-  };
-  //
-  // Methods
-  //
-  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    (void) workspace;
-    auto problem_shape = args.problem_shape;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      // swap M/N
-      get<0>(problem_shape) = get<1>(args.problem_shape);
-      get<1>(problem_shape) = get<0>(args.problem_shape);
-    }
-    return {
-      args.mode,
-      problem_shape,  // the possibly swapped copy; the collectives below receive args.problem_shape unswapped
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace),
-      args.gather_A,
-      args.gather_B
-    };
-  }
-  static bool
-  can_implement(Arguments const& args) {  // true when the mode/shape pair is supported and both collectives accept the arguments
-    bool implementable = (args.mode == GemmUniversalMode::kGemm) or
-        (args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4);  // batched mode needs the L extent
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-      return implementable;
-    }
-    implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);
-    implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);
-    return implementable;
-  }
-  static
-  size_t
-  get_workspace_size(Arguments const& args) {  // always 0; args is not inspected
-    return 0;
-  }
-  static
-  cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
-    CudaHostAdapter* cuda_adapter = nullptr) {  // no-op: every parameter is ignored
-    return Status::kSuccess;
-  }
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3
-  get_grid_shape(Params const& params) {
-    auto cluster_shape = Shape<_1,_1,_1>{};  // fixed 1x1x1 here; DispatchPolicy::ClusterShape is not consulted
-    auto tile_shape = TileShape{};
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});  // pad a rank-3 shape with L = 1
-    return TileScheduler::get_tiled_cta_shape_mnl(
-        problem_shape_MNKL, tile_shape, cluster_shape);
-  }
-  static dim3
-  get_block_shape() {  // one-dimensional block of MaxThreadsPerBlock threads
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {  // kernel body: load warp groups run load(), MMA warp groups run mma() then store()
-    using namespace cute;
-    using X = Underscore;
-    // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-    #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL)
-      if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) {
-        printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");
-        return;
-      }
-    #endif
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer = 1,
-    };
-    // Kernel level shared memory storage, viewed over the caller-provided smem_buf
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    int thread_idx = int(threadIdx.x);
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;  // thread index within its warp group
-    int warp_group_idx = canonical_warp_group_idx();
-    CUTLASS_ASSERT(warp_group_idx < NumWarpGroups);
-    WarpGroupRole warp_group_role = warp_group_idx < NumLoadWarpGroups ? WarpGroupRole::Producer : WarpGroupRole::Consumer;
-    // Mainloop Load pipeline; the role is set from this warp group's role
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;  // all load threads
-    mainloop_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;   // all MMA threads
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params);
-    // Epilogue Load pipeline; configured the same way as the mainloop pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.producer_arv_count = NumLoadWarpGroups * NumThreadsPerWarpGroup;
-    epi_load_pipeline_params.consumer_arv_count = NumMmaWarpGroups * NumThreadsPerWarpGroup;
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-    // Epilogue Store pipeline; constructed from its params only, without a shared-storage argument
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-    // Initialize starting pipeline states for the collectives
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-    // Preconditions
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    // Separate out problem shape for convenience
-    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    auto M = get<0>(problem_shape_MNKL);
-    auto N = get<1>(problem_shape_MNKL);
-    auto K = get<2>(problem_shape_MNKL);
-    auto L = get<3>(problem_shape_MNKL);
-    // Represent the full A and B tensors as gather tensors built with gather_A / gather_B
-    Tensor mA_mkl = make_gather_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA, params.gather_A); //(m,k,l)
-    Tensor mB_nkl = make_gather_tensor(make_gmem_ptr(params.mainloop.ptr_B), make_shape(N,K,L), params.mainloop.dB, params.gather_B); //(n,k,l)
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-    TiledMma tiled_mma;
-    // Make tiled views, defer the slice
-    Tensor gA_mkl = local_tile(mA_mkl, blk_shape, make_coord(_,_,_), Step<_1, X,_1>{});          // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, blk_shape, make_coord(_,_,_), Step< X,_1,_1>{});          // (BLK_N,BLK_K,n,k,l)
-    // Compute m_coord, n_coord, and l_coord with their post-tiled shapes (from blockIdx.x, .y and .z respectively)
-    auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl));
-    auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl));
-    auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl));
-    auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-    // Slice with m_coord and n_coord
-    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                       // (BLK_M,BLK_K,k)
-    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                       // (BLK_N,BLK_K,k)
-    // Get pipeline iterators and increments from tensor shapes
-    auto k_tile_iter  = cute::make_coord_iterator(shape<2>(gA));
-    auto k_tile_count = size<2>(gA);
-    auto c_tile_count = CollectiveEpilogue::get_load_pipe_increment(blk_shape);   // not referenced again in this function
-    auto d_tile_count = CollectiveEpilogue::get_store_pipe_increment(blk_shape);  // not referenced again in this function
-    // Wait for all threads in the thread block
-    __syncthreads();
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue{params.epilogue, shared_storage.tensors.epilogue};
-    if (warp_group_role == WarpGroupRole::Producer) {
-      // Compute tile residues for predication
-      auto m_max_coord = M - size<0>(gA) * get<0>(blk_coord);                             // M - BLK_M * m_coord
-      auto n_max_coord = N - size<0>(gB) * get<1>(blk_coord);                             // N - BLK_N * n_coord
-      auto k_residue   = K - size<1>(gA) * size<2>(gA);                                   // K - BLK_K * k_coord_max
-      auto residue_mnk = make_tuple(m_max_coord, n_max_coord, k_residue);
-      collective_mainloop.load(
-        mainloop_pipeline,
-        mainloop_pipe_producer_state,
-        gA,
-        gB,
-        k_tile_iter, k_tile_count,
-        residue_mnk,
-        thread_idx,
-        shared_storage.tensors.mainloop
-      );
-      // Update starting mainloop pipeline state for the pipeline drain
-      mainloop_pipe_producer_state.advance(k_tile_count);
-      // Make sure mainloop consumer has been waited upon before issuing epilogue load
-      collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-      if (collective_epilogue.is_producer_load_needed()) {
-        epi_load_pipe_producer_state =
-        collective_epilogue.load(
-          epi_load_pipeline,
-          epi_load_pipe_producer_state,
-          problem_shape_MNKL,
-          blk_shape,
-          blk_coord,
-          tiled_mma,
-          thread_idx,
-          shared_storage.tensors.epilogue
-        );
-        collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-      }
-    }
-    else if (warp_group_role == WarpGroupRole::Consumer) {
-      Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));                 // (MMA,MMA_M,MMA_N)
-      collective_mainloop.mma(
-        mainloop_pipeline,
-        mainloop_pipe_consumer_state,
-        accumulators,
-        k_tile_count,
-        warp_group_thread_idx,
-        shared_storage.tensors.mainloop,
-        params.mainloop
-      );
-      // Make sure the math instructions are done and free buffers before entering the epilogue
-      collective_mainloop.mma_tail(
-        mainloop_pipeline,
-        mainloop_pipe_consumer_state,
-        k_tile_count
-      );
-      // Epilogue and write to gD
-      collective_epilogue.store(
-        epi_load_pipeline,
-        epi_load_pipe_consumer_state,
-        epi_store_pipeline,
-        epi_store_pipe_producer_state,
-        problem_shape_MNKL,
-        blk_shape,
-        blk_coord,
-        accumulators,
-        tiled_mma,
-        warp_group_thread_idx,
-        shared_storage.tensors.epilogue
-      );
-    }
-  }
-};
-///////////////////////////////////////////////////////////////////////////////
-} // namespace cutlass::gemm::kernel

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh DELETED Viewed

@@ -1,136 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cute/numeric/math.hpp"
-namespace example
-{
-// Naive grid-stride loop implementation of gather
-template<typename Element, typename Func>
-__global__ void
-gather_kernel(Element const * __restrict__ input,
-              Element       * __restrict__ output,
-              Func func,
-              int num_elems_input,
-              int num_elems_output,
-              cutlass::FastDivmod stride_divmod)
-{
-  Element const * input_b = input + blockIdx.z * num_elems_input;
-  Element * output_b = output + blockIdx.z * num_elems_output;
-  int tidx = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int k = tidx; k < num_elems_output; k += blockDim.x * gridDim.x) {
-    int i,j;
-    stride_divmod(j, i, k);
-    output_b[k] = input_b[i + func(j) * stride_divmod.divisor];
-  }
-}
-// Gather elements along strided dimension of the tensor according to given indices
-template<typename Element, typename Func>
-void
-gather(Element const * input,
-       Element * output,
-       Func func,
-       int batch_size,
-       int num_elems_input,
-       int num_elems_output,
-       int stride,
-       cutlass::KernelHardwareInfo const& hw_info)
-{
-  // Upcast to uint128_t data type
-  int factor = 128 / cutlass::sizeof_bits<Element>::value;
-  assert(stride % factor == 0);
-  int stride_upcast = stride/factor;
-  int num_elems_input_upcast = num_elems_input / factor;
-  int num_elems_output_upcast = num_elems_output / factor;
-  cutlass::FastDivmod stride_divmod(stride_upcast);
-  dim3 blocks(hw_info.sm_count, 1, batch_size);
-  gather_kernel<<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
-                                  reinterpret_cast<cute::uint128_t *>(output),
-                                  func,
-                                  num_elems_input_upcast,
-                                  num_elems_output_upcast,
-                                  stride_divmod);
-}
-// Naive grid-stride loop implementation of scatter
-template<typename Element, typename Func>
-__global__ void
-scatter_kernel(Element const * __restrict__ input,
-               Element       * __restrict__ output,
-               Func func,
-               int num_elems_input,
-               int num_elems_output,
-               cutlass::FastDivmod stride_divmod)
-{
-  Element const * input_b = input + blockIdx.z * num_elems_input;
-  Element * output_b = output + blockIdx.z * num_elems_output;
-  int tidx = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int k = tidx; k < num_elems_input; k += blockDim.x * gridDim.x) {
-    int i,j;
-    stride_divmod(j, i, k);
-    output_b[i + func(j) * stride_divmod.divisor] = input_b[k];
-  }
-}
-// Scatter elements along strided dimension of the tensor according to given indices
-template<typename Element, typename Func>
-void
-scatter(Element const * input,
-        Element * output,
-        Func func,
-        int batch_size,
-        int num_elems_input,
-        int num_elems_output,
-        int stride,
-        cutlass::KernelHardwareInfo const& hw_info)
-{
-  // Upcast to uint128_t data type
-  int factor = 128 / cutlass::sizeof_bits<Element>::value;
-  assert(stride % factor == 0);
-  int stride_upcast = stride/factor;
-  int num_elems_input_upcast = num_elems_input / factor;
-  int num_elems_output_upcast = num_elems_output / factor;
-  cutlass::FastDivmod stride_divmod(stride_upcast);
-  dim3 blocks(hw_info.sm_count, 1, batch_size);
-  scatter_kernel<<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
-                                   reinterpret_cast<cute::uint128_t *>(output),
-                                   func,
-                                   num_elems_input_upcast,
-                                   num_elems_output_upcast,
-                                   stride_divmod);
-}
-} // namespace example

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp DELETED Viewed

@@ -1,222 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Functor performing elementwise operations used by epilogues.
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cute/tensor.hpp"
-#include "cute/numeric/numeric_types.hpp"
-#include "gather_tensor.hpp"
-namespace cutlass::epilogue::collective {
-/// Applies an element wise operation to all elements within the fragment
-/// and scatter-writes them out to destination storage.
-/// GatherC and ScatterD are types of user-defined functions that apply the
-/// transformation of the strided coordinate (e.g. through an index array).
-template <
-  class StrideC_,
-  class StrideD_,
-  class ThreadEpilogueOp_,
-  class EpilogueSchedule_,
-  class GatherC_,
-  class ScatterD_
->
-class EpilogueGatherScatter {
-public:
-  //
-  // Type Aliases
-  //
-  using EpilogueSchedule = EpilogueSchedule_;
-  // derived types of output thread level operator
-  using ThreadEpilogueOp = ThreadEpilogueOp_;
-  using ElementOutput = typename ThreadEpilogueOp::ElementOutput;
-  using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
-  using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
-  using ElementScalar = ElementCompute;
-  using ElementC = typename ThreadEpilogueOp::ElementC;
-  using StrideC = StrideC_;
-  using ElementD = typename ThreadEpilogueOp::ElementD;
-  using StrideD = StrideD_;
-  // Every epilogue needs these two GmemTiledCopy{C,D} aliases.
-  // If you don't know what they should be, just use void.
-  using GmemTiledCopyC = void;
-  using GmemTiledCopyD = void;
-  using GatherC = GatherC_;
-  using ScatterD = ScatterD_;
-  static const int kOutputAlignment = ThreadEpilogueOp::kCount;
-  using AlignmentType = typename cute::uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
-  static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
-  struct SharedStorage { };
-  // Host side epilogue arguments
-  struct Arguments {
-    typename ThreadEpilogueOp::Params thread_params{};
-    ElementC const* ptr_C = nullptr;
-    StrideC dC{};
-    ElementD* ptr_D = nullptr;
-    StrideD dD{};
-    GatherC gather_C{};
-    ScatterD scatter_D{};
-  };
-  // Device side epilogue params
-  using Params = Arguments;
-  //
-  // Methods
-  //
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(
-      [[maybe_unused]] ProblemShape const& _,
-      Arguments const& args,
-      [[maybe_unused]] void* workspace) {
-    return args;
-  }
-  template<class ProblemShape>
-  static bool
-  can_implement(
-      [[maybe_unused]] ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {
-    return true;
-  }
-  CUTLASS_HOST_DEVICE
-  EpilogueGatherScatter(Params const& params_) : params(params_) { }
-  template<
-    class ProblemShapeMNKL,
-    class BlockShapeMNK,
-    class BlockCoordMNKL,
-    class FrgEngine, class FrgLayout,
-    class TiledMma,
-    class ResidueMNK
-  >
-  CUTLASS_DEVICE void
-  operator()(
-      ProblemShapeMNKL problem_shape_mnkl,
-      BlockShapeMNK blk_shape_MNK,
-      BlockCoordMNKL blk_coord_mnkl,
-      cute::Tensor<FrgEngine, FrgLayout> const& accumulators,
-      TiledMma tiled_mma,
-      ResidueMNK residue_mnk,
-      int thread_idx,
-      char* smem_buf)
-  {
-    using namespace cute;
-    using X = Underscore;
-    static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4");
-    static_assert(is_static<BlockShapeMNK>::value, "ThreadBlock tile shape must be static");
-    static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3");
-    static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3");
-    (void) smem_buf;
-    ThreadEpilogueOp epilogue_op{params.thread_params};
-    // Separate out problem shape for convenience
-    auto M = get<0>(problem_shape_mnkl);
-    auto N = get<1>(problem_shape_mnkl);
-    auto L = get<3>(problem_shape_mnkl);
-    auto stride_c = detail::get_epilogue_stride<EpilogueSchedule>(params.dC);
-    auto stride_d = detail::get_epilogue_stride<EpilogueSchedule>(params.dD);
-    // Represent the full output tensor
-    Tensor mC_mnl = make_gather_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c, params.gather_C);  // (m,n,l)
-    Tensor mD_mnl = make_gather_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d, params.scatter_D); // (m,n,l)
-    Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
-    Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
-    // Slice to get the tile this CTA is responsible for
-    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl;
-    Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord);                                                 // (BLK_M,BLK_N)
-    Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord);                                                 // (BLK_M,BLK_N)
-    // Partition source and destination tiles to match the accumulator partitioning
-    auto thr_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCgD = thr_mma.partition_C(gD);                                       // (VEC,THR_M,THR_N)
-    Tensor tCgC = thr_mma.partition_C(gC);                                       // (VEC,THR_M,THR_N)
-    static_assert(is_static<FrgLayout>::value, "Accumulator layout must be static");
-    CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD),
-        "Source and destination must have the same number of elements.");
-    CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators),
-        "Accumulator count must have the same destination element count.");
-    // Make an identity coordinate tensor for predicating our output MN tile
-    auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD))));
-    Tensor tCcD = thr_mma.partition_C(cD);
-    // source is needed
-    if (epilogue_op.is_source_needed()) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(accumulators); ++i) {
-        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
-          tCgD(i) = epilogue_op(accumulators(i), tCgC(i));
-        }
-      }
-    }
-    // source is not needed, avoid load
-    else {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < size(accumulators); ++i) {
-        if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) {
-          tCgD(i) = epilogue_op(accumulators(i));
-        }
-      }
-    }
-  }
-private:
-  Params params;
-};
-} // namespace cutlass::epilogue::collective

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/53_hopper_gemm_permute/permute_kernel.cuh DELETED Viewed

@@ -1,92 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Simple permutation kernel implementation.
-*/
-#include "cutlass/layout/pitch_linear.h"
-#include "cutlass/layout/matrix.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/fast_math.h"
-#include "cute/numeric/numeric_types.hpp"
-namespace example
-{
-/**
- * Assumes column-major input (M mode is contiguous, N mode is strided).
- * For row major, the inputs must be switched accordingly.
-*/
-template<bool Batched, typename Element, typename Permute>
-__global__ void
-permute_kernel(Element const* __restrict__ input,
-               Element* __restrict__ output,
-               Permute permute,
-               int64_t num_elems,
-               cutlass::FastDivmod stride_divmod)
-{
-  // CUTLASS 2.x batched permute functions assume 0 batch stride for target tensor
-  Element const * input_b = input + blockIdx.z * num_elems;
-  Element * output_b = output + (Batched ? 0 : blockIdx.z * num_elems);
-  for (int64_t k = threadIdx.x + blockIdx.x * blockDim.x; k < num_elems; k += blockDim.x * gridDim.x)
-  {
-    int i, j;
-    stride_divmod(j, i, k);
-    output_b[permute(cutlass::PitchLinearCoord(i, j))] = input_b[i + j * stride_divmod.divisor];
-  }
-}
-template<bool Batched, typename Permute, typename Element>
-void permute(Element const* input,
-             Element * output,
-             int64_t num_elems,
-             int stride,
-             int batch_count,
-             cutlass::KernelHardwareInfo const& hw_info)
-{
-  // Upcast to uint128_t data type
-  int factor = 128 / cutlass::sizeof_bits<Element>::value;
-  assert(stride % factor == 0);
-  int stride_upcast = stride/factor;
-  int64_t num_elems_upcast = num_elems / factor;
-  Permute permute_upcast(cutlass::PitchLinearCoord(stride_upcast, int(num_elems_upcast/stride_upcast)), stride_upcast);
-  cutlass::FastDivmod stride_divmod(stride);
-  dim3 blocks(hw_info.sm_count, 1, batch_count);
-  permute_kernel<Batched><<<blocks, 1024>>>(reinterpret_cast<cute::uint128_t const *>(input),
-                                            reinterpret_cast<cute::uint128_t *>(output),
-                                            permute_upcast,
-                                            num_elems_upcast,
-                                            stride_upcast);
-}
-} // namespace example

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/53_hopper_gemm_permute/permute_traits.hpp DELETED Viewed

@@ -1,274 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Additional permutation information for the example.
-*/
-#include "cutlass/layout/permute.h"
-#include "cutlass/gemm/gemm.h"
-namespace example
-{
-using namespace cute;
-// This struct is specialized below for different CUTLASS 2.x permutation ops
-// to describe the operation in terms of target CuTe shape and stride order.
-template<class Permute>
-struct PermuteTraits {};
-// Use X as a placeholder for shape division result
-using X = Underscore;
-// Reshape a rank-2 shape into a multidimensional shape.
-// Input:
-//   shape = (A, B, ...)
-//   target_shape = ((A1, ..., X, ..., Am), (B1, ..., X, ..., Bn), ...)
-// Output:
-//   ((A1, ..., A/prod(A1..Am), ..., Am), (B1, ..., B/prod(B1..Bn), ..., Bn), ...)
-template<class Shape, class TargetShape>
-constexpr auto
-reshape(Shape const& shape, TargetShape const& target_shape)
-{
-  if constexpr (is_tuple<Shape>::value) {
-    return cute::transform(shape, target_shape, [](auto && s, auto && t){ return reshape(s, t); });
-  }
-  else {
-    auto idx = find_if(target_shape, [](auto x){ return is_underscore<decltype(x)>{}; });
-    constexpr int I = decltype(idx)::value;
-    static_assert(I < tuple_size_v<TargetShape>, "Each mode of TargetShape must contain a placeholder X");
-    auto divisors = remove<I>(target_shape);
-    assert(shape % product(divisors) == 0);
-    return replace<I>(target_shape, shape / product(divisors));
-  }
-}
-// Given a tensor layout, compute a permutation layout consisting of:
-// - sub-modes corresponding to the implied multidimensional shape of the source tensor
-// - strides accounting for the permutation operation being performed
-template<class Permute, bool Transpose, class Shape, class Stride>
-constexpr auto
-make_permute_layout(Layout<Shape,Stride> const& layout) {
-  static_assert(cute::rank(Shape{}) == 3, "Only rank-3 layouts are supported");
-  if constexpr (Transpose) {
-    // Deal with tensor B by transposing appropriately before and after computing the permute layout.
-    // Its CuTe-canonical mode order is [N,K,L], while permute operations expect [row,col,batch].
-    return select<1,0,2>(make_permute_layout<Permute, false>(select<1,0,2>(layout)));
-  }
-  else {
-    if constexpr (cutlass::layout::is_trivial_permute<Permute>) {
-      // Special case for NoPermute. Use a depth-2 layout for consistency with other permutations.
-      using ShapeProfile = tuple<tuple<X>, tuple<X>, tuple<X>>;
-      return unflatten(layout, ShapeProfile{});
-    }
-    else {
-      // Here's where the permutation layout is actually built
-      using ShapeProfile = typename PermuteTraits<Permute>::ShapeProfile;
-      using StrideOrder  = typename PermuteTraits<Permute>::StrideOrder;
-      return make_ordered_layout(reshape(layout.shape(), ShapeProfile{}), StrideOrder{});
-    }
-  }
-}
-namespace detail
-{
-template<int I>
-struct is_constant_pred {
-  template <class T>
-  constexpr auto operator()(T) {
-    return is_constant<I, T>{};
-  }
-};
-template<class Permutation, int... I>
-constexpr auto
-inverse_impl(Permutation const & perm, seq<I...>) {
-  return cute::make_tuple(Int<find_if(Permutation{}, is_constant_pred<I>{})>{}...);
-}
-} // namespace detail
-// Compute an inverse of a permutation represented as a tuple of cute::Int<>
-template<class Permutation>
-constexpr auto
-inverse(Permutation const & perm) {
-  auto flat_perm = flatten(perm);
-  return unflatten(detail::inverse_impl(flat_perm, tuple_seq<decltype(flat_perm)>{}), perm);
-}
-template<class T>
-using inverse_t = decltype(inverse(T{}));
-// Given a rank-3 layout of a tensor that is assumed to have been permuted,
-// compute the original rank-3 layout of the tensor prior to the permutation.
-// This is needed to form the correct input to the standalone permutation kernel.
-template<class Permute, bool Transpose, class Shape, class Stride>
-constexpr auto
-make_original_layout(Layout<Shape,Stride> const& layout) {
-  static_assert(cute::rank(Shape{}) == 3, "Only rank-3 layouts are supported");
-  if constexpr (Transpose) {
-    // Deal with tensor B by transposing appropriately before and after computing the permute layout.
-    // Its CuTe-canonical mode order is [N,K,L], while permute operations expect [row,col,batch].
-    return select<1,0,2>(make_original_layout<Permute, false>(select<1,0,2>(layout)));
-  }
-  else {
-    using ShapeProfile = typename PermuteTraits<Permute>::ShapeProfile;
-    auto re_shape   = flatten(reshape(layout.shape(), ShapeProfile{}));
-    using IndexOrder   = typename PermuteTraits<Permute>::IndexOrder;
-    auto orig_shape = transform_leaf(IndexOrder{}, [&](auto i){ return get<i>(re_shape); });
-    using OrigOrder    = conditional_t<cutlass::gemm::detail::is_major<0,Stride>(), seq<0,1,2>, seq<1,0,2>>;
-    // print("Permuted shape: "); print(reshape(layout.shape(), ShapeProfile{})); print("\n");
-    // print("Original shape: "); print(orig_shape); print("\n");
-    return make_ordered_layout(product_each(orig_shape), OrigOrder{});
-  }
-}
-/////////////// Tensor4DPermute0213 ////////////////////
-template<int D1, int D2>
-struct PermuteTraits<cutlass::layout::Tensor4DPermute0213ColumnMajor<D1, D2>>
-{
-  static constexpr bool kBatched = false;
-  using ShapeProfile = Shape<Shape<X,Int<D1>>, Shape<Int<D2>,X>, Shape<X>>;
-  using IndexOrder   = Step<Step<_0,_2>, Step<_1,_3>, Step<_4>>;
-  using StrideOrder = inverse_t<IndexOrder>; // Step<Step<_0,_2>, Step<_1,_3>, Step<_4>>;
-};
-template<int D1, int D2>
-struct PermuteTraits<cutlass::layout::Tensor4DPermute0213ColumnMajorInverse<D1, D2>>
-{
-  static constexpr bool kBatched = false;
-  using ShapeProfile = Shape<Shape<X,Int<D2>>, Shape<Int<D1>,X>, Shape<X>>;
-  using IndexOrder   = Step<Step<_0,_2>, Step<_1,_3>, Step<_4>>;
-  using StrideOrder  = inverse_t<IndexOrder>; // Step<Step<_0,_2>, Step<_1,_3>, Step<_4>>;
-};
-template<int D1, int D2>
-struct PermuteTraits<cutlass::layout::Tensor4DPermute0213RowMajor<D1, D2>>
-{
-  static constexpr bool kBatched = false;
-  using ShapeProfile = Shape<Shape<Int<D1>,X>, Shape<X,Int<D2>>, Shape<X>>;
-  using IndexOrder   = Step<Step<_1,_3>, Step<_0,_2>, Step<_4>>;
-  using StrideOrder  = Step<Step<_1,_3>, Step<_0,_2>, Step<_4>>;
-};
-template<int D1, int D2>
-struct PermuteTraits<cutlass::layout::Tensor4DPermute0213RowMajorInverse<D1, D2>>
-{
-  static constexpr bool kBatched = false;
-  using ShapeProfile = Shape<Shape<Int<D2>,X>, Shape<X,Int<D1>>, Shape<X>>;
-  using IndexOrder   = Step<Step<_1,_3>, Step<_0,_2>, Step<_4>>;
-  using StrideOrder  = Step<Step<_1,_3>, Step<_0,_2>, Step<_4>>;
-};
-/////////////// Tensor4DPermuteBMM0321 ////////////////////
-template<int D>
-struct PermuteTraits<cutlass::layout::Tensor4DPermuteBMM0321ColumnMajor<D>>
-{
-  static constexpr bool kBatched = true;
-  using ShapeProfile = Shape<Shape<X>, Shape<X>, Shape<Int<D>,X>>;
-  using IndexOrder   = Step<Step<_0,_2>, Step<_1>, Step<_3>>;
-  using StrideOrder  = Step<Step<_0>, Step<_2>, Step<_1,_3>>;
-};
-template<int D>
-struct PermuteTraits<cutlass::layout::Tensor4DPermuteBMM0321ColumnMajorInverse<D>>
-{
-  static constexpr bool kBatched = true;
-  using ShapeProfile = Shape<Shape<X,Int<D>>, Shape<X>, Shape<X>>;
-  using IndexOrder   = Step<Step<_0>, Step<_2>, Step<_1,_3>>;
-  using StrideOrder  = Step<Step<_0,_2>, Step<_1>, Step<_3>>;
-};
-/////////////// Tensor4DPermuteBMM0213 ////////////////////
-template<int D>
-struct PermuteTraits<cutlass::layout::Tensor4DPermuteBMM0213RowMajor<D>>
-{
-  static constexpr bool kBatched = true;
-  using ShapeProfile = Shape<Shape<X>, Shape<X>, Shape<Int<D>,X>>;
-  using IndexOrder   = Step<Step<_0>, Step<_1,_2>, Step<_3>>;
-  using StrideOrder  = Step<Step<_2>, Step<_0>, Step<_1,_3>>;
-};
-template<int D>
-struct PermuteTraits<cutlass::layout::Tensor4DPermuteBMM0213RowMajorInverse<D>>
-{
-  static constexpr bool kBatched = true;
-  using ShapeProfile = Shape<Shape<X>, Shape<X,Int<D>>, Shape<X>>;
-  using IndexOrder   = Step<Step<_0>, Step<_1>, Step<_2,_3>>;
-  using StrideOrder  = Step<Step<_1>, Step<_0,_2>, Step<_3>>;
-};
-/////////////// Tensor5DPermute02413 ////////////////////
-template<int D1, int D2, int D3>
-struct PermuteTraits<cutlass::layout::Tensor5DPermute02413ColumnMajor<D1, D2, D3>>
-{
-  static constexpr bool kBatched = false;
-  using ShapeProfile = Shape<Shape<X,Int<D1>>, Shape<Int<D2>,Int<D3>,X>, Shape<X>>;
-  using IndexOrder   = Step<Step<_0,_2>, Step<_4,_1,_3>, Step<_5>>;
-  using StrideOrder  = inverse_t<IndexOrder>; // Step<Step<_0,_3>, Step<_1,_4,_2>, Step<_5>>;
-};
-template<int D1, int D2, int D3>
-struct PermuteTraits<cutlass::layout::Tensor5DPermute02413ColumnMajorInverse<D1, D2, D3>>
-{
-  static constexpr bool kBatched = false;
-  using ShapeProfile = Shape<Shape<X,Int<D2>>, Shape<X,Int<D1>,Int<D3>>, Shape<X>>;
-  using IndexOrder   = Step<Step<_0,_3>, Step<_1,_4,_2>, Step<_5>>;
-  using StrideOrder  = inverse_t<IndexOrder>; // Step<Step<_0,_2>, Step<_4,_1,_3>, Step<_5>>;
-};
-/////////////// Tensor5DPermute20314 ////////////////////
-template<int D1, int D2, int D3>
-struct PermuteTraits<cutlass::layout::Tensor5DPermute20314RowMajor<D1, D2, D3>>
-{
-  static constexpr bool kBatched = false;
-  using ShapeProfile = Shape<Shape<Int<D1>,X>, Shape<X,Int<D3>,Int<D2>>, Shape<X>>;
-  using IndexOrder   = Step<Step<_2,_0>, Step<_3,_1,_4>, Step<_5>>;
-  using StrideOrder  = Step<Step<_1,_3>, Step<_0,_2,_4>, Step<_5>>;
-};
-template<int D1, int D2, int D3>
-struct PermuteTraits<cutlass::layout::Tensor5DPermute20314RowMajorInverse<D1, D2, D3>>
-{
-  static constexpr bool kBatched = false;
-  using ShapeProfile = Shape<Shape<X,Int<D2>>, Shape<X,Int<D1>,Int<D3>>, Shape<X>>;
-  using IndexOrder   = Step<Step<_3,_0>, Step<_2,_4,_1>, Step<_5>>;
-  using StrideOrder  = Step<Step<_4,_2>, Step<_0,_3,_1>, Step<_5>>;
-};
-} // namespace example

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp DELETED Viewed

@@ -1,129 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-// Command line options parsing
-template<typename RasterOrderOptions>
-struct Options {
-  bool help = false;
-  float alpha = 1.f, beta = 0.f;
-  float scale_a = 1.f, scale_b = 1.f, scale_c = 1.f, scale_d = 1.f, scale_aux = 1.f;
-  bool device_scale = false;
-  bool save_aux = true;
-  bool save_amax = true;
-  int iterations = 1000;
-  int m = 1024, n = 512, k = 1024, l = 1;
-  RasterOrderOptions raster;
-  int swizzle;
-  // Parses the command line
-  void parse(int argc, char const **args) {
-    cutlass::CommandLine cmd(argc, args);
-    if (cmd.check_cmd_line_flag("help")) {
-      help = true;
-      return;
-    }
-    cmd.get_cmd_line_argument("m", m);
-    cmd.get_cmd_line_argument("n", n);
-    cmd.get_cmd_line_argument("k", k);
-    cmd.get_cmd_line_argument("l", l);
-    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
-    cmd.get_cmd_line_argument("beta", beta, 0.f);
-    cmd.get_cmd_line_argument("scale_a", scale_a, 1.f);
-    cmd.get_cmd_line_argument("scale_b", scale_b, 1.f);
-    cmd.get_cmd_line_argument("scale_c", scale_c, 1.f);
-    cmd.get_cmd_line_argument("scale_d", scale_d, 1.f);
-    cmd.get_cmd_line_argument("scale_aux", scale_aux, 1.f);
-    cmd.get_cmd_line_argument("device_scale", device_scale, false);
-    cmd.get_cmd_line_argument("save_aux", save_aux, true);
-    cmd.get_cmd_line_argument("save_amax", save_amax, true);
-    cmd.get_cmd_line_argument("iterations", iterations);
-    char raster_char;
-    cmd.get_cmd_line_argument("raster", raster_char);
-    if (raster_char == 'N' || raster_char == 'n') {
-      raster = RasterOrderOptions::AlongN;
-    }
-    else if (raster_char == 'M' || raster_char == 'm') {
-      raster = RasterOrderOptions::AlongM;
-    }
-    else if (raster_char == 'H' || raster_char == 'h') {
-      raster = RasterOrderOptions::Heuristic;
-    }
-    cmd.get_cmd_line_argument("swizzle", swizzle, 1);
-  }
-  /// Prints the usage statement.
-  std::ostream & print_usage(std::ostream &out) const {
-    out << "54_fp8_hopper_warp_specialized_gemm\n\n"
-      << "  Hopper FP8 GEMM using a Warp Specialized kernel.\n\n"
-      << "Options:\n\n"
-      << "  --help                      If specified, displays this usage statement\n\n"
-      << "  --m=<int>                   Sets the M extent of the GEMM\n"
-      << "  --n=<int>                   Sets the N extent of the GEMM\n"
-      << "  --k=<int>                   Sets the K extent of the GEMM\n"
-      << "  --l=<int>                   Sets the l extent (batch) of the GEMM\n"
-      << "  --alpha=<f32>               Epilogue scalar alpha\n"
-      << "  --beta=<f32>                Epilogue scalar beta\n"
-      << "  --scale_a=<f32>             Scaling factor for A\n"
-      << "  --scale_b=<f32>             Scaling factor for B\n"
-      << "  --scale_c=<f32>             Scaling factor for C\n"
-      << "  --scale_d=<f32>             Scaling factor for D (ignored for non-fp8 D)\n"
-      << "  --scale_aux=<f32>           Scaling factor for the auxiliary tensor (ignored for non-fp8 aux)\n"
-      << "  --device_scale=<bool>       Copy scalars to device memory before kernel launch (default: false)\n"
-      << "  --save_aux=<bool>           Save the pre-activation as an auxiliary tensor (default: true)\n"
-      << "  --save_amax=<bool>          Save the pre-scaled max absolute value of any fp8 outputs (aux and/or D) (default: true)\n"
-      << "  --raster=<char>             CTA Rasterization direction (N for along N, M for along M, and H for heuristic)\n\n"
-      << "  --swizzle=<int>             CTA Rasterization swizzle\n\n"
-      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
-    out
-      << "\n\nExamples:\n\n"
-      << "$ " << "54_fp8_hopper_warp_specialized_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
-    return out;
-  }
-  /// Compute performance in GFLOP/s
-  double gflops(double runtime_s) const
-  {
-    // Two flops per multiply-add
-    uint64_t flop = uint64_t(2) * m * n * k;
-    double gflop = double(flop) / double(1.0e9);
-    return gflop / runtime_s;
-  }
-};

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp DELETED Viewed

@@ -1,246 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/epilogue/collective/default_epilogue.hpp"
-#include "cutlass/epilogue/collective/collective_builder.hpp"
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-#include "cutlass/util/command_line.h"
-#include "cutlass/util/reference/device/tensor_fill.h"
-#include "cutlass/util/reference/device/tensor_compare.h"
-#include "cute/tensor.hpp"
-#include <cuda.h>
-#include <numeric>
-#include "helper.h"
-enum MixedDtypeGemmMode {
-  ConvertOnly,
-  ScaleOnly,
-  ScaleWithZeroPoint
-};
-/// Command line options parsing
-struct MixedDtypeOptions {
-  bool help = false;
-  float alpha = 1.0f;
-  float beta = 0.0f;
-  int iterations = 100;
-  int warmup = 10;
-  int mode = 1;
-  int m = 5120, n = 4096, k = 4096;
-  int g = 128;
-  int l = 1;
-  // Parses the command line
-  void parse(int argc, char const **args) {
-    cutlass::CommandLine cmd(argc, args);
-    if (cmd.check_cmd_line_flag("help")) {
-      help = true;
-      return;
-    }
-    cmd.get_cmd_line_argument("m", m);
-    cmd.get_cmd_line_argument("n", n);
-    cmd.get_cmd_line_argument("k", k);
-    cmd.get_cmd_line_argument("l", l);
-    cmd.get_cmd_line_argument("g", g);
-    cmd.get_cmd_line_argument("mode", mode);
-    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
-    cmd.get_cmd_line_argument("beta", beta, 0.f);
-    cmd.get_cmd_line_argument("iterations", iterations);
-    cmd.get_cmd_line_argument("warmup", warmup);
-  }
-  /// Prints the usage statement.
-  std::ostream & print_usage(std::ostream &out) const {
-    out << "55_hopper_mixed_dtype_gemm\n\n"
-      << "  Hopper Mixed Data Type GEMM using a Warp Specialized kernel.\n\n"
-      << "Options:\n\n"
-      << "  --help                      If specified, displays this usage statement\n\n"
-      << "  --m=<int>                   Sets the M extent of the GEMM\n"
-      << "  --n=<int>                   Sets the N extent of the GEMM\n"
-      << "  --k=<int>                   Sets the K extent of the GEMM\n"
-      << "  --l=<int>                   The number of independent gemm problems with mnk shape\n"
-      << "  --g=<int>                   The size of each group for the scales and zeros. To broadcast a vector of scales or zeros, set the group size to K.\n"
-      << "  --mode=<int>                The mode to run the gemm. 0 does (A @ B), 1 means A @ (scale * B), 2 means A @ (scale * B + zero-point).\n"
-      << "  --alpha=<f32>               Epilogue scalar alpha\n"
-      << "  --beta=<f32>                Epilogue scalar beta\n\n"
-      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n"
-      << "  --warmup=<int>              Number of warmup iterations to perform.\n\n";
-    out
-      << "\n\nExamples:\n\n"
-      << "$ " << "55_hopper_mixed_dtype_gemm" << " --m=1024 --n=512 --k=1024 -g=1024 --l=10 --alpha=2 --mode=2 --beta=0.707 \n\n";
-    return out;
-  }
-  /// Compute performance in GFLOP/s
-  double gflops(double runtime_s) const
-  {
-    // Two flops per multiply-add
-    uint64_t flop = uint64_t(2) * m * n * k * l;
-    double gflop = double(flop) / double(1.0e9);
-    return gflop / runtime_s;
-  }
-};
-/// Result structure
-struct MixedDtypeResult
-{
-  double avg_runtime_ms = 0.0;
-  double gflops = 0.0;
-  cutlass::Status status = cutlass::Status::kSuccess;
-  cudaError_t error = cudaSuccess;
-  bool passed = false;
-};
-/// Profiling Loop
-template <class Gemm>
-void mixed_dtype_profiling(
-  Gemm& gemm,
-  MixedDtypeOptions const& options,
-  MixedDtypeResult& result) {
-  if (options.iterations <= 0) return;
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  std::vector<float> runtimes;
-  runtimes.reserve(options.iterations);
-  for (int iter = 0; iter < options.warmup + options.iterations; ++iter) {
-    cudaEventRecord(start);
-    CUTLASS_CHECK(gemm.run());
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    if (iter >= options.warmup) {
-      float milliseconds = 0;
-      cudaEventElapsedTime(&milliseconds, start, stop);
-      runtimes.push_back(milliseconds);
-    }
-  }
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-  // Compute average setup and runtime and GFLOPs.
-  result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size();
-  result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);
-  std::cout << "  Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
-  std::cout << "  Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
-  std::cout << "  GFLOPS: " << result.gflops << std::endl;
-}
-/// Helpers to initialize a block of device data
-template <class Element>
-bool initialize_tensor(
-  cutlass::DeviceAllocation<Element>& block,
-  uint64_t seed = 2023) {
-  double scope_max, scope_min;
-  int bits_input = cutlass::sizeof_bits<Element>::value;
-  int bits_output = cutlass::sizeof_bits<Element>::value;
-  if (bits_input == 1) {
-    scope_max = 2;
-    scope_min = 0;
-  }
-  else if (bits_input <= 8) {
-    scope_max = 2;
-    scope_min = -2;
-  }
-  else if (bits_output == 16) {
-    scope_max = 5;
-    scope_min = -5;
-  }
-  else {
-    scope_max = 8;
-    scope_min = -8;
-  }
-  cutlass::reference::device::BlockFillRandomUniform(
-      block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
-  return true;
-}
-template <class Element>
-bool initialize_scale(
-  cutlass::DeviceAllocation<Element>& block,
-  MixedDtypeOptions const& options,
-  uint64_t seed = 2023) {
-  // If no scales, initialize with 1 so we can use the same kernel to dequantize the data
-  float scope_max = 1.0f, scope_min = 1.0f;
-  if (options.mode != MixedDtypeGemmMode::ConvertOnly) {
-    float elt_max_f = float(cutlass::platform::numeric_limits<Element>::max());
-    scope_max = 2.f;
-    scope_min = 0.1f;
-  }
-  cutlass::reference::device::BlockFillRandomUniform(
-    block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
-  return true;
-}
-template <class Element>
-bool initialize_zero(
-  cutlass::DeviceAllocation<Element>& block,
-  MixedDtypeOptions const& options,
-  uint64_t seed = 2023) {
-  // If no bias, initialize with 0 so we can use the same kernel to dequantize the data
-  float scope_max = 0.0f, scope_min = 0.0f;
-  if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) {
-    scope_max = 2.0f;
-    scope_min = -2.0f;
-  }
-  cutlass::reference::device::BlockFillRandomUniform(
-    block.get(), block.size(), seed, Element(scope_max), Element(scope_min));
-  return true;
-}

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h DELETED Viewed

@@ -1,320 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cute/tensor.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/atom/copy_atom.hpp"
-#include <random>
-#include "cutlass/util/print_error.hpp"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/collective/collective_mma.hpp"
-using namespace cute;
-struct AmpereUnpredicatedFprop {
-  //
-  // Static config for conv problem shape
-  //
-  using D = _6;
-  using H = _4;
-  using W = _4;
-  using T = _3;
-  using R = _3;
-  using S = _3;
-  using Z = _4;
-  using P = _2;
-  using Q = _2;
-  using C = _64;
-  using K = _128;
-  // Tiler config
-  using Tiler_K = decltype(cute::min(K{}, _128{}));
-  using Tiler_C = decltype(cute::min(C{}, _32{}));
-  using Tiler_N = _4;
-  using TileM = Tiler_K;
-  using TileN = Shape<Tiler_N, Z, P, Q>;
-  using TileK = Shape<Tiler_C,_1,_1,_1>;
-  using PIPE  = _3;
-  using TilerFlt = Shape<TileM, TileK>;
-  using TilerAct = Shape<TileN, TileK>;
-  using TilerOut = Shape<TileM, TileN>;
-  using TileSizeM = Int<size(TileM{})>;
-  using TileSizeN = Int<size(TileN{})>;
-  using TileSizeK = Int<size(TileK{})>;
-  static constexpr int Stages = PIPE::value;
-  using ElementFlt = tfloat32_t;
-  using ElementAct = tfloat32_t;
-  using ElementOut = float;
-  using TiledMma = TiledMMA<
-    MMA_Atom<SM80_16x8x8_F32TF32TF32F32_TN>,
-    Layout<Shape<_2,_2,_1>>,
-    Tile<_32,_32,Underscore>>;
-  static constexpr int MaxThreadsPerBlock = size(TiledMma{});
-  static constexpr int MinBlocksPerMultiprocessor = 1;
-  union SharedStorage {
-    struct {
-      ElementFlt sAMatrix[size(TileM{}) * size(TileK{}) * size(PIPE{})];
-      ElementAct sBMatrix[size(TileN{}) * size(TileK{}) * size(PIPE{})];
-    } mainloop;
-    struct {
-      ElementOut sCMatrix[size(TileM{}) * size(TileN{})];
-    } epilogue;
-  };
-  //
-  // Stencil tensor
-  //
-  using GmemLayoutFlt = decltype(make_ordered_layout(
-    Shape< K, Shape< C, T, R, S>>{},
-    tuple<_4, tuple<_0,_3,_2,_1>>{}));
-  // We have 64 elements * 32b each in the major mode that we can vectorize
-  // Max vector size is 128b, so lay 16 threads along the major mode with a vector size of 4
-  // Rest along the minor mode
-  using GmemTiledCopyFlt = decltype(make_tiled_copy(
-    Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, ElementFlt>{},
-    Layout<Shape <_16, _8>,
-           Stride< _8, _1>>{},
-    Layout<Shape < _1, _4>>{}));
-  // Following layout is also correct, but trades off dynamic strides in the slice for bank conflict free accesses
-  // using SmemLayoutFlt = decltype(
-  //     composition(Swizzle<3,2,3>{},
-  //                 make_ordered_layout(
-  //                     Shape<TileSizeM,TileSizeK,PIPE>{},
-  //                     tuple<       _1,       _0,  _2>{})));
-  using SmemLayoutAtomFlt = decltype(
-    composition(Swizzle<1,2,3>{},
-                Layout<Shape <_8,Shape <_4, _2>>,
-                       Stride<_4,Stride<_1,_32>>>{}));
-  using SmemCopyAtomFlt = Copy_Atom<SM75_U32x4_LDSM_N, ElementFlt>;
-  //
-  // Activation tensor
-  //
-  // Activation tensor is major in the contraction mode, so vectorize that mode first
-  // Then lay out the rest of the threads along the other mode
-  using GmemTiledCopyAct = decltype(make_tiled_copy(
-    Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, ElementAct>{},
-    Layout<Shape <_16, _8>,
-           Stride< _8, _1>>{},
-    Layout<Shape < _1, _4>>{}));
-  // Following layout is also correct, but trades off dynamic strides in the slice for bank conflict free accesses
-  // using SmemLayoutAct = decltype(
-  //     composition(Swizzle<3,2,3>{},
-  //                 make_ordered_layout(
-  //                     Shape<TileSizeN,TileSizeK,PIPE>{},
-  //                     tuple<       _1,       _0,  _2>{})));
-  using SmemLayoutAtomAct = decltype(
-    composition(Swizzle<1,2,3>{},
-                Layout<Shape <_8,Shape <_4, _2>>,
-                       Stride<_4,Stride<_1,_32>>>{}));
-  using SmemCopyAtomAct = Copy_Atom<SM75_U32x4_LDSM_N, ElementAct>;
-  //
-  // Output tensor
-  //
-  using GmemTiledCopyOut = decltype(make_tiled_copy(
-    Copy_Atom<UniversalCopy<uint128_t>, ElementAct>{},
-    Layout<Shape <_8, _16>,
-           Stride<_1,  _8>>{},
-    Layout<Shape <_4,  _1>>{}));
-  using SmemCopyAtomOut = Copy_Atom<UniversalCopy<uint32_t>, ElementOut>;
-  // This can be optimized to make accesses BCF, but we use a col-major layout here to show off composability
-  using SmemLayoutOut = Layout<Shape<TileSizeM, TileSizeN>>;
-  //
-  // Conv functor
-  //
-  template <class EngineFlt, class TensorActivation, class TensorOutput>
-  void __device__
-  operator()(cute::Tensor<EngineFlt, GmemLayoutFlt> mFlt, // ( K,        (C,T,R,S))
-             TensorActivation                       mAct, // ((N,Z,P,Q), (C,T,R,S))
-             TensorOutput                           mOut, // ( K,        (N,Z,P,Q))
-             char* smem_buf) const {
-    using namespace cute;
-    using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveMma<
-        cutlass::gemm::MainloopSm80CpAsyncUnpredicated<PIPE::value>,
-        Shape<TileM,TileN,TileK>,
-        ElementFlt,
-        Underscore, // Ignore the stride, we are passing full cute::Tensor to operator()
-        ElementAct,
-        Underscore, // Ignore the stride, we are passing full cute::Tensor to operator()
-        TiledMma,
-        GmemTiledCopyFlt,
-        SmemLayoutAtomFlt,
-        SmemCopyAtomFlt,
-        cute::identity,
-        GmemTiledCopyAct,
-        SmemLayoutAtomAct,
-        SmemCopyAtomAct,
-        cute::identity>;
-    TiledMma tiled_mma;
-    Tensor accum = partition_fragment_C(tiled_mma, TilerOut{});
-    clear(accum);
-    // Set up tensors
-    // NOTE: blockIdx.x projects onto act-NDHW mode, y along the flt-K mode for the sake of higher dynamic range in NDHW
-    Tensor gA_mk = local_tile(mFlt, TilerFlt{}, make_coord(_,_));                              // (BLK_M,BLK_K,m',k')
-    Tensor gB_nk = local_tile(mAct, TilerAct{}, make_coord(_,_));                              // (BLK_N,BLK_K,n',_1)
-    Tensor gC_mn = local_tile(mOut, TilerOut{}, make_coord(_,_));                              // (BLK_M,BLK_N,m',n')
-    // Compute m_coord and n_coord with their post-tiled shapes
-    auto m_coord = idx2crd(int(blockIdx.y), shape<2>(gA_mk));
-    auto n_coord = idx2crd(int(blockIdx.x), shape<2>(gB_nk));
-    Tensor gA = gA_mk(_,_,m_coord,_);                                                          // (BLK_M,BLK_K,k')
-    Tensor gB = gB_nk(_,_,n_coord,_);                                                          // (BLK_N,BLK_K,_1)
-    Tensor gC = gC_mn(_,_,m_coord,n_coord);                                                    // (BLK_M,BLK_N)
-    auto k_tile_iter = cute::make_coord_iterator(size<2>(gA));
-    int k_tile_count = size<2>(gA);
-    CollectiveMainloop collective_mma;
-    collective_mma(
-      accum,
-      gA,
-      gB,
-      accum,
-      k_tile_iter, k_tile_count,
-      Underscore{}, // no residue since we do not support predication
-      threadIdx.x,
-      smem_buf);
-    //
-    // Epilogue
-    //
-    SharedStorage& storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    Tensor sC = make_tensor(make_smem_ptr(&storage.epilogue.sCMatrix[0]), SmemLayoutOut{});
-    auto smem_tiled_copy_C = make_tiled_copy_C(SmemCopyAtomOut{}, tiled_mma);
-    auto smem_thr_copy_C = smem_tiled_copy_C.get_slice(threadIdx.x);
-    auto tCrC = smem_thr_copy_C.retile_S(accum);
-    auto tCsC = smem_thr_copy_C.partition_D(sC);
-    copy(smem_tiled_copy_C, tCrC, tCsC);
-    __syncthreads();
-    GmemTiledCopyOut gmem_tiled_copy_C;
-    auto gmem_thr_copy_C = gmem_tiled_copy_C.get_slice(threadIdx.x);
-    auto tDsC = gmem_thr_copy_C.partition_S(sC);
-    auto tDgC = gmem_thr_copy_C.partition_D(gC);
-    copy(gmem_tiled_copy_C, tDsC, tDgC);
-    #if 0
-      if (thread0()) {
-        print("mAct = "); print(mAct);          print('\n');
-        print("mFlt = "); print(mFlt);          print('\n');
-        print("mOut = "); print(mOut);          print('\n');
-        print("gA   = "); print(gA);            print('\n');
-        print("gB   = "); print(gB);            print('\n');
-        print("gC   = "); print(gC);            print('\n');
-        print("sA   = "); print(sA.layout());   print('\n');
-        print("sB   = "); print(sB.layout());   print('\n');
-        print("sC   = "); print(sC.layout());   print('\n');
-        print("tAgA = "); print(tAgA.layout()); print('\n');
-        print("tBgB = "); print(tBgB.layout()); print('\n');
-        print("tAsA = "); print(tAsA.layout()); print('\n');
-        print("tBsB = "); print(tBsB.layout()); print('\n');
-        print("tCsA = "); print(tCsA.layout()); print('\n');
-        print("tCsB = "); print(tCsB.layout()); print('\n');
-        print("tCrC = "); print(tCrC.layout()); print('\n');
-        print("tCsC = "); print(tCsC.layout()); print('\n');
-        print("tDsC = "); print(tDsC.layout()); print('\n');
-        print("tDgC = "); print(tDgC.layout()); print('\n');
-        print("gmem tiled copy A = "); print(gmem_tiled_copy_A); print('\n');
-        print("gmem tiled copy B = "); print(gmem_tiled_copy_B); print('\n');
-        print("gmem tiled copy C = "); print(gmem_tiled_copy_C); print('\n');
-        print("k_tile_count = "); print(size<2>(gA)); print('\n');
-        print("k_tile_iter  = "); print(*k_tile_iter); print('\n');
-        print("K_BLOCK_MAX  = "); print(K_BLOCK_MAX); print('\n');
-    }
-    #endif
-  }
-};
-template <class TensorFlt, class TensorAct, class TensorOut>
-inline int
-fprop_reference(
-    TensorFlt mStencil,    // Logical MK: ( K,        (C,T,R,S))
-    TensorAct mActivation, // Logical NK: ((N,Z,P,Q), (C,T,R,S))
-    TensorOut mOutput,     // Logical MN: ( K,        (N,Z,P,Q))
-    TensorOut mOutputRef) {
-  int32_t N = size<1,0>(mOutputRef);
-  int32_t Z = size<1,1>(mOutputRef);
-  int32_t P = size<1,2>(mOutputRef);
-  int32_t Q = size<1,3>(mOutputRef);
-  int32_t T = size<1,3>(mStencil);
-  int32_t R = size<1,2>(mStencil);
-  int32_t S = size<1,1>(mStencil);
-  int32_t C = size<1,0>(mStencil);
-  size_t K    = static_cast<size_t>(size<0>(mOutputRef));
-  size_t NZPQ = static_cast<size_t>(size<1>(mOutputRef));
-  size_t CTRS = static_cast<size_t>(size<1>(mStencil));
-#if defined(_OPENMP)
-  #pragma omp parallel for
-#endif
-  for (size_t logical_m = 0; logical_m < K; ++logical_m) {
-    for (size_t logical_n = 0; logical_n < NZPQ; ++logical_n) {
-      auto accumulator = float(0);
-      for (size_t logical_k = 0; logical_k < CTRS; ++logical_k) {
-        accumulator += mStencil(logical_m, logical_k) * mActivation(logical_n, logical_k);
-      }
-      mOutputRef(logical_m, logical_n) = accumulator;
-    }
-  }
-  return print_relative_error(mOutput, mOutputRef,  /*print_verbose*/ false,  /*print_error*/ true, /*error_margin*/ 0.01);
-}

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp DELETED Viewed

@@ -1,242 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/gemm/collective/collective_builder.hpp"
-#include "dispatch_policy_extra.hpp"
-#include "sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp"
-#include "../pipeline/prefetch_pipeline_sm90.hpp"
-namespace cutlass::gemm::collective {
-namespace detail {
-/* Manual-override overload: selected when the caller passes StageCount<stages>.
- * The requested `stages` value is returned unchanged; CapacityBytes, the element
- * types and the tile shape are not consulted by this overload.
- */
-template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int stages>
-constexpr int
-compute_stage_count_or_override_prefetch(StageCount<stages> stage_count) {
-  return stages;  // explicit stage count wins; no capacity computation is done
-}
-// Automatic stage-count overload (selected by StageCountAutoCarveout<carveout_bytes>).
-//
-// Starts from CapacityBytes, subtracts the caller's carveout, the prefetch buffer
-// (MK_bytes per PrefetchStagesActual) and the prefetcher pipeline's shared storage, and
-// divides the remainder by the footprint of one mainloop stage (one A tile, one B tile and
-// the shared storage of a single-stage PipelineTmaAsync).
-//
-// Every byte count is converted to int before the subtraction: sizeof() yields size_t, and
-// letting it into the expression would make the whole budget unsigned, so an over-subscribed
-// configuration would wrap to a huge positive value instead of yielding a count <= 0.
-template<int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, int carveout_bytes>
-constexpr int
-compute_stage_count_or_override_prefetch(StageCountAutoCarveout<carveout_bytes> stage_count) {
-  constexpr int mainloop_pipeline_bytes = static_cast<int>(sizeof(typename cutlass::PipelineTmaAsync<1>::SharedStorage));
-  constexpr int prefetch_pipeline_bytes = static_cast<int>(sizeof(typename cutlass::detail::PrefetcherPipelineSharedStorage<PrefetchStages>));
-  constexpr auto a_bits = cute::sizeof_bits_v<ElementA>;
-  constexpr auto b_bits = cute::sizeof_bits_v<ElementB>;
-  constexpr int MK_bytes = cutlass::bits_to_bytes(a_bits * size<0>(TileShapeMNK{}) * size<2>(TileShapeMNK{})); //also the prefetch smem size
-  constexpr int NK_bytes = cutlass::bits_to_bytes(b_bits * size<1>(TileShapeMNK{}) * size<2>(TileShapeMNK{}));
-  constexpr int stage_bytes = MK_bytes + NK_bytes + mainloop_pipeline_bytes;
-  // Signed budget left for mainloop stages once the fixed reservations are removed.
-  constexpr int available_bytes = CapacityBytes - carveout_bytes - MK_bytes * PrefetchStagesActual - prefetch_pipeline_bytes;
-  return available_bytes / stage_bytes;
-}
-} // namespace detail
-/* GMMA_TMA_WS_FP8_FAST_ACCUM_SS + prefetch
- *
- * CollectiveBuilder partial specialization for arch::Sm90 / arch::OpClassTensorOp that is
- * enabled only when KernelScheduleType is KernelTmaWarpSpecializedFP8FastAccumWithPrefetch.
- * It derives the tiled MMA, the gmem copy atoms and smem layout atoms for A and B, the
- * pipeline stage count and the dispatch policy, and exposes the assembled mainloop as
- * `CollectiveOp`.
- */
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedFP8FastAccumWithPrefetch>>  // SFINAE gate on the schedule tag
-> {
-  static_assert(is_static<TileShape_MNK>::value);  // tile shape must be a compile-time shape
-  static_assert(is_static<ClusterShape_MNK>::value);  // cluster shape must be a compile-time shape
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Not meet TMA alignment requirement yet\n");
-  static_assert(detail::is_input_fp8<ElementA, ElementB>(),
-                "Only FP8 datatypes are compatible with these kernel schedules\n");
-  // Dispatch TN fp8 kernels only to TMA warp specialized FP8 builder (configurations that would source A from registers are rejected)
-  static_assert(!detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>(),
-                 "Not supported for fp8 non-TN warp specialized kernels yet\n");
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutATag>();  // major-ness of A derived from its layout tag
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutBTag>();  // major-ness of B derived from its layout tag
-  using AtomLayoutMNK = Layout<Shape<_1,_1,_1>>;  // a single MMA atom, no replication
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));  // A's TMA atom is chosen from cluster mode 1
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));  // B's TMA atom is chosen from cluster mode 0
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override_prefetch<detail::sm90_smem_capacity_bytes,
-      ElementA, ElementB, TileShape_MNK>(StageCountType{});  // overload is picked by the type of StageCountType
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedWithPrefetch<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
-  using SmemCopyAtomA = void;  // no smem copy atom is supplied for A
-  using SmemCopyAtomB = void;  // no smem copy atom is supplied for B
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-/* GMMA_TMA_WS_FP8_FAST_ACCUM_SS + prefetch and split DMA warps
- *
- * CollectiveBuilder partial specialization for arch::Sm90 / arch::OpClassTensorOp that is
- * enabled only when KernelScheduleType is
- * KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA. The schedule tag is forwarded
- * into the dispatch policy; the assembled mainloop is exposed as `CollectiveOp`.
- */
-template <
-  class ElementA,
-  class GmemLayoutATag,
-  int AlignmentA,
-  class ElementB,
-  class GmemLayoutBTag,
-  int AlignmentB,
-  class ElementAccumulator,
-  class TileShape_MNK,
-  class ClusterShape_MNK,
-  class StageCountType,
-  class KernelScheduleType
->
-struct CollectiveBuilder<
-    arch::Sm90,
-    arch::OpClassTensorOp,
-    ElementA,
-    GmemLayoutATag,
-    AlignmentA,
-    ElementB,
-    GmemLayoutBTag,
-    AlignmentB,
-    ElementAccumulator,
-    TileShape_MNK,
-    ClusterShape_MNK,
-    StageCountType,
-    KernelScheduleType,
-    cute::enable_if_t<
-      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA>>  // SFINAE gate on the schedule tag
-> {
-  static_assert(is_static<TileShape_MNK>::value);  // tile shape must be a compile-time shape
-  static_assert(is_static<ClusterShape_MNK>::value);  // cluster shape must be a compile-time shape
-  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
-                "Not meet TMA alignment requirement yet\n");
-  static_assert(detail::is_input_fp8<ElementA, ElementB>(),
-                "Only FP8 datatypes are compatible with these kernel schedules\n");
-  // Dispatch TN fp8 kernels only to TMA warp specialized FP8 builder (configurations that would source A from registers are rejected)
-  static_assert(!detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>(),
-                 "Not supported for fp8 non-TN warp specialized kernels yet\n");
-#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
-  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
-#endif
-  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutATag>();  // major-ness of A derived from its layout tag
-  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutBTag>();  // major-ness of B derived from its layout tag
-  using AtomLayoutMNK = Layout<Shape<_1,_1,_1>>;  // a single MMA atom, no replication
-  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
-      ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
-  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));  // A's TMA atom is chosen from cluster mode 1
-  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));  // B's TMA atom is chosen from cluster mode 0
-  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
-      GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
-      GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
-  static constexpr int PipelineStages = detail::compute_stage_count_or_override_prefetch<detail::sm90_smem_capacity_bytes,
-      ElementA, ElementB, TileShape_MNK>(StageCountType{});  // overload is picked by the type of StageCountType
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedWithPrefetch<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
-  using SmemCopyAtomA = void;  // no smem copy atom is supplied for A
-  using SmemCopyAtomB = void;  // no smem copy atom is supplied for B
-  using CollectiveOp = CollectiveMma<
-      DispatchPolicy,
-      TileShape_MNK,
-      ElementA,
-      TagToStrideA_t<GmemLayoutATag>,
-      ElementB,
-      TagToStrideB_t<GmemLayoutBTag>,
-      TiledMma,
-      GmemTiledCopyA,
-      SmemLayoutAtomA,
-      SmemCopyAtomA,
-      cute::identity,
-      GmemTiledCopyB,
-      SmemLayoutAtomB,
-      SmemCopyAtomB,
-      cute::identity
-    >;
-};
-} // namespace cutlass::gemm::collective
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp DELETED Viewed

@@ -1,61 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-namespace cutlass::gemm {
-// Schedule tag for the standard non-persistent kernel with a single producer warp, and one prefetch warp.
-// `A` is assumed to be static, and therefore the producer warp for `A` attempts to load `A`
-// while the producer warp is waiting on griddepcontrol. NOTE(review): with only one producer warp, "the producer warp for `A`" presumably means the prefetch warp -- confirm.
-// GDC `launch_dependent_grids` is issued from the producer warp instead of math warps, and
-// according to prefetch ratio.
-struct KernelTmaWarpSpecializedFP8FastAccumWithPrefetch { };  // empty type; carries no state and is only used as a template argument
-// Schedule tag for the non-persistent kernel with two producer warps (one for each of A and B), and one prefetch warp.
-// `A` is assumed to be static, and therefore the producer warp for `A` attempts to load `A`
-// while the producer warp for `B` is waiting on griddepcontrol. Producer warp for `A` does not
-// wait on griddepcontrol and loads immediately.
-struct KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA { };  // empty type; carries no state and is only used as a template argument
-template<  // Dispatch policy type: bundles stage count, cluster shape and schedule tag for the mainloop
-  int Stages_,  // number of mainloop pipeline stages
-  class ClusterShape_ = Shape<_1,_1,_1>,  // defaults to a 1x1x1 cluster
-  class KernelSchedule = KernelTmaWarpSpecializedFP8FastAccumWithPrefetch  // schedule tag; defaults to the single-producer prefetch variant
->
-struct MainloopSm90TmaGmmaWarpSpecializedWithPrefetch {
-  constexpr static int Stages = Stages_;  // re-exported so users of the policy can read the stage count
-  using ClusterShape = ClusterShape_;
-  using ArchTag = arch::Sm90;  // fixed to SM90
-  using Schedule = KernelSchedule;
-};
-} // namespace cutlass::gemm

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp DELETED Viewed

@@ -1,871 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/numeric_types.h"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cute/arch/copy_sm90.hpp"
-#include "cute/algorithm/functional.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cute/algorithm/gemm.hpp"
-#include "cute/numeric/arithmetic_tuple.hpp"
-#include "cutlass/arch/grid_dependency_control.h"
-#include "dispatch_policy_extra.hpp"
-#include "../pipeline/prefetch_pipeline_sm90.hpp"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass::gemm::collective {
-using namespace cute;
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-constexpr int PrefetchStages = 4;  // stage count used to instantiate the prefetcher pipeline and its shared storage
-constexpr int PrefetchInitialStages = 1;  // NOTE(review): not referenced in the visible part of this header -- confirm where it is consumed
-// This determines how much shmem we set aside for prefetch (it is the third extent of the prefetch smem layout).
-// We don't reuse anything loaded by prefetcher, so we can keep
-// loading into the same place -- there will be a conflict when
-// writing, but it doesn't affect performance as much as the doors
-// that this opens.
-constexpr int PrefetchStagesActual = 1;  // deliberately smaller than PrefetchStages, per the comment above
-} // namespace detail
-// WarpSpecialized Mainloop
-template <
-  int Stages,
-  class ClusterShape,
-  class KernelSchedule,
-  class TileShape_,
-  class ElementA_,
-  class StrideA_,
-  class ElementB_,
-  class StrideB_,
-  class TiledMma_,
-  class GmemTiledCopyA_,
-  class SmemLayoutAtomA_,
-  class SmemCopyAtomA_,
-  class TransformA_,
-  class GmemTiledCopyB_,
-  class SmemLayoutAtomB_,
-  class SmemCopyAtomB_,
-  class TransformB_>
-struct CollectiveMma<
-    MainloopSm90TmaGmmaWarpSpecializedWithPrefetch<Stages, ClusterShape, KernelSchedule>,
-    TileShape_,
-    ElementA_,
-    StrideA_,
-    ElementB_,
-    StrideB_,
-    TiledMma_,
-    GmemTiledCopyA_,
-    SmemLayoutAtomA_,
-    SmemCopyAtomA_,
-    TransformA_,
-    GmemTiledCopyB_,
-    SmemLayoutAtomB_,
-    SmemCopyAtomB_,
-    TransformB_>
-{
-  //
-  // Type Aliases
-  //
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedWithPrefetch<Stages, ClusterShape, KernelSchedule>;
-  using TileShape = TileShape_;
-  using ElementA = ElementA_;
-  using StrideA = StrideA_;
-  using ElementB = ElementB_;
-  using StrideB = StrideB_;
-  using TiledMma = TiledMma_;
-  using ElementAccumulator = typename TiledMma::ValTypeC;
-  using GmemTiledCopyA = GmemTiledCopyA_;
-  using GmemTiledCopyB = GmemTiledCopyB_;
-  using SmemLayoutAtomA = SmemLayoutAtomA_;
-  using SmemLayoutAtomB = SmemLayoutAtomB_;
-  using SmemCopyAtomA = SmemCopyAtomA_;
-  using SmemCopyAtomB = SmemCopyAtomB_;
-  using TransformA = TransformA_;
-  using TransformB = TransformB_;
-  using ArchTag = typename DispatchPolicy::ArchTag;
-  static_assert(size<1>(ClusterShape{}) == 1, "Cluster shape N must be 1");
-  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
-  using PrefetcherPipeline = cutlass::PrefetchPipeline<detail::PrefetchStages>;
-  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
-  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
-  using PipelineParams = typename MainloopPipeline::Params;
-  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
-  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
-  // Tile along modes in a way that maximizes the TMA box size.
-  using SmemLayoutA = decltype(tile_to_shape(
-      SmemLayoutAtomA{},
-      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  using SmemLayoutB = decltype(tile_to_shape(
-      SmemLayoutAtomB{},
-      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
-      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  static_assert(rank(SmemLayoutA{}) == 3 && size<2>(SmemLayoutA{}) == DispatchPolicy::Stages);
-  static_assert(rank(SmemLayoutB{}) == 3 && size<2>(SmemLayoutB{}) == DispatchPolicy::Stages);
-  using PrefetchSmemLayoutA = decltype(make_layout(make_shape(
-    cute::Int<size<0>(SmemLayoutA{})>{},
-    cute::Int<size<1>(SmemLayoutA{})>{},
-    cute::Int<detail::PrefetchStagesActual>{})));
-  static constexpr auto prefetch_smem_size = cute::cosize_v<PrefetchSmemLayoutA>;
-  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
-  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
-                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
-                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
-  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
-      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
-  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
-  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
-  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
-  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
-  using InternalElementA = cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
-  using InternalElementB = cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
-  /* Alias for the prefetcher pipeline's shared storage, instantiated with detail::PrefetchStages.
-   * Defined outside the class where it's used, to work around MSVC issues.
-   */
-  using PrefetcherPipelineStorage = ::cutlass::detail::PrefetcherPipelineSharedStorage<detail::PrefetchStages>;
-  struct SharedStorage {  // everything this collective keeps in shared memory
-    struct TensorStorage : cute::aligned_struct<128, _0> {  // operand buffers
-      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // A tiles, sized by the cosize of SmemLayoutA
-      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // B tiles, sized by the cosize of SmemLayoutB
-      cute::array_aligned<typename TiledMma::ValTypeA, prefetch_smem_size> smem_prefetch;  // separate A-typed buffer of prefetch_smem_size elements
-    } tensors;
-    using PipelineStorage = typename MainloopPipeline::SharedStorage;
-    PipelineStorage pipeline;  // storage for the mainloop pipeline
-    PrefetcherPipelineStorage prefetcher_pipeline;  // storage for the prefetcher pipeline
-  };
-  using TensorStorage = typename SharedStorage::TensorStorage;
-  using PipelineStorage = typename SharedStorage::PipelineStorage;
-  // Host side kernel arguments (converted into Params by to_underlying_arguments)
-  struct Arguments {
-    ElementA const* ptr_A;  // base pointer of A; combined with dA over shape (M,K,L)
-    StrideA dA;
-    ElementB const* ptr_B;  // base pointer of B; combined with dB over shape (N,K,L)
-    StrideB dB;
-    uint32_t mma_promotion_interval = 4;  // not copied into Params by to_underlying_arguments
-    float overlap_ratio = 0.5;  // can_implement rejects values above 1; per its message, negative means disabled
-    float prefetch_ratio = -1.0;  // same range rule as overlap_ratio; the default is negative
-  };
-  // Device side kernel params: the two TMA copy objects plus per-stage byte counts and the two ratios
-  struct Params {
-    // Assumption: StrideA is congruent with Problem_MK (the type is deduced from a placeholder tensor built on a null pointer)
-    using TMA_A = decltype(make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        make_tensor(static_cast<InternalElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
-        SmemLayoutA{}(_,_,cute::Int<0>{}),  // smem layout of a single stage
-        TileShape{},
-        ClusterShape{}));
-    // Assumption: StrideB is congruent with Problem_NK (the type is deduced from a placeholder tensor built on a null pointer)
-    using TMA_B = decltype(make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        make_tensor(static_cast<InternalElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
-        SmemLayoutB{}(_,_,cute::Int<0>{}),  // smem layout of a single stage
-        TileShape{},
-        ClusterShape{}));
-    TMA_A tma_load_a;
-    TMA_B tma_load_b;
-    uint32_t tma_transaction_bytes = TmaTransactionBytesMK + TmaTransactionBytesNK;  // A and B bytes combined
-    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;  // A bytes only
-    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;  // B bytes only
-    float overlap_ratio = 0.5;  // same default as Arguments::overlap_ratio
-    float prefetch_ratio = -1.0;  // same default as Arguments::prefetch_ratio
-  };
-  //
-  // Methods
-  //
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {  // builds the device Params from the host Arguments
-    (void) workspace;  // workspace is not used by this collective
-    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);
-    auto [M,N,K,L] = problem_shape_MNKL;
-    auto ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);  // reinterpret as the internal element type used for the TMA copy
-    auto ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);  // reinterpret as the internal element type used for the TMA copy
-    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M,K,L), args.dA));  // A viewed as (M,K,L) with the caller's strides
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));  // B viewed as (N,K,L) with the caller's strides
-    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90(
-        GmemTiledCopyA{},
-        tensor_a,
-        SmemLayoutA{}(_,_,cute::Int<0>{}),  // smem layout of a single stage
-        TileShape{},
-        ClusterShape{});
-    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90(
-        GmemTiledCopyB{},
-        tensor_b,
-        SmemLayoutB{}(_,_,cute::Int<0>{}),  // smem layout of a single stage
-        TileShape{},
-        ClusterShape{});
-    uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
-    uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
-    uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
-    return {  // aggregate-initialized in Params member order; args.mma_promotion_interval is not forwarded
-      tma_load_a,
-      tma_load_b,
-      transaction_bytes,
-      transaction_bytes_mk,
-      transaction_bytes_nk,
-      args.overlap_ratio,
-      args.prefetch_ratio
-    };
-  }
-  template<class ProblemShape>
-  static bool
-  can_implement(  // host-side validation: returns false (after a trace message) when a check fails
-      ProblemShape const& problem_shape,
-      [[maybe_unused]] Arguments const& args) {  // args is read below for the two ratio checks
-    constexpr int tma_alignment_bits = 128;  // alignment granularity applied to both operands
-    auto problem_shape_MNKL = append<4>(problem_shape, 1);  // pad a rank-3 shape with a trailing 1
-    auto [M,N,K,L] = problem_shape_MNKL;
-    constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;  // 128 bits expressed in A elements
-    bool implementable = cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});  // uses a default-constructed StrideA, not args.dA
-    constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;  // 128 bits expressed in B elements
-    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(cute::make_shape(N,K,L), StrideB{});  // uses a default-constructed StrideB, not args.dB
-    if (!implementable) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
-      return false;
-    }
-    if (args.overlap_ratio > 1.0) {  // only the upper bound is rejected; negative values pass
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: `overlap_ratio` must be either negative (disabled) or in [0, 1].\n");
-      return false;
-    }
-    if (args.prefetch_ratio > 1.0) {  // only the upper bound is rejected; negative values pass
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: `prefetch_ratio` must be either negative (disabled) or in [0, 1].\n");
-      return false;
-    }
-    return true;
-  }
-  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;  // equals the mainloop stage count
-  static constexpr int K_PIPE_MMAS = 1;
-  static constexpr uint32_t TmaTransactionBytesMK =  // bytes of one A stage: modes 0 and 1 of SmemLayoutA times the bit width of ElementA
-        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
-  static constexpr uint32_t TmaTransactionBytesNK =  // bytes of one B stage: modes 0 and 1 of SmemLayoutB times the bit width of ElementB
-        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
-  /// Issue Tma Descriptor Prefetch for both the A and B copy objects -- ideally from a single thread for best performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& mainloop_params) {
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());  // descriptor of the A load
-    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());  // descriptor of the B load
-  }
-  /// Set up the data needed by this collective for load and mma.
-  /// Returns a tuple of tensors. The collective and the kernel layer have the contract that the
-  /// returned tuple must contain at least two elements, with the first two elements being:
-  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
-  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
-  /// The rest of the tensors can be specified as needed by this collective (this implementation returns exactly these two).
-  template <class ProblemShape_MNKL>
-  CUTLASS_DEVICE auto
-  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
-    using X = Underscore;  // marks the tile mode that a projection skips
-    // Separate out problem shape for convenience
-    auto [M,N,K,L] = problem_shape_MNKL;
-    // TMA requires special handling of strides to deal with coord codomain mapping
-    // Represent the full tensors -- get these from TMA
-    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
-    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
-    // Make tiled views, defer the slice (A uses tile modes 0 and 2, B uses tile modes 1 and 2)
-    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
-    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
-    return cute::make_tuple(gA_mkl, gB_nkl);
-  }
-  template <
-    class TensorA, class TensorB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PrefetcherPipeline prefetcher_pipeline,
-      PipelineState smem_pipe_write,
-      TensorA const& gA_mkl,
-      TensorB const& gB_nkl,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-    if (lane_predicate) {
-      bool disable_gdc = mainloop_params.overlap_ratio < 0.0;
-      float overlap_ratio = mainloop_params.overlap_ratio;
-      int launch_dep_grids_threshold = static_cast<int>(static_cast<float>(k_tile_count - 1) * overlap_ratio);
-      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-      //
-      // Prepare the TMA loads for A
-      //
-      constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-      auto cta_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-      auto cta_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-      // Applies the mapping from cta_tma_a
-      Tensor tAgA = cta_tma_a.partition_S(gA);                                                   // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = cta_tma_a.partition_D(sA);                                                // (TMA,TMA_M,TMA_K,PIPE)
-      // Applies the mapping from cta_tma_b
-      Tensor tBgB = cta_tma_b.partition_S(gB);                                                   // (TMA,TMA_N,TMA_K,k)
-      Tensor tBsB = cta_tma_b.partition_D(sB);                                                // (TMA,TMA_N,TMA_K,PIPE)
-      uint16_t mcast_mask_a = 0;
-      uint16_t mcast_mask_b = 0;
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-        for (int m = 0; m < size<0>(block_layout); ++m) {
-          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-        }
-      }
-      // We have to wait on dependent grids because of B.
-      cutlass::arch::wait_on_dependent_grids();
-      // Signal prefetcher to stop
-      prefetcher_pipeline.producer_arrive();
-      bool launch_dep_grids = false;
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int cnt=0 ; k_tile_count > 0; --k_tile_count, ++cnt) {
-        // LOCK smem_pipe_write for _writing_
-        pipeline.producer_acquire(smem_pipe_write);
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-        int write_stage = smem_pipe_write.index();
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a, cute::TMA::CacheHintSm90::EVICT_FIRST), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b, cute::TMA::CacheHintSm90::EVICT_LAST), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-        ++k_tile_iter;
-        if (!disable_gdc && cnt >= launch_dep_grids_threshold && !launch_dep_grids) {
-          launch_dep_grids = true;
-          cutlass::arch::launch_dependent_grids();
-        }
-        // Advance smem_pipe_write
-        ++smem_pipe_write;
-      }
-      if (!disable_gdc && !launch_dep_grids) {
-        cutlass::arch::launch_dependent_grids();
-      }
-    }
-  }
-  template <
-    class TensorA,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load_MK(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PrefetcherPipeline prefetcher_pipeline,
-      PipelineState smem_pipe_write,
-      TensorA const& gA_mkl,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-    if (lane_predicate) {
-      bool disable_gdc = mainloop_params.overlap_ratio < 0.0;
-      float overlap_ratio = mainloop_params.overlap_ratio;
-      int launch_dep_grids_threshold = static_cast<int>(static_cast<float>(k_tile_count - 1) * overlap_ratio);
-      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
-      //
-      // Prepare the TMA loads for A
-      //
-      constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-      auto cta_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      // Applies the mapping from cta_tma_a
-      Tensor tAgA = cta_tma_a.partition_S(gA);                                                   // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = cta_tma_a.partition_D(sA);                                                // (TMA,TMA_M,TMA_K,PIPE)
-      uint16_t mcast_mask_a = 0;
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{}; // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-      // Don't wait on dependent grids when loading `A`, because
-      // we assume `A` (weights) are static.
-      bool launch_dep_grids = false;
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int cnt=0 ; k_tile_count > 0; --k_tile_count, ++cnt) {
-        // LOCK smem_pipe_write for _writing_
-        pipeline.producer_acquire(smem_pipe_write);
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-        int write_stage = smem_pipe_write.index();
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a, cute::TMA::CacheHintSm90::EVICT_FIRST), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        ++k_tile_iter;
-        if (!disable_gdc && cnt >= launch_dep_grids_threshold && !launch_dep_grids) {
-          launch_dep_grids = true;
-          cutlass::arch::launch_dependent_grids();
-        }
-        // Advance smem_pipe_write
-        ++smem_pipe_write;
-      }
-      if (!disable_gdc && !launch_dep_grids) {
-        cutlass::arch::launch_dependent_grids();
-      }
-    }
-  }
-  template <
-    class TensorB,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  load_NK(
-      Params const& mainloop_params,
-      MainloopPipeline pipeline,
-      PrefetcherPipeline prefetcher_pipeline,
-      PipelineState smem_pipe_write,
-      TensorB const& gB_nkl,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-    if (lane_predicate) {
-      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
-      //
-      // Prepare the TMA loads for B
-      //
-      constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-      auto cta_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
-      // Applies the mapping from cta_tma_b
-      Tensor tBgB = cta_tma_b.partition_S(gB);                                                   // (TMA,TMA_N,TMA_K,k)
-      Tensor tBsB = cta_tma_b.partition_D(sB);                                                // (TMA,TMA_N,TMA_K,PIPE)
-      uint16_t mcast_mask_b = 0;
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-        for (int m = 0; m < size<0>(block_layout); ++m) {
-          mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
-        }
-      }
-      // Ensure that the prefetched kernel does not touch
-      // unflushed global memory prior to this instruction
-      cutlass::arch::wait_on_dependent_grids();
-      // Signal prefetcher to stop
-      prefetcher_pipeline.producer_arrive();
-      // Mainloop
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; k_tile_count > 0; --k_tile_count) {
-        // LOCK smem_pipe_write for _writing_
-        pipeline.producer_acquire(smem_pipe_write);
-        //
-        // Copy gmem to smem for *k_tile_iter
-        //
-        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
-        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
-        int write_stage = smem_pipe_write.index();
-        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b, cute::TMA::CacheHintSm90::EVICT_LAST), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
-        ++k_tile_iter;
-        // Advance smem_pipe_write
-        ++smem_pipe_write;
-      }
-    }
-  }
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void
-  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
-    int lane_predicate = cute::elect_one_sync();
-    // Issue the epilogue waits
-    if (lane_predicate) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all
-       * Consumer UNLOCKs), or if the stage was never used
-       * then would just be acquired since the phase was
-       * still inverted from make_producer_start_state
-       */
-      pipeline.producer_tail(smem_pipe_write);
-    }
-  }
-  template <
-    class TensorA,
-    class KTileIterator, class BlockCoord
-  >
-  CUTLASS_DEVICE void
-  prefetch_MK(
-      Params const& mainloop_params,
-      PrefetcherPipeline prefetcher_pipeline,
-      PipelineState smem_pipe_write,
-      TensorA const& gA_mkl,
-      BlockCoord const& blk_coord,
-      KTileIterator k_tile_iter, int k_tile_count,
-      int thread_idx,
-      uint32_t block_rank_in_cluster,
-      TensorStorage& shared_tensors) {
-    int lane_predicate = cute::elect_one_sync();
-    if (lane_predicate) {
-      bool do_best_effort_prefetch = mainloop_params.prefetch_ratio < 0;
-      float prefetch_ratio = do_best_effort_prefetch ? 1.0 : mainloop_params.prefetch_ratio;
-      int prefetch_iters = static_cast<int>(static_cast<float>(k_tile_count) * 0.5 * prefetch_ratio);
-      prefetch_iters = min(k_tile_count, ((prefetch_iters + detail::PrefetchStages - 1) / detail::PrefetchStages) * detail::PrefetchStages);
-      Tensor sA = make_tensor(
-          make_smem_ptr(shared_tensors.smem_prefetch.data()), PrefetchSmemLayoutA{});             // (BLK_M,BLK_K,PIPE)
-      //
-      // Prepare the TMA loads for A
-      //
-      constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
-      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
-      auto cta_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
-      // Partition the inputs based on the current block coordinates.
-      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
-      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
-      // Applies the mapping from cta_tma_a
-      Tensor tAgA = cta_tma_a.partition_S(gA);                                                   // (TMA,TMA_M,TMA_K,k)
-      Tensor tAsA = cta_tma_a.partition_D(sA);                                                // (TMA,TMA_M,TMA_K,PIPE)
-      uint16_t mcast_mask_a = 0;
-      // Issue TmaLoads
-      // Maps the tile -> block, value
-      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
-        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
-        for (int n = 0; n < size<1>(block_layout); ++n) {
-          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
-        }
-      }
-      uint32_t prefetcher_stage = 0;
-      uint32_t prefetcher_phase = 0;
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int cnt = 0 ; cnt < prefetch_iters; ++cnt) {
-        if (do_best_effort_prefetch && prefetcher_pipeline.have_producers_arrived()) {
-          break;
-        }
-        prefetcher_pipeline.prefetcher_acquire(prefetcher_stage, prefetcher_phase, cnt >= detail::PrefetchStages);
-        using BarrierType = typename PrefetcherPipeline::PrefetcherBarrierType;
-        BarrierType* tma_barrier = prefetcher_pipeline.prefetcher_get_barrier(prefetcher_stage);
-        int write_stage = 0;
-        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a, cute::TMA::CacheHintSm90::EVICT_FIRST), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
-        ++k_tile_iter;
-        ++k_tile_iter;
-        prefetcher_pipeline.advance_prefetcher_state(prefetcher_stage, prefetcher_phase);
-      }
-      prefetcher_pipeline.prefetcher_tail(prefetcher_stage, prefetcher_phase);
-    }
-  }
-  /// Perform a collective-scoped matrix multiply-accumulate
-  /// Consumer Perspective
-  template <
-    class FrgTensorC
-  >
-  CUTLASS_DEVICE void
-  mma(MainloopPipeline pipeline,
-      PipelineState smem_pipe_read,
-      FrgTensorC& accum,
-      int k_tile_count,
-      int thread_idx,
-      TensorStorage& shared_tensors,
-      Params const& mainloop_params) {
-    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
-    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
-    static_assert(cute::is_void_v<SmemCopyAtomA>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    static_assert(cute::is_void_v<SmemCopyAtomB>,
-      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
-    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
-    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-    //
-    // Define C accumulators and A/B partitioning
-    //
-    TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
-    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
-    // Allocate "fragments/descriptors"
-    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
-    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
-    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
-    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
-    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
-    //
-    // PIPELINED MAIN LOOP
-    //
-    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
-        "ERROR : Incorrect number of MMAs in flight");
-    // We release buffers to producer warps(dma load) with some mmas in flight
-    PipelineState smem_pipe_release = smem_pipe_read;
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-    warpgroup_fence_operand(accum);
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-      int read_stage = smem_pipe_read.index();
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-      ++smem_pipe_read;
-    }
-    warpgroup_fence_operand(accum);
-    // Mainloop GMMAs
-    k_tile_count -= prologue_mma_count;
-    CUTLASS_PRAGMA_NO_UNROLL
-    for ( ; k_tile_count > 0; --k_tile_count)
-    {
-      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-      //
-      // Compute on k_tile
-      //
-      int read_stage = smem_pipe_read.index();
-      warpgroup_fence_operand(accum);
-      warpgroup_arrive();
-      // Unroll the K mode manually to set scale D to 1
-      CUTLASS_PRAGMA_UNROLL
-      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-        // (V,M,K) x (V,N,K) => (V,M,N)
-        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum);
-        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-      }
-      warpgroup_commit_batch();
-      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
-      warpgroup_wait<K_PIPE_MMAS>();
-      warpgroup_fence_operand(accum);
-      // UNLOCK smem_pipe_release, done _computing_ on it
-      pipeline.consumer_release(smem_pipe_release);
-      // Advance smem_pipe_read and smem_pipe_release
-      ++smem_pipe_read;
-      ++smem_pipe_release;
-    }
-    warpgroup_fence_operand(accum);
-  }
-  /// Perform a Consumer Epilogue to release all buffers
-  CUTLASS_DEVICE void
-  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
-    // Prologue GMMAs
-    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
-    k_tile_count -= prologue_mma_count;
-    smem_pipe_release.advance(k_tile_count);
-    // Wait on all GMMAs to complete
-    warpgroup_wait<0>();
-    for (int count = 0; count < prologue_mma_count; ++count) {
-      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
-      ++smem_pipe_release;
-    }
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace cutlass::gemm::collective
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp DELETED Viewed

@@ -1,117 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-// Command line options parsing
-struct Options {
-  bool help = false;
-  float alpha = 1.f, beta = 0.f;
-  float overlap_ratio = 0.5f, prefetch_ratio = 0.5f;
-  int iterations = 1000;
-  int n = 64, m = 1280, k = 8192, l = 1;
-  // Parses the command line
-  void parse(int argc, char const **args) {
-    cutlass::CommandLine cmd(argc, args);
-    if (cmd.check_cmd_line_flag("help")) {
-      help = true;
-      return;
-    }
-    cmd.get_cmd_line_argument("m", m);
-    cmd.get_cmd_line_argument("n", n);
-    cmd.get_cmd_line_argument("k", k);
-    cmd.get_cmd_line_argument("l", l);
-    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
-    cmd.get_cmd_line_argument("beta", beta, 0.f);
-    cmd.get_cmd_line_argument("p", prefetch_ratio, 0.5f);
-    cmd.get_cmd_line_argument("o", overlap_ratio, 0.5f);
-    cmd.get_cmd_line_argument("iterations", iterations);
-  }
-  /// Prints the usage statement.
-  std::ostream & print_usage(std::ostream &out) const {
-    out << "63_hopper_gemm_with_weight_prefetch\n\n"
-      << "  Hopper FP8 GEMM using a non-persistent kernel with L2 weight prefetch. \n"
-      << "  For more details please refer to the source file.\n\n"
-      << "Options:\n\n"
-      << "  --help                      If specified, displays this usage statement\n\n"
-      << "  --m=<int>                   Sets the M extent of the GEMM\n"
-      << "  --n=<int>                   Sets the N extent of the GEMM\n"
-      << "  --k=<int>                   Sets the K extent of the GEMM\n"
-      << "  --l=<int>                   Sets the l extent (batch) of the GEMM\n"
-      << "  --alpha=<f32>               Epilogue scalar alpha\n"
-      << "  --beta=<f32>                Epilogue scalar beta\n"
-      << "  --p=<f32>                   Prefetch ratio\n"
-      << "  --o=<f32>                   Overlap ratio\n"
-      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
-    out
-      << "\n\nExamples:\n\n"
-      << "$ " << "63_hopper_gemm_with_weight_prefetch" <<
-      " --m=1024 --n=512 --k=1024 --o=0.5 --p=0.5 \n\n";
-    return out;
-  }
-  /// Compute performance in GFLOP/s
-  double gflops(double runtime_s) const
-  {
-    // Two flops per multiply-add
-    uint64_t flop = uint64_t(2) * m * n * k * l;
-    double gflop = double(flop) / double(1.0e9);
-    return gflop / runtime_s;
-  }
-  /// Compute effective bandwidth in GB/sec
-  double effective_bandwidth(
-    double runtime_s,
-    size_t bytes_a,
-    size_t bytes_b,
-    size_t bytes_c,
-    size_t bytes_d
-  ) const
-  {
-    static double const kBytesPerGiB = double(1ull << 30);
-    double bytes_in =
-      (double)(l) * (double)(m) * (double)(k) * (double)(bytes_a) +                        // A
-      (double)(l) * (double)(n) * (double)(k) * (double)(bytes_b) +                        // B
-      (beta != 0.f ? (double)(l) * (double)(m) * (double)(n) * (double)(bytes_c) : 0.f);   // C
-    double bytes_out = (double)(l) * (double)(m) * (double)(n) * (double)(bytes_d);        // D
-    double gb_total = (bytes_in + bytes_out) / kBytesPerGiB;
-    return gb_total / runtime_s;
-  }
-};

build/torch211-cxx11-cu130-aarch64-linux/include/third-party/cutlass/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp DELETED Viewed

@@ -1,561 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/fast_math.h"
-#include "cutlass/kernel_hardware_info.hpp"
-#include "cute/arch/cluster_sm90.hpp"
-#include "cutlass/arch/reg_reconfig.h"
-#include "cutlass/arch/mma_sm90.h"
-#include "cutlass/epilogue/collective/detail.hpp"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/gemm/dispatch_policy.hpp"
-#include "cutlass/gemm/kernel/sm90_tile_scheduler.hpp"
-#include "cutlass/pipeline/pipeline.hpp"
-#include "cutlass/trace.h"
-#include "cute/tensor.hpp"
-#include "../collective/dispatch_policy_extra.hpp"
-///////////////////////////////////////////////////////////////////////////////
-namespace cutlass::gemm::kernel {
-///////////////////////////////////////////////////////////////////////////////
-// GEMM + Prefetch for the A tensor + (optional) split DMA warps
-template <
-  class ProblemShape_,
-  class CollectiveMainloop_,
-  class CollectiveEpilogue_,
-  class TileScheduler_
->
-class GemmUniversal<
-  ProblemShape_,
-  CollectiveMainloop_,
-  CollectiveEpilogue_,
-  TileScheduler_,
-  cute::enable_if_t<
-    cute::is_same_v<typename CollectiveMainloop_::DispatchPolicy::Schedule, KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA> ||
-    cute::is_same_v<typename CollectiveMainloop_::DispatchPolicy::Schedule, KernelTmaWarpSpecializedFP8FastAccumWithPrefetch>
-    >
->
-{
-public:
-  //
-  // Type Aliases and compile-time configuration
-  //
-  using ProblemShape = ProblemShape_;
-  static_assert(cute::rank(ProblemShape{}) == 3 or cute::rank(ProblemShape{}) == 4,
-    "ProblemShape{} should be <M,N,K> or <M,N,K,L>");
-  static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled;  // NOTE(review): not referenced by any member shown in this class
-  static constexpr bool SplitWarps = cute::is_same_v<typename CollectiveMainloop_::DispatchPolicy::Schedule, KernelTmaWarpSpecializedFP8FastAccumWithPrefetchAndSplitDMA>;  // true only for the ...AndSplitDMA schedule; operator() then uses load_NK/load_MK instead of load
-  // Mainloop derived types (re-exported from CollectiveMainloop_)
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape = typename CollectiveMainloop::TileShape;
-  using TiledMma  = typename CollectiveMainloop::TiledMma;
-  using ArchTag   = typename CollectiveMainloop::ArchTag;
-  using ElementA  = typename CollectiveMainloop::ElementA;
-  using StrideA   = typename CollectiveMainloop::StrideA;
-  using ElementB  = typename CollectiveMainloop::ElementB;
-  using StrideB   = typename CollectiveMainloop::StrideB;
-  using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy;
-  using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator;
-  using ClusterShape = typename DispatchPolicy::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static_assert(ArchTag::kMinComputeCapability >= 90);  // the mainloop's ArchTag must declare a minimum compute capability of at least 90
-  // Epilogue derived types (re-exported from CollectiveEpilogue_)
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using ElementC = typename CollectiveEpilogue::ElementC;
-  using StrideC  = typename CollectiveEpilogue::StrideC;
-  using ElementD = typename CollectiveEpilogue::ElementD;
-  using StrideD  = typename CollectiveEpilogue::StrideD;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-  static_assert(cute::is_void_v<TileScheduler_> or cute::is_same_v<TileScheduler_, PersistentScheduler>,  // only void or PersistentScheduler is accepted as the scheduler tag
-    "TMA warp-specialized kernel does not support specializing the tile scheduler.");
-  using TileSchedulerTag = TileScheduler_;
-  using TileScheduler = typename detail::TileSchedulerSelector<
-    TileScheduler_, ArchTag, TileShape, ClusterShape>::Scheduler;
-  using TileSchedulerArguments = typename TileScheduler::Arguments;
-  // Kernel level shared memory storage; operator() reinterprets its smem_buf argument as this struct
-  struct SharedStorage {
-    // Mainloop and epilogue don't use smem concurrently since kernel is non-persistent, so we can use a union
-    union TensorStorage {
-      using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage;
-      using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage;
-      MainloopTensorStorage mainloop;  // passed to the mainloop load/prefetch/mma calls
-      EpilogueTensorStorage epilogue;  // passed to the epilogue constructor and its load/store calls
-    } tensors;
-    struct PipelineStorage : cute::aligned_struct<16, _1> {  // unlike the tensors, the three pipeline storages are separate members (not overlaid)
-      using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage;
-      using PrefetcherPipelineStorage = typename CollectiveMainloop::PrefetcherPipelineStorage;
-      using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage;
-      alignas(16) MainloopPipelineStorage mainloop;      // handed to the MainloopPipeline constructor
-      alignas(16) EpiLoadPipelineStorage epi_load;       // handed to the EpiLoadPipeline constructor
-      alignas(16) PrefetcherPipelineStorage prefetcher;  // handed to the PrefetcherPipeline constructor
-    } pipelines;
-  };
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);  // size in bytes of the SharedStorage struct above
-  static constexpr uint32_t NumLoadWarpGroups = 1;  // one producer warp group (its warps take the Warp0 / PrefetchMK / Warp2 roles in operator())
-  static constexpr uint32_t NumMmaWarpGroups = 1;   // NOTE(review): not used in MaxThreadsPerBlock below, which takes the MMA thread count from TiledMma
-  static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMma{})) + (NumLoadWarpGroups * NumThreadsPerWarpGroup);  // MMA threads + load warp group threads; returned by get_block_shape()
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-  // Caller-supplied arguments; to_underlying_arguments() converts them into the Params consumed by operator()
-  struct Arguments {
-    GemmUniversalMode mode{};            // can_implement accepts kGemm, or kBatched with a rank-4 ProblemShape
-    ProblemShape problem_shape{};        // <M,N,K> or <M,N,K,L>
-    MainloopArguments mainloop{};        // forwarded to CollectiveMainloop::to_underlying_arguments
-    EpilogueArguments epilogue{};        // forwarded to CollectiveEpilogue::to_underlying_arguments
-    KernelHardwareInfo hw_info{};        // NOTE(review): not read by any member shown in this class
-    TileSchedulerArguments scheduler{};  // validated by TileScheduler::can_implement
-  };
-  // Kernel entry point API: the argument pack received by operator(); field order must match the braced return in to_underlying_arguments()
-  struct Params {
-    GemmUniversalMode mode{};
-    ProblemShape problem_shape{};  // M and N are exchanged relative to Arguments when the mainloop declares SwapAB
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-  };
-  //
-  // Methods
-  //
-  // Builds the kernel Params from the caller's Arguments. The stored problem shape has M and N
-  // exchanged when the mainloop declares SwapAB; both collectives still get the original shape.
-  static
-  Params
-  to_underlying_arguments(Arguments const& args, void* workspace) {
-    (void) workspace;
-    auto kernel_problem_shape = args.problem_shape;
-    if constexpr (detail::Has_SwapAB_v<CollectiveMainloop>) {
-      get<0>(kernel_problem_shape) = get<1>(args.problem_shape);  // M <- N
-      get<1>(kernel_problem_shape) = get<0>(args.problem_shape);  // N <- M
-    }
-    return Params{
-      args.mode,
-      kernel_problem_shape,
-      CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace),
-      CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, workspace)
-    };
-  }
-  static bool  // true when the mode/shape pair is supported and mainloop, epilogue and scheduler all accept args
-  can_implement(Arguments const& args) {
-    bool const is_plain_gemm = args.mode == GemmUniversalMode::kGemm;
-    bool const is_rank4_batched = args.mode == GemmUniversalMode::kBatched && cute::rank(ProblemShape{}) == 4;
-    if (not (is_plain_gemm or is_rank4_batched)) {
-      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Arguments or Problem Shape don't meet the requirements.\n");
-      return false;
-    }
-    bool const mainloop_ok = CollectiveMainloop::can_implement(args.problem_shape, args.mainloop);   // every component is
-    bool const epilogue_ok = CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue);   // queried, with no
-    bool const scheduler_ok = TileScheduler::can_implement(args.scheduler);                          // short-circuiting
-    return mainloop_ok and epilogue_ok and scheduler_ok;
-  }
-  // This kernel uses no workspace: the reported size is always zero, whatever `args` holds.
-  static size_t
-  get_workspace_size(Arguments const& args) {
-    return size_t(0);
-  }
-  // Nothing to initialize: every argument is ignored and success is reported unconditionally.
-  static cutlass::Status
-  initialize_workspace(Arguments const& args, void* workspace = nullptr,
-      cudaStream_t stream = nullptr, CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-  // Returns the launch grid for `params`: the problem shape is padded with 1s up to rank 4
-  // (MNKL) and handed to the tile scheduler together with the tile and cluster shapes.
-  static dim3
-  get_grid_shape(Params const& params) {
-    ClusterShape cluster{};
-    TileShape cta_tile{};
-    auto mnkl = append<4>(params.problem_shape, Int<1>{});
-    return TileScheduler::get_tiled_cta_shape_mnl(mnkl, cta_tile, cluster);
-  }
-  static dim3  // one-dimensional thread block: MaxThreadsPerBlock threads along x
-  get_block_shape() {
-    dim3 block_dims(MaxThreadsPerBlock, 1, 1);
-    return block_dims;
-  }
-  CUTLASS_DEVICE
-  void
-  operator()(Params const& params, char* smem_buf) {  // kernel body; smem_buf is reinterpreted as SharedStorage below
-    using namespace cute;
-    using X = Underscore;  // NOTE(review): X appears unused in this function
-#if defined(__CUDA_ARCH_FEAT_SM90_ALL)
-#  define ENABLE_SM90_KERNEL_LEVEL 1
-#endif
-// Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a.
-#if ! defined(ENABLE_SM90_KERNEL_LEVEL)
-    printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n");  // only reports: no trap follows, the rest of the body is compiled out by the #else
-#else
-    enum class WarpGroupRole {
-      Producer = 0,
-      Consumer = 1,
-    };
-    // Split mode: use Warp0 to load NK and epilogue, Warp2 to load MK.
-    // Non-split mode: use Warp0 to load MK, NK and epilogue, Warp2 is unused.
-    // Both modes use Warp1 to prefetch.
-    enum class ProducerWarpRole {
-      Warp0 = 0,
-      PrefetchMK = 1,
-      Warp2 = 2,
-      UnusedWarp = 3
-    };
-    // Kernel level shared memory storage
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-    int thread_idx = int(threadIdx.x);
-    int lane_idx = canonical_lane_idx();
-    int warp_idx = canonical_warp_idx_sync();
-    int warp_idx_in_warp_group = warp_idx % NumWarpsPerWarpGroup;
-    int warp_group_thread_idx = thread_idx % NumThreadsPerWarpGroup;
-    auto warp_group_role = WarpGroupRole(canonical_warp_group_idx());  // warp group 0 produces, warp group 1 consumes
-    auto producer_warp_role = ProducerWarpRole(warp_idx_in_warp_group);  // computed for every warp; only consulted together with the Producer role
-    int lane_predicate = cute::elect_one_sync();
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-    // Issue Tma Descriptor Prefetch from a single thread
-    if ((warp_idx == 0) && lane_predicate) {
-      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
-      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
-    }
-    // Mainloop Load pipeline
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    typename MainloopPipeline::Params mainloop_pipeline_params;
-    mainloop_pipeline_params.is_leader = warp_group_thread_idx == 0;  // first thread of each warp group
-    if (warp_group_role == WarpGroupRole::Producer && (
-          producer_warp_role == ProducerWarpRole::Warp0 ||
-          producer_warp_role == ProducerWarpRole::Warp2)) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer;
-      mainloop_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer;
-    }
-    mainloop_pipeline_params.num_consumers = NumThreadsPerWarpGroup;  // the threads of one warp group
-    MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, mainloop_pipeline_params, ClusterShape{});
-    bool should_prefetch = params.mainloop.prefetch_ratio > 0;  // a ratio <= 0 turns the prefetch warp's work off
-    using PrefetcherPipeline = typename CollectiveMainloop::PrefetcherPipeline;
-    typename PrefetcherPipeline::Params prefetcher_pipeline_params;
-    prefetcher_pipeline_params.num_prefetchers = 1;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::PrefetchMK) {
-      prefetcher_pipeline_params.should_prefetch = should_prefetch;
-      prefetcher_pipeline_params.transaction_bytes = params.mainloop.tma_transaction_bytes_mk;
-    }
-    PrefetcherPipeline prefetcher_pipeline(shared_storage.pipelines.prefetcher, prefetcher_pipeline_params);
-    // Epilogue Load pipeline
-    using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline;
-    typename EpiLoadPipeline::Params epi_load_pipeline_params;
-    if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::Warp0) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer;
-    }
-    if (warp_group_role == WarpGroupRole::Consumer) {
-      epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer;
-    }
-    epi_load_pipeline_params.dst_blockid = cute::block_rank_in_cluster();
-    epi_load_pipeline_params.producer_arv_count = NumThreadsPerWarp;
-    epi_load_pipeline_params.consumer_arv_count = NumThreadsPerWarpGroup;
-    if constexpr (CollectiveEpilogue::RequiresTransactionBytes) {
-      epi_load_pipeline_params.transaction_bytes = params.epilogue.tma_transaction_bytes;
-    }
-    EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params);
-    // Epilogue Store pipeline
-    using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline;
-    typename EpiStorePipeline::Params epi_store_pipeline_params;
-    epi_store_pipeline_params.always_wait = true;
-    EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params);
-    // Initialize starting pipeline states for the collectives
-    // Epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding)
-    typename CollectiveMainloop::PipelineState mainloop_pipe_consumer_state;
-    typename CollectiveEpilogue::LoadPipelineState epi_load_pipe_consumer_state;
-    // For the DMA Load (producer) we start with an opposite phase
-    // i.e., we skip all waits since we know that the buffer is indeed empty
-    PipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state<MainloopPipeline>();
-    PipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state<EpiLoadPipeline>();
-    PipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state<EpiStorePipeline>();
-    auto cluster_wait_fn = [&] () {  // invoked immediately (note the trailing "()" below); cluster_wait_fn ends up holding the empty lambda it returns
-      // We need this to guarantee that the Pipeline init is visible
-      // to all producers and consumer thread blocks in the Cluster
-      if constexpr (size(ClusterShape{}) > 1) {
-        // Non-prefetcher warps arrive and wait,
-        // Prefetcher warp can go ahead without waiting.
-        cute::cluster_arrive_relaxed();
-        if (warp_group_role != WarpGroupRole::Producer ||
-            producer_warp_role != ProducerWarpRole::PrefetchMK) {
-          cute::cluster_wait();
-        }
-        return [] () {};
-      }
-      else {
-        // __syncthreads() but only for non prefetcher warps
-        if (should_prefetch) {
-          // Use a named barrier to let the prefetcher warp start loading into the L2
-          // without waiting to sync with all other warps.
-          // All other warps need to sync because the mainloop pipeline init
-          // should be visible to all of them.
-          // Prefetcher has its own barriers, and the only warps it would need to sync
-          // with would be the DMA warps.
-          using ClusterSyncWithPrefetchBarrier = typename cutlass::arch::NamedBarrier;
-          auto prefetcher_arrive_barrier = ClusterSyncWithPrefetchBarrier(
-              blockDim.x * blockDim.y * blockDim.z,
-              /*id*/ 0);
-          // Prefetcher warp doesn't arrive on this barrier.
-          auto cluster_arrive_barrier = ClusterSyncWithPrefetchBarrier(
-              blockDim.x * blockDim.y * blockDim.z - NumThreadsPerWarp,
-              /*id*/ 1);
-          if (warp_group_role == WarpGroupRole::Producer && producer_warp_role == ProducerWarpRole::PrefetchMK) {
-            __syncwarp();
-            prefetcher_arrive_barrier.arrive();  // arrive without waiting, so this warp proceeds immediately
-          }
-          else if (warp_group_role == WarpGroupRole::Producer) {
-            prefetcher_arrive_barrier.arrive_and_wait();  // the remaining producer warps wait on barrier 0 ...
-            cluster_arrive_barrier.arrive_and_wait();     // ... and then on the barrier shared by all non-prefetcher warps
-          }
-          else {
-            prefetcher_arrive_barrier.arrive();  // consumer warps only arrive on barrier 0
-            cluster_arrive_barrier.arrive_and_wait();
-          }
-        } else {
-        __syncthreads();
-        }
-        return [] () {};
-      }
-    } ();
-    // Preconditions
-    static_assert(cute::rank(StrideA{}) == 3, "StrideA must be rank-3: [M, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideB{}) == 3, "StrideB must be rank-3: [N, K, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]. If batch mode is not needed, set L stride to Int<0>.");
-    // Optionally append 1s until problem shape is rank-4 in case it is only rank-3 (MNK)
-    auto problem_shape_MNKL = append<4>(params.problem_shape, Int<1>{});
-    // Get the appropriate blocks for this thread block -- potential for thread block locality
-    auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
-    TiledMma tiled_mma;
-    // In a warp specialized kernel, collectives expose data movement and compute operations separately
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue);
-    // Prepare and partition the input tensors. Expects a tuple of tensors where:
-    // get<0>(load_inputs) is the tma tensor A after local tiling so that it has shape (BLK_M,BLK_K,m,k,l)
-    // get<1>(load_inputs) is the tma tensor B after local tiling so that it has shape (BLK_N,BLK_K,n,k,l)
-    auto load_inputs = collective_mainloop.load_init(problem_shape_MNKL, params.mainloop);
-    static_assert(cute::tuple_size_v<decltype(load_inputs)> >= 2, "Output of load_init must have at least two elements (A, B)");
-    // Extract out partitioned A and B.
-    Tensor gA_mkl = get<0>(load_inputs);
-    Tensor gB_nkl = get<1>(load_inputs);
-    // Compute m_coord, n_coord, and l_coord with their post-tiled shapes
-    auto m_coord = idx2crd(int(blockIdx.x), shape<2>(gA_mkl));
-    auto n_coord = idx2crd(int(blockIdx.y), shape<2>(gB_nkl));
-    auto l_coord = idx2crd(int(blockIdx.z), shape<4>(gB_nkl));
-    auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
-    // Get pipeline iterators and increments from tensor shapes
-    auto k_tile_iter  = cute::make_coord_iterator(shape<3>(gA_mkl));
-    auto k_tile_count = size<3>(gA_mkl);
-    // No-op call: the cluster / thread-block synchronization already ran when cluster_wait_fn was initialized above
-    cluster_wait_fn();
-    if (warp_group_role == WarpGroupRole::Producer) {
-      if (producer_warp_role == ProducerWarpRole::Warp0) {
-        if constexpr(SplitWarps) {
-          collective_mainloop.load_NK(
-            params.mainloop,
-            mainloop_pipeline,
-            prefetcher_pipeline,
-            mainloop_pipe_producer_state,
-            gB_nkl,
-            blk_coord,
-            k_tile_iter, k_tile_count,
-            lane_idx,
-            block_rank_in_cluster,
-            shared_storage.tensors.mainloop
-          );
-        }
-        else {
-          collective_mainloop.load(
-            params.mainloop,
-            mainloop_pipeline,
-            prefetcher_pipeline,
-            mainloop_pipe_producer_state,
-            gA_mkl, gB_nkl,
-            blk_coord,
-            k_tile_iter, k_tile_count,
-            lane_idx,
-            block_rank_in_cluster,
-            shared_storage.tensors.mainloop
-          );
-        }
-        // Update starting mainloop pipeline state for the pipeline drain
-        mainloop_pipe_producer_state.advance(k_tile_count);
-        // Make sure mainloop consumer has been waited upon before issuing epilogue load
-        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-        if (collective_epilogue.is_producer_load_needed()) {
-          // Ensure warp is converged before issuing epilogue loads
-          __syncwarp();
-          epi_load_pipe_producer_state = collective_epilogue.load(
-            epi_load_pipeline,
-            epi_load_pipe_producer_state,
-            problem_shape_MNKL,
-            blk_shape,
-            blk_coord,
-            tiled_mma,
-            lane_idx,
-            shared_storage.tensors.epilogue
-          );
-          collective_epilogue.load_tail(epi_load_pipeline, epi_load_pipe_producer_state);
-        }
-      }
-      else if (SplitWarps && producer_warp_role == ProducerWarpRole::Warp2) {  // plain `if`, not `if constexpr`: load_MK must compile in non-split mode too
-        collective_mainloop.load_MK(
-          params.mainloop,
-          mainloop_pipeline,
-          prefetcher_pipeline,
-          mainloop_pipe_producer_state,
-          gA_mkl,
-          blk_coord,
-          k_tile_iter, k_tile_count,
-          lane_idx,
-          block_rank_in_cluster,
-          shared_storage.tensors.mainloop
-        );
-        // Update starting mainloop pipeline state for the pipeline drain
-        mainloop_pipe_producer_state.advance(k_tile_count);
-        // Run the mainloop pipeline tail; unlike Warp0, this warp issues no epilogue load afterwards
-        collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state);
-      } else if (producer_warp_role == ProducerWarpRole::PrefetchMK && should_prefetch) {  // with prefetching off, the prefetch warp does no work here
-        collective_mainloop.prefetch_MK(
-          params.mainloop,
-          prefetcher_pipeline,
-          mainloop_pipe_producer_state,
-          gA_mkl,
-          blk_coord,
-          k_tile_iter, k_tile_count,
-          lane_idx,
-          block_rank_in_cluster,
-          shared_storage.tensors.mainloop
-        );
-      }
-    }
-    else if (warp_group_role == WarpGroupRole::Consumer) {
-      Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape));                 // (MMA,MMA_M,MMA_N)
-      collective_mainloop.mma(
-        mainloop_pipeline,
-        mainloop_pipe_consumer_state,
-        accumulators,
-        k_tile_count,
-        warp_group_thread_idx,
-        shared_storage.tensors.mainloop,
-        params.mainloop
-      );
-      // Make sure the math instructions are done and free buffers before entering the epilogue
-      collective_mainloop.mma_tail(
-        mainloop_pipeline,
-        mainloop_pipe_consumer_state,
-        k_tile_count
-      );
-      // Epilogue and write to gD
-      auto [epi_load_pipe_consumer_state_next, epi_store_pipe_producer_state_next] =
-      collective_epilogue.store(
-        epi_load_pipeline,
-        epi_load_pipe_consumer_state,
-        epi_store_pipeline,
-        epi_store_pipe_producer_state,
-        problem_shape_MNKL,
-        blk_shape,
-        blk_coord,
-        accumulators,
-        tiled_mma,
-        warp_group_thread_idx,
-        shared_storage.tensors.epilogue
-      );
-      collective_epilogue.store_tail(
-        epi_load_pipeline,
-        epi_load_pipe_consumer_state_next,
-        epi_store_pipeline,
-        epi_store_pipe_producer_state_next
-      );
-    }
-#endif
-  }
-};
-///////////////////////////////////////////////////////////////////////////////
-} // namespace cutlass::gemm::kernel