Kernels:

kernels-community
/

deep-gemm

Trusted publisher

Kernel card Files Files and versions

xet

Community

kernels-bot committed 6 days ago

Commit

d8e8ea2

verified ·

1 Parent(s): a7f41ae

Uploaded using `kernel-builder` (batch 8/32).

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/profiler/include/cutlass/profiler/trmm_operation_profiler.h +0 -222
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/GPU_Clock.hpp +0 -67
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/command_line.h +0 -324
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/cublas_wrappers.hpp +0 -528
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/debug.h +0 -143
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_dump.h +0 -187
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_groupnorm.h +0 -402
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_layernorm.h +0 -644
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_memory.h +0 -375
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_nchw_to_nhwc.h +0 -141
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_nhwc_padding.h +0 -276
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_nhwc_pooling.h +0 -573
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_nhwc_to_nchw.h +0 -144
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_rmsnorm.h +0 -186
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_utils.h +0 -127
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/distribution.h +0 -157
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/exceptions.h +0 -69
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/gett_commandline.hpp +0 -369
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/helper_cuda.hpp +0 -116
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_reorder.h +0 -111
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_tensor.h +0 -541
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_tensor_planar_complex.h +0 -591
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_uncompress.h +0 -157
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/index_sequence.h +0 -38
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/mixed_dtype_utils.hpp +0 -472
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/packed_stride.hpp +0 -570
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/print_error.hpp +0 -341
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/detail/inner_product.h +0 -135
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h +0 -94
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/convolution.h +0 -1549
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm.h +0 -385
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm_complex.h +0 -350
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h +0 -311
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gett.hpp +0 -146
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/gemm.h +0 -162
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h +0 -168
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +0 -159
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h +0 -355
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_compare.h +0 -250
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_fill.h +0 -2075
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +0 -142
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_reduce.h +0 -514
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_relu.h +0 -141
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/thread/gemm.h +0 -186
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/conv.hpp +0 -782
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/convolution.h +0 -802
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/error_metrics.h +0 -66
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm.h +0 -531
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm_complex.h +0 -210
build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h +0 -228

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/profiler/include/cutlass/profiler/trmm_operation_profiler.h DELETED Viewed

@@ -1,222 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/* \file
-   \brief Defines a math function
-*/
-#pragma once
-#include <vector>
-#include <string>
-#include <memory>
-#include <algorithm>
-#include <unordered_map>
-// CUTLASS Library includes
-#include "cutlass/blas3.h"
-#include "cutlass/library/library.h"
-#include "cutlass/library/util.h"
-#include "cutlass/library/manifest.h"
-// Profiler includes
-#include "options.h"
-#include "device_context.h"
-#include "operation_profiler.h"
-#include "performance_result.h"
-#include "problem_space.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace profiler {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Abstract base class for each math function
-class TrmmOperationProfiler : public OperationProfiler {
-public:
-  /// Problem structure obtained from problem space
-  struct TrmmProblem {
-    int64_t m;
-    int64_t n;
-    int64_t lda;
-    int64_t ldb;
-    int64_t ldd;
-    SideMode side_mode;
-    FillMode fill_mode;
-    DiagType diag_type;
-    std::vector<uint8_t> alpha;
-    std::vector<uint8_t> beta;
-    int64_t split_k_slices;
-    int64_t batch_count;
-    //
-    // Methods
-    //
-    TrmmProblem():
-      m(16), n(16), lda(0), ldb(0),  ldd(0), split_k_slices(1), batch_count(1) { }
-    /// Parses the problem
-    Status parse(
-      library::TrmmDescription const &operation_desc,
-      ProblemSpace const &problem_space,
-      ProblemSpace::Problem const &problem);
-    /// Initializes a performance result
-    void initialize_result(
-      PerformanceResult &result,
-      library::TrmmDescription const &operation_desc,
-      ProblemSpace const &problem_space);
-  };
-  /// Workspace used
-  struct TrmmWorkspace {
-    DeviceAllocation *A;
-    DeviceAllocation *B;
-    DeviceAllocation *D;
-    DeviceAllocation *Computed;
-    DeviceAllocation *Reference;
-    library::TrmmConfiguration configuration;
-    library::TrmmArguments arguments;
-    /// Buffer used for the operation's host workspace
-    std::vector<uint8_t> host_workspace;
-    /// Buffer used for the operations' device workspace
-    DeviceAllocation device_workspace;
-    //
-    // Methods
-    //
-    TrmmWorkspace():
-      A(nullptr), B(nullptr), D(nullptr), Computed(nullptr), Reference(nullptr) { }
-  };
-protected:
-  //
-  // Data members
-  //
-  /// GEMM problem obtained from problem space
-  TrmmProblem problem_;
-  /// Device memory allocations
-  TrmmWorkspace trmm_workspace_;
-public:
-  //
-  // Methods
-  //
-  /// Ctor
-  TrmmOperationProfiler(Options const &options);
-  /// Destructor
-  virtual ~TrmmOperationProfiler();
-  /// Prints usage statement for the math function
-  virtual void print_usage(std::ostream &out) const;
-  /// Prints examples
-  virtual void print_examples(std::ostream &out) const;
-  /// Extracts the problem dimensions
-  virtual Status initialize_configuration(
-    Options const &options,
-    PerformanceReport &report,
-    DeviceContext &device_context,
-    library::Operation const *operation,
-    ProblemSpace const &problem_space,
-    ProblemSpace::Problem const &problem);
-  /// Initializes workspace
-  virtual Status initialize_workspace(
-    Options const &options,
-    PerformanceReport &report,
-    DeviceContext &device_context,
-    library::Operation const *operation,
-    ProblemSpace const &problem_space,
-    ProblemSpace::Problem const &problem);
-  /// Verifies CUTLASS against references
-  virtual bool verify_cutlass(
-    Options const &options,
-    PerformanceReport &report,
-    DeviceContext &device_context,
-    library::Operation const *operation,
-    ProblemSpace const &problem_space,
-    ProblemSpace::Problem const &problem);
-  /// Measures performance results
-  virtual bool profile(
-    Options const &options,
-    PerformanceReport &report,
-    DeviceContext &device_context,
-    library::Operation const *operation,
-    ProblemSpace const &problem_space,
-    ProblemSpace::Problem const &problem);
-protected:
-  /// Initializes the performance result
-  void initialize_result_(
-    PerformanceResult &result,
-    Options const &options,
-    library::TrmmDescription const &operation_desc,
-    ProblemSpace const &problem_space);
-  /// Verifies CUTLASS against references
-  bool verify_with_cublas_(
-    Options const &options,
-    PerformanceReport &report,
-    DeviceContext &device_context,
-    library::Operation const *operation,
-    ProblemSpace const &problem_space,
-    ProblemSpace::Problem const &problem);
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace profiler
-} // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/GPU_Clock.hpp DELETED Viewed

@@ -1,67 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <cuda_runtime.h>
-struct GPU_Clock
-{
-  GPU_Clock() {
-    cudaEventCreate(&start_);
-    cudaEventCreate(&stop_);
-    cudaEventRecord(start_);
-  }
-  ~GPU_Clock() {
-    cudaEventDestroy(start_);
-    cudaEventDestroy(stop_);
-  }
-  void start() {
-    cudaEventRecord(start_);
-  }
-  float milliseconds() {
-    cudaEventRecord(stop_);
-    cudaEventSynchronize(stop_);
-    float time;
-    cudaEventElapsedTime(&time, start_, stop_);
-    return time;
-  }
-  float seconds() {
-    return milliseconds() * float(1e-3);
-  }
- private:
-  cudaEvent_t start_, stop_;
-};

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/command_line.h DELETED Viewed

@@ -1,324 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * Utility for parsing command line arguments
- */
-#include <iostream>
-#include <limits>
-#include <sstream>
-#include <string>
-#include <vector>
-#include <unordered_map>
-#include <cuda_runtime.h>
-#include "cutlass/cutlass.h"
-namespace cutlass {
-/******************************************************************************
- * command_line
- ******************************************************************************/
-/**
- * Utility for parsing command line arguments
- */
-struct CommandLine {
-  std::vector<std::string> keys;
-  std::vector<std::string> values;
-  std::vector<std::string> args;
-  /**
-   * Constructor
-   */
-  CommandLine(int argc, const char** argv) {
-    using namespace std;
-    for (int i = 1; i < argc; i++) {
-      string arg = argv[i];
-      if ((arg[0] != '-') || (arg[1] != '-')) {
-        args.push_back(arg);
-        continue;
-      }
-      string::size_type pos;
-      string key, val;
-      if ((pos = arg.find('=')) == string::npos) {
-        key = string(arg, 2, arg.length() - 2);
-        val = "";
-      } else {
-        key = string(arg, 2, pos - 2);
-        val = string(arg, pos + 1, arg.length() - 1);
-      }
-      keys.push_back(key);
-      values.push_back(val);
-    }
-  }
-  /**
-   * Constructor to represent a command line from a map of [argument] -> [value]
-   */
-  CommandLine(std::unordered_map<std::string, std::string>& arg_map) {
-    for (const auto& [key, value] : arg_map) {
-      keys.push_back(key);
-      values.push_back(value);
-    }
-  }
-  /**
-   * Checks whether a flag "--<flag>" is present in the commandline
-   */
-  bool check_cmd_line_flag(const char* arg_name) const {
-    using namespace std;
-    for (int i = 0; i < int(keys.size()); ++i) {
-      if (keys[i] == string(arg_name)) return true;
-    }
-    return false;
-  }
-  /**
-   * Returns number of naked (non-flag and non-key-value) commandline parameters
-   */
-  size_t num_naked_args() const {
-    return args.size();
-  }
-  /**
-   * Print naked (non-flag and non-key-value) commandline parameters
-   */
-  void print_naked_args(std::ostream &out) const {
-    for (auto arg : args) {
-      out << "   " << arg <<"\n";
-    }
-  }
-  /**
-   * Returns the commandline parameter for a given index (not including flags)
-   */
-  template <typename value_t>
-  void get_cmd_line_argument(size_t index, value_t& val) const {
-    using namespace std;
-    if (index < args.size()) {
-      istringstream str_stream(args[index]);
-      str_stream >> val;
-    }
-  }
-  /**
-   * Obtains the boolean value specified for a given commandline parameter --<flag>=<bool>
-   */
-  void get_cmd_line_argument(const char* arg_name, bool& val, bool _default) const {
-    val = _default;
-    if (check_cmd_line_flag(arg_name)) {
-      std::string value;
-      get_cmd_line_argument(arg_name, value);
-      val = !(value == "0" || value == "false");
-    }
-  }
-  /**
-   * Obtains the value specified for a given commandline parameter --<flag>=<value>
-   */
-  template <typename value_t>
-  void get_cmd_line_argument(const char* arg_name,
-                             value_t& val) const {
-    get_cmd_line_argument(arg_name, val, val);
-  }
-  /**
-   * Obtains the value specified for a given commandline parameter --<flag>=<value>
-   */
-  template <typename value_t>
-  void get_cmd_line_argument(const char* arg_name,
-                             value_t& val,
-                             value_t const& _default) const {
-    using namespace std;
-    val = _default;
-    for (int i = 0; i < int(keys.size()); ++i) {
-      if (keys[i] == string(arg_name)) {
-        istringstream str_stream(values[i]);
-        str_stream >> val;
-      }
-    }
-  }
-  /**
-   * Returns the values specified for a given commandline parameter --<flag>=<value>,<value>*
-   */
-  template <typename value_t>
-  void get_cmd_line_arguments(const char* arg_name,
-                              std::vector<value_t>& vals,
-                              char sep = ',') const {
-    using namespace std;
-    if (check_cmd_line_flag(arg_name)) {
-      // Clear any default values
-      vals.clear();
-      // Recover from multi-value string
-      for (size_t i = 0; i < keys.size(); ++i) {
-        if (keys[i] == string(arg_name)) {
-          string val_string(values[i]);
-          separate_string(val_string, vals, sep);
-        }
-      }
-    }
-  }
-  /**
-   * Returns the values specified for a given commandline parameter
-   * --<flag>=<value>,<value_start:value_end>*
-   */
-  void get_cmd_line_argument_pairs(const char* arg_name,
-                                   std::vector<std::pair<std::string, std::string> >& tokens,
-                                   char delim = ',',
-                                   char sep = ':') const {
-    if (check_cmd_line_flag(arg_name)) {
-      std::string value;
-      get_cmd_line_argument(arg_name, value);
-      tokenize(tokens, value, delim, sep);
-    }
-  }
-  /**
-   * Returns a list of ranges specified for a given commandline parameter
-   * --<flag>=<key:value>,<key:value>*
-   */
-  void get_cmd_line_argument_ranges(const char* arg_name,
-                                    std::vector<std::vector<std::string> >& vals,
-                                    char delim = ',',
-                                    char sep = ':') const {
-    std::vector<std::string> ranges;
-    get_cmd_line_arguments(arg_name, ranges, delim);
-    for (std::vector<std::string>::const_iterator range = ranges.begin();
-      range != ranges.end(); ++range) {
-      std::vector<std::string> range_vals;
-      separate_string(*range, range_vals, sep);
-      vals.push_back(range_vals);
-    }
-  }
-  /**
-   * The number of pairs parsed
-   */
-  int parsed_argc() const { return (int)keys.size(); }
-  //-------------------------------------------------------------------------
-  // Utility functions
-  //-------------------------------------------------------------------------
-  /// Tokenizes a comma-delimited list of string pairs delimited by ':'
-  static void tokenize(std::vector<std::pair<std::string, std::string> >& tokens,
-                       std::string const& str,
-                       char delim = ',',
-                       char sep = ':') {
-    // Home-built to avoid Boost dependency
-    size_t s_idx = 0;
-    size_t d_idx = std::string::npos;
-    while (s_idx < str.size()) {
-      d_idx = str.find_first_of(delim, s_idx);
-      size_t end_idx = (d_idx != std::string::npos ? d_idx : str.size());
-      size_t sep_idx = str.find_first_of(sep, s_idx);
-      size_t offset = 1;
-      if (sep_idx == std::string::npos || sep_idx >= end_idx) {
-        sep_idx = end_idx;
-        offset = 0;
-      }
-      std::pair<std::string, std::string> item(
-          str.substr(s_idx, sep_idx - s_idx),
-          str.substr(sep_idx + offset, end_idx - sep_idx - offset));
-      tokens.push_back(item);
-      s_idx = end_idx + 1;
-    }
-  }
-  /// Tokenizes a comma-delimited list of string pairs delimited by ':'
-  static void tokenize(std::vector<std::string>& tokens,
-                       std::string const& str,
-                       char delim = ',',
-                       char sep = ':') {
-    typedef std::vector<std::pair<std::string, std::string> > TokenVector;
-    typedef TokenVector::const_iterator token_iterator;
-    std::vector<std::pair<std::string, std::string> > token_pairs;
-    tokenize(token_pairs, str, delim, sep);
-    for (token_iterator tok = token_pairs.begin(); tok != token_pairs.end(); ++tok) {
-      tokens.push_back(tok->first);
-    }
-  }
-  template <typename value_t>
-  static void separate_string(std::string const& str,
-                              std::vector<value_t>& vals,
-                              char sep = ',') {
-    std::istringstream str_stream(str);
-    std::string::size_type old_pos = 0;
-    std::string::size_type new_pos = 0;
-    // Iterate <sep>-delimited values
-    value_t val;
-    while ((new_pos = str.find(sep, old_pos)) != std::string::npos) {
-      if (new_pos != old_pos) {
-        str_stream.width(new_pos - old_pos);
-        str_stream >> val;
-        vals.push_back(val);
-      }
-      // skip over delimiter
-      str_stream.ignore(1);
-      old_pos = new_pos + 1;
-    }
-    // Read last value
-    str_stream >> val;
-    vals.push_back(val);
-  }
-};
-}  // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/cublas_wrappers.hpp DELETED Viewed

@@ -1,528 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-//-- BLAM_DEBUG_OUT: streams msg to std::cerr when BLAM_DEBUG is defined, expands to nothing otherwise --
-#ifdef BLAM_DEBUG
-# include <iostream>
-# ifndef BLAM_DEBUG_OUT
-#  define BLAM_DEBUG_OUT(msg)    std::cerr << "BLAM: " << msg << std::endl
-#  define BLAM_DEBUG_OUT_2(msg)  std::cerr << msg << std::endl
-# endif // BLAM_DEBUG_OUT
-#else // BLAM_DEBUG not defined: both macros expand to nothing
-# ifndef BLAM_DEBUG_OUT
-#  define BLAM_DEBUG_OUT(msg)
-#  define BLAM_DEBUG_OUT_2(msg)
-# endif // BLAM_DEBUG_OUT
-#endif // BLAM_DEBUG
-// User could potentially define ComplexFloat/ComplexDouble instead of the cuda::std:: aliases below (define BLAM_COMPLEX_TYPES beforehand to skip them)
-#ifndef BLAM_COMPLEX_TYPES
-#define BLAM_COMPLEX_TYPES 1
-#include "cutlass/cutlass.h"
-#include CUDA_STD_HEADER(complex)
-namespace blam {
-template <typename T>
-using Complex       = cuda::std::complex<T>;
-using ComplexFloat  = cuda::std::complex<float>;
-using ComplexDouble = cuda::std::complex<double>;
-}
-#endif // BLAM_COMPLEX_TYPES
-// User could potentially define Half instead of the cute::half_t alias below (define BLAM_HALF_TYPE beforehand to skip it)
-#ifndef BLAM_HALF_TYPE
-#define BLAM_HALF_TYPE 1
-#include <cute/numeric/numeric_types.hpp>
-namespace blam {
-using Half = cute::half_t;
-}
-#endif // BLAM_HALF_TYPE
-namespace blam
-{
-namespace cublas
-{
-/// Maps a cuBLAS status code to a static, human-readable description.
-/// Codes that are not listed map to "CUBLAS_ERROR -- <unknown>".
-inline const char*
-cublas_get_error(cublasStatus_t status)
-{
-  const char* description = "CUBLAS_ERROR -- <unknown>";
-  if (status == CUBLAS_STATUS_SUCCESS) {
-    description = "CUBLAS_STATUS_SUCCESS";
-  } else if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
-    description = "CUBLAS_STATUS_NOT_INITIALIZED -- The cuBLAS library was not initialized.";
-  } else if (status == CUBLAS_STATUS_ALLOC_FAILED) {
-    description = "CUBLAS_STATUS_ALLOC_FAILED -- Resource allocation failed inside the cuBLAS library.";
-  } else if (status == CUBLAS_STATUS_INVALID_VALUE) {
-    description = "CUBLAS_STATUS_INVALID_VALUE -- An unsupported value or parameter was passed to the function.";
-  } else if (status == CUBLAS_STATUS_ARCH_MISMATCH) {
-    description = "CUBLAS_STATUS_ARCH_MISMATCH -- The function requires a feature absent from the device architecture.";
-  } else if (status == CUBLAS_STATUS_MAPPING_ERROR) {
-    description = "CUBLAS_STATUS_MAPPING_ERROR -- An access to GPU memory space failed.";
-  } else if (status == CUBLAS_STATUS_EXECUTION_FAILED) {
-    description = "CUBLAS_STATUS_EXECUTION_FAILED -- The GPU program failed to execute.";
-  } else if (status == CUBLAS_STATUS_INTERNAL_ERROR) {
-    description = "CUBLAS_STATUS_INTERNAL_ERROR -- An internal cuBLAS operation failed.";
-  } else if (status == CUBLAS_STATUS_NOT_SUPPORTED) {
-    description = "CUBLAS_STATUS_NOT_SUPPORTED -- The functionality requested is not supported.";
-  } else if (status == CUBLAS_STATUS_LICENSE_ERROR) {
-    description = "CUBLAS_STATUS_LICENSE_ERROR -- An error was detected when checking the current licensing.";
-  }
-  return description;
-}
-/// Returns true when `status` is anything other than CUBLAS_STATUS_SUCCESS.
-inline bool
-cublas_is_error(cublasStatus_t status)
-{
-  const bool succeeded = (status == CUBLAS_STATUS_SUCCESS);
-  return !succeeded;
-}
-// hgemm: Half GEMM; operands are reinterpreted as __half and forwarded to cublasGemmEx with CUDA_R_16F data and compute types.
-inline cublasStatus_t
-gemm(cublasHandle_t handle,
-     cublasOperation_t transA, cublasOperation_t transB,
-     int m, int n, int k,
-     const Half* alpha,
-     const Half* A, int ldA,
-     const Half* B, int ldB,
-     const Half* beta,
-     Half* C, int ldC)
-{
-  BLAM_DEBUG_OUT("cublasHgemm");  // NOTE(review): label says cublasHgemm, the call below is cublasGemmEx
-  return cublasGemmEx(handle, transA, transB,
-                      m, n, k,
-                      reinterpret_cast<const __half*>(alpha),
-                      reinterpret_cast<const __half*>(A), CUDA_R_16F, ldA,
-                      reinterpret_cast<const __half*>(B), CUDA_R_16F, ldB,
-                      reinterpret_cast<const __half*>(beta),
-                      reinterpret_cast<      __half*>(C), CUDA_R_16F, ldC,
-                      CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);  // compute type, algorithm
-}
-// mixed hf gemm: Half A/B (CUDA_R_16F) with float alpha/beta/C (CUDA_R_32F), forwarded to cublasGemmEx with a CUDA_R_32F compute type.
-inline cublasStatus_t
-gemm(cublasHandle_t handle,
-     cublasOperation_t transA, cublasOperation_t transB,
-     int m, int n, int k,
-     const float* alpha,
-     const Half* A, int ldA,
-     const Half* B, int ldB,
-     const float* beta,
-     float* C, int ldC)
-{
-  BLAM_DEBUG_OUT("cublasGemmEx mixed half-float");
-  return cublasGemmEx(handle, transA, transB,
-                      m, n, k,
-                      alpha,
-                      reinterpret_cast<const __half*>(A), CUDA_R_16F, ldA,
-                      reinterpret_cast<const __half*>(B), CUDA_R_16F, ldB,
-                      beta,
-                      C, CUDA_R_32F, ldC,
-                      CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);  // compute type, algorithm
-}
-// igemm: int8_t A/B (CUDA_R_8I) with int32_t alpha/beta/C (CUDA_R_32I), forwarded to cublasGemmEx with a CUDA_R_32I compute type.
-inline cublasStatus_t
-gemm(cublasHandle_t handle,
-     cublasOperation_t transA, cublasOperation_t transB,
-     int m, int n, int k,
-     const int32_t* alpha,
-     const int8_t* A, int ldA,
-     const int8_t* B, int ldB,
-     const int32_t* beta,
-     int32_t* C, int ldC)
-{
-  BLAM_DEBUG_OUT("cublasIgemm");  // NOTE(review): label says cublasIgemm, the call below is cublasGemmEx
-  return cublasGemmEx(handle, transA, transB,
-                      m, n, k,
-                      alpha,
-                      A, CUDA_R_8I, ldA,
-                      B, CUDA_R_8I, ldB,
-                      beta,
-                      C, CUDA_R_32I, ldC,
-                      CUDA_R_32I, CUBLAS_GEMM_DEFAULT_TENSOR_OP);  // compute type, algorithm
-}
-// sgemm: float GEMM, arguments forwarded unchanged to cublasSgemm.
-inline cublasStatus_t
-gemm(cublasHandle_t handle,
-     cublasOperation_t transA, cublasOperation_t transB,
-     int m, int n, int k,
-     const float* alpha,
-     const float* A, int ldA,
-     const float* B, int ldB,
-     const float* beta,
-     float* C, int ldC)
-{
-  BLAM_DEBUG_OUT("cublasSgemm");
-  return cublasSgemm(handle, transA, transB,
-                     m, n, k,
-                     alpha,
-                     A, ldA,
-                     B, ldB,
-                     beta,
-                     C, ldC);
-}
-// dgemm: double GEMM, arguments forwarded unchanged to cublasDgemm.
-inline cublasStatus_t
-gemm(cublasHandle_t handle,
-     cublasOperation_t transA, cublasOperation_t transB,
-     int m, int n, int k,
-     const double* alpha,
-     const double* A, int ldA,
-     const double* B, int ldB,
-     const double* beta,
-     double* C, int ldC)
-{
-  BLAM_DEBUG_OUT("cublasDgemm");
-  return cublasDgemm(handle, transA, transB,
-                     m, n, k,
-                     alpha,
-                     A, ldA,
-                     B, ldB,
-                     beta,
-                     C, ldC);
-}
-// cgemm: ComplexFloat GEMM; every pointer is reinterpreted as cuFloatComplex and forwarded to cublasCgemm.
-inline cublasStatus_t
-gemm(cublasHandle_t handle,
-     cublasOperation_t transA, cublasOperation_t transB,
-     int m, int n, int k,
-     const ComplexFloat* alpha,
-     const ComplexFloat* A, int ldA,
-     const ComplexFloat* B, int ldB,
-     const ComplexFloat* beta,
-     ComplexFloat* C, int ldC)
-{
-  BLAM_DEBUG_OUT("cublasCgemm");
-  return cublasCgemm(handle, transA, transB,
-                     m, n, k,
-                     reinterpret_cast<const cuFloatComplex*>(alpha),  // assumes ComplexFloat is layout-compatible with cuFloatComplex -- TODO confirm
-                     reinterpret_cast<const cuFloatComplex*>(A), ldA,
-                     reinterpret_cast<const cuFloatComplex*>(B), ldB,
-                     reinterpret_cast<const cuFloatComplex*>(beta),
-                     reinterpret_cast<cuFloatComplex*>(C), ldC);
-}
-// zgemm: ComplexDouble GEMM; every pointer is reinterpreted as cuDoubleComplex and forwarded to cublasZgemm.
-inline cublasStatus_t
-gemm(cublasHandle_t handle,
-     cublasOperation_t transA, cublasOperation_t transB,
-     int m, int n, int k,
-     const ComplexDouble* alpha,
-     const ComplexDouble* A, int ldA,
-     const ComplexDouble* B, int ldB,
-     const ComplexDouble* beta,
-     ComplexDouble* C, int ldC)
-{
-  BLAM_DEBUG_OUT("cublasZgemm");
-  return cublasZgemm(handle, transA, transB,
-                     m, n, k,
-                     reinterpret_cast<const cuDoubleComplex*>(alpha),  // assumes ComplexDouble is layout-compatible with cuDoubleComplex -- TODO confirm
-                     reinterpret_cast<const cuDoubleComplex*>(A), ldA,
-                     reinterpret_cast<const cuDoubleComplex*>(B), ldB,
-                     reinterpret_cast<const cuDoubleComplex*>(beta),
-                     reinterpret_cast<cuDoubleComplex*>(C), ldC);
-}
-// hgemm, strided-batched: Half pointers are reinterpreted as __half and forwarded to cublasHgemmStridedBatched; loA/loB/loC follow each leading dimension (presumably the per-batch strides -- see cuBLAS docs).
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const Half* alpha,
-           const Half* A, int ldA, int loA,
-           const Half* B, int ldB, int loB,
-           const Half* beta,
-           Half* C, int ldC, int loC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasHgemmStridedBatched");
-  return cublasHgemmStridedBatched(handle, transA, transB,
-                                   m, n, k,
-                                   reinterpret_cast<const __half*>(alpha),
-                                   reinterpret_cast<const __half*>(A), ldA, loA,
-                                   reinterpret_cast<const __half*>(B), ldB, loB,
-                                   reinterpret_cast<const __half*>(beta),
-                                   reinterpret_cast<__half*>(C), ldC, loC,
-                                   batch_size);
-}
-// sgemm, strided-batched: float arguments forwarded unchanged to cublasSgemmStridedBatched; loA/loB/loC follow each leading dimension (presumably the per-batch strides -- see cuBLAS docs).
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const float* alpha,
-           const float* A, int ldA, int loA,
-           const float* B, int ldB, int loB,
-           const float* beta,
-           float* C, int ldC, int loC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasSgemmStridedBatched");
-  return cublasSgemmStridedBatched(handle, transA, transB,
-                                   m, n, k,
-                                   alpha,
-                                   A, ldA, loA,
-                                   B, ldB, loB,
-                                   beta,
-                                   C, ldC, loC,
-                                   batch_size);
-}
-// dgemm, strided-batched: double arguments forwarded unchanged to cublasDgemmStridedBatched; loA/loB/loC follow each leading dimension (presumably the per-batch strides -- see cuBLAS docs).
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const double* alpha,
-           const double* A, int ldA, int loA,
-           const double* B, int ldB, int loB,
-           const double* beta,
-           double* C, int ldC, int loC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasDgemmStridedBatched");
-  return cublasDgemmStridedBatched(handle, transA, transB,
-                                   m, n, k,
-                                   alpha,
-                                   A, ldA, loA,
-                                   B, ldB, loB,
-                                   beta,
-                                   C, ldC, loC,
-                                   batch_size);
-}
-// cgemm, strided-batched: ComplexFloat pointers are reinterpreted as cuFloatComplex and forwarded to cublasCgemmStridedBatched; loA/loB/loC follow each leading dimension (presumably the per-batch strides -- see cuBLAS docs).
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const ComplexFloat* alpha,
-           const ComplexFloat* A, int ldA, int loA,
-           const ComplexFloat* B, int ldB, int loB,
-           const ComplexFloat* beta,
-           ComplexFloat* C, int ldC, int loC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasCgemmStridedBatched");
-  return cublasCgemmStridedBatched(handle, transA, transB,
-                                   m, n, k,
-                                   reinterpret_cast<const cuFloatComplex*>(alpha),
-                                   reinterpret_cast<const cuFloatComplex*>(A), ldA, loA,
-                                   reinterpret_cast<const cuFloatComplex*>(B), ldB, loB,
-                                   reinterpret_cast<const cuFloatComplex*>(beta),
-                                   reinterpret_cast<cuFloatComplex*>(C), ldC, loC,
-                                   batch_size);
-}
-// zgemm, strided-batched: ComplexDouble pointers are reinterpreted as cuDoubleComplex and forwarded to cublasZgemmStridedBatched; loA/loB/loC follow each leading dimension (presumably the per-batch strides -- see cuBLAS docs).
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const ComplexDouble* alpha,
-           const ComplexDouble* A, int ldA, int loA,
-           const ComplexDouble* B, int ldB, int loB,
-           const ComplexDouble* beta,
-           ComplexDouble* C, int ldC, int loC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasZgemmStridedBatched");
-  return cublasZgemmStridedBatched(handle, transA, transB,
-                                   m, n, k,
-                                   reinterpret_cast<const cuDoubleComplex*>(alpha),
-                                   reinterpret_cast<const cuDoubleComplex*>(A), ldA, loA,
-                                   reinterpret_cast<const cuDoubleComplex*>(B), ldB, loB,
-                                   reinterpret_cast<const cuDoubleComplex*>(beta),
-                                   reinterpret_cast<cuDoubleComplex*>(C), ldC, loC,
-                                   batch_size);
-}
-// hgemm, pointer-array batched: A/B/C are arrays of per-matrix pointers, forwarded to cublasHgemmBatched after dropping the array-level const and reinterpreting Half as __half.
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const Half* alpha,
-           const Half* const A[], int ldA,
-           const Half* const B[], int ldB,
-           const Half* beta,
-           Half* const C[], int ldC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasHgemmBatched");
-  return cublasHgemmBatched(handle, transA, transB,
-                            m, n, k,
-                            reinterpret_cast<const __half*>(alpha),
-                            reinterpret_cast<const __half**>(const_cast<const Half**>(A)), ldA,
-                            // A, ldA,   // cuBLAS 9.2 (cast-free alternative kept for reference)
-                            reinterpret_cast<const __half**>(const_cast<const Half**>(B)), ldB,
-                            // B, ldB,   // cuBLAS 9.2
-                            reinterpret_cast<const __half*>(beta),
-                            reinterpret_cast<__half**>(const_cast<Half**>(C)), ldC,
-                            // C, ldC,   // cuBLAS 9.2
-                            batch_size);
-}
-// sgemm, pointer-array batched: A/B/C are arrays of per-matrix pointers, forwarded to cublasSgemmBatched after const_cast drops the array-level const.
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const float* alpha,
-           const float* const A[], int ldA,
-           const float* const B[], int ldB,
-           const float* beta,
-           float* const C[], int ldC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasSgemmBatched");
-  return cublasSgemmBatched(handle, transA, transB,
-                            m, n, k,
-                            alpha,
-                            const_cast<const float**>(A), ldA,
-                            // A, ldA,   // cuBLAS 9.2 (cast-free alternative kept for reference)
-                            const_cast<const float**>(B), ldB,
-                            // B, ldB,   // cuBLAS 9.2
-                            beta,
-                            const_cast<float**>(C), ldC,
-                            // C, ldC,   // cuBLAS 9.2
-                            batch_size);
-}
-// dgemm, pointer-array batched: A/B/C are arrays of per-matrix pointers, forwarded to cublasDgemmBatched after const_cast drops the array-level const.
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const double* alpha,
-           const double* const A[], int ldA,
-           const double* const B[], int ldB,
-           const double* beta,
-           double* const C[], int ldC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasDgemmBatched");
-  return cublasDgemmBatched(handle, transA, transB,
-                            m, n, k,
-                            alpha,
-                            const_cast<const double**>(A), ldA,
-                            // A, ldA,   // cuBLAS 9.2 (cast-free alternative kept for reference)
-                            const_cast<const double**>(B), ldB,
-                            // B, ldB,   // cuBLAS 9.2
-                            beta,
-                            const_cast<double**>(C), ldC,
-                            // C, ldC,   // cuBLAS 9.2
-                            batch_size);
-}
-// cgemm, pointer-array batched: pointer arrays are reinterpreted as cuFloatComplex pointer arrays, stripped of the array-level const, and forwarded to cublasCgemmBatched.
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const ComplexFloat* alpha,
-           const ComplexFloat* const A[], int ldA,
-           const ComplexFloat* const B[], int ldB,
-           const ComplexFloat* beta,
-           ComplexFloat* const C[], int ldC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasCgemmBatched");
-  return cublasCgemmBatched(handle, transA, transB,
-                            m, n, k,
-                            reinterpret_cast<const cuFloatComplex*>(alpha),
-                            const_cast<const cuFloatComplex**>(reinterpret_cast<const cuFloatComplex* const *>(A)), ldA,
-                            //reinterpret_cast<const cuFloatComplex* const *>(A), ldA,  // cuBLAS 9.2 (alternative without the const_cast)
-                            const_cast<const cuFloatComplex**>(reinterpret_cast<const cuFloatComplex* const *>(B)), ldB,
-                            //reinterpret_cast<const cuFloatComplex* const *>(B), ldB,  // cuBLAS 9.2
-                            reinterpret_cast<const cuFloatComplex*>(beta),
-                            const_cast<cuFloatComplex**>(reinterpret_cast<cuFloatComplex* const *>(C)), ldC,
-                            //reinterpret_cast<cuFloatComplex* const *>(C), ldC,        // cuBLAS 9.2
-                            batch_size);
-}
-// zgemm, pointer-array batched: pointer arrays are reinterpreted as cuDoubleComplex pointer arrays, stripped of the array-level const, and forwarded to cublasZgemmBatched.
-inline cublasStatus_t
-gemm_batch(cublasHandle_t handle,
-           cublasOperation_t transA, cublasOperation_t transB,
-           int m, int n, int k,
-           const ComplexDouble* alpha,
-           const ComplexDouble* const A[], int ldA,
-           const ComplexDouble* const B[], int ldB,
-           const ComplexDouble* beta,
-           ComplexDouble* const C[], int ldC,
-           int batch_size)
-{
-  BLAM_DEBUG_OUT("cublasZgemmBatched");
-  return cublasZgemmBatched(handle, transA, transB,
-                            m, n, k,
-                            reinterpret_cast<const cuDoubleComplex*>(alpha),
-                            const_cast<const cuDoubleComplex**>(reinterpret_cast<const cuDoubleComplex* const *>(A)), ldA,
-                            //reinterpret_cast<const cuDoubleComplex* const *>(A), ldA,  // cuBLAS 9.2 (alternative without the const_cast)
-                            const_cast<const cuDoubleComplex**>(reinterpret_cast<const cuDoubleComplex* const *>(B)), ldB,
-                            //reinterpret_cast<const cuDoubleComplex* const *>(B), ldB,  // cuBLAS 9.2
-                            reinterpret_cast<const cuDoubleComplex*>(beta),
-                            const_cast<cuDoubleComplex**>(reinterpret_cast<cuDoubleComplex* const *>(C)), ldC,
-                            //reinterpret_cast<cuDoubleComplex* const *>(C), ldC,        // cuBLAS 9.2
-                            batch_size);
-}
-} // end namespace cublas
-} // end namespace blam

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/debug.h DELETED Viewed

@@ -1,143 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Contains code for debugging cutlass code
-*/
-#pragma once
-#include "device_dump.h"
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/******************************************************************************
- * Debug and logging macros
- ******************************************************************************/
-/**
- * Formats and prints the given message to stdout
- *
- * Host code (no __CUDA_ARCH__) expands to a plain printf. Device code prefixes
- * the message with the calling thread's blockIdx/threadIdx; there `format` is
- * pasted after a string literal, so it must itself be a string literal.
- * The whole definition is skipped when CUDA_LOG is already defined.
- *
- * NOTE(review): `__VA_ARGS__` follows a comma in both variants, so at least one
- * argument after `format` appears to be required. The device variant also ends
- * in its own `;`, unlike the host variant -- confirm before using it in an
- * unbraced if/else.
- */
-#if !defined(CUDA_LOG)
-#if !defined(__CUDA_ARCH__)
-#define CUDA_LOG(format, ...) printf(format, __VA_ARGS__)
-#else
-#define CUDA_LOG(format, ...)                              \
-  printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \
-         blockIdx.x,                                       \
-         blockIdx.y,                                       \
-         blockIdx.z,                                       \
-         threadIdx.x,                                      \
-         threadIdx.y,                                      \
-         threadIdx.z,                                      \
-         __VA_ARGS__);
-#endif
-#endif
-/**
- * Formats and prints the given message to stdout only if DEBUG is defined
- *
- * With DEBUG defined this forwards to CUDA_LOG; otherwise it expands to
- * nothing, so its arguments are not evaluated. Skipped when CUDA_LOG_DEBUG is
- * already defined.
- */
-#if !defined(CUDA_LOG_DEBUG)
-#ifdef DEBUG
-#define CUDA_LOG_DEBUG(format, ...) CUDA_LOG(format, __VA_ARGS__)
-#else
-#define CUDA_LOG_DEBUG(format, ...)
-#endif
-#endif
-/**
- * \brief Reports a non-zero CUDA error together with the expression and source
- * location that produced it.
- *
- * Host code writes the report, including cudaGetErrorString(error), to
- * \p stderr and flushes it; device code prints the report without the error
- * string via \p printf. A zero \p error prints nothing.
- *
- * \return \p error, unchanged.
- */
-__host__ CUTLASS_DEVICE cudaError_t cuda_perror_impl(cudaError_t error,
-                                                     const char* expression,
-                                                     const char* filename,
-                                                     int line) {
-  // Mark the location arguments as used regardless of which branch is compiled.
-  static_cast<void>(filename);
-  static_cast<void>(line);
-  if (!error) {
-    return error;
-  }
-#if defined(__CUDA_ARCH__)
-  printf("CUDA error %d [%s, %d] in expression '%s'\n", error, filename, line, expression);
-#else
-  fprintf(stderr,
-          "CUDA error %d [%s, %d] in expression '%s': %s\n",
-          error,
-          filename,
-          line,
-          expression,
-          cudaGetErrorString(error));
-  fflush(stderr);
-#endif
-  return error;
-}
-/**
- * \brief Perror macro
- *
- * Casts \p e to cudaError_t and passes it to cuda_perror_impl along with the
- * stringified expression, __FILE__ and __LINE__; evaluates to the value
- * cuda_perror_impl returns. Skipped when CUDA_PERROR is already defined.
- */
-#ifndef CUDA_PERROR
-#define CUDA_PERROR(e) cuda_perror_impl((cudaError_t)(e), #e, __FILE__, __LINE__)
-#endif
-/**
- * \brief Perror macro with exit
- *
- * Same report as cuda_perror_impl, then calls exit(1) when the error is
- * non-zero. Wrapped in do { ... } while (0) so it behaves as one statement.
- */
-#ifndef CUDA_PERROR_EXIT
-#define CUDA_PERROR_EXIT(e)                                     \
-  do { if (cuda_perror_impl((cudaError_t)(e), #e, __FILE__, __LINE__)) { \
-    exit(1);                                                    \
-  } } while (0)
-#endif
-/**
- * \brief Perror macro only if DEBUG is defined
- *
- * With DEBUG defined this is CUDA_PERROR(e); otherwise it expands to (e), so
- * the expression is still evaluated but nothing is reported.
- */
-#ifndef CUDA_PERROR_DEBUG
-#ifdef DEBUG
-#define CUDA_PERROR_DEBUG(e) CUDA_PERROR(e)
-#else
-#define CUDA_PERROR_DEBUG(e) (e)
-#endif
-#endif
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// A small helper class to dump a type at compile time (the struct is empty, so naming any member presumably forces a diagnostic that spells out T)
-// Usage:: DebugType<Class>::Class
-template <typename T>
-struct DebugType {};
-template <typename T>
-void DebugTypeFunc(T const& t) {
-  T::t;  // names a member `t` of T; presumably intended to fail on instantiation so the diagnostic shows T -- TODO confirm
-}
-// A small helper class to dump a compile time constant at compile time (the struct is empty, so naming any member presumably forces a diagnostic that shows Value)
-// Usage: DebugValue<Class::kConstant>::kConstant
-template <int Value>
-struct DebugValue {};

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_dump.h DELETED Viewed

@@ -1,187 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <cstdio>
-#include "cutlass/cutlass.h"
-/**
- * \file
- * \brief C++ interface to dump fragments and shared memory contents for
- * debugging.
- */
-namespace cutlass {
-namespace debug {
-/******************************************************************************
- * Dump the fragments
- ******************************************************************************/
-/// The first N threads dump the first M elements from their fragments with a
-/// stride of S elements.  If N is not specified, dump the data of all the
-/// threads.  If M is not specified, dump all the elements of the fragment.
-template <typename Fragment>
-CUTLASS_DEVICE void dump_fragment(Fragment const& frag, int N = 0, int M = 0,
-                                  int S = 1) {
-  int total_threads = blockDim.x * blockDim.y * blockDim.z;  // threads per block
-  int block_id =
-      blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;  // linearized block index
-  int thread_id = (threadIdx.z * (blockDim.x * blockDim.y)) +
-                  (threadIdx.y * blockDim.x) + threadIdx.x;  // linearized thread index within the block
-  if (N < 0 || N > total_threads) {  // N == 0 is accepted and means "all threads" (see below)
-    if (thread_id == 0 && block_id == 0)
-      printf("Thread number N = %d should between [1, %d].\n", N,
-             total_threads);
-    __syncthreads();
-    return;
-  }
-  int total_elements = int(frag.size());
-  if (M < 0 || M > total_elements) {  // M == 0 is accepted and means "all elements" (see below)
-    if (thread_id == 0 && block_id == 0)
-      printf("Element number M = %d should between [1, %d].\n", M,
-             total_elements);
-    __syncthreads();
-    return;
-  }
-  if (N == 0) N = total_threads;  // default: every thread of the block
-  if (M == 0) M = total_elements;  // default: the whole fragment
-  if (S < 1 || S > M) {
-    if (thread_id == 0 && block_id == 0)
-      printf("Stride S = %d should between [1, %d].\n", S, M);
-    __syncthreads();
-    return;
-  }
-  if (thread_id == 0 && block_id == 0)
-    printf("\n*******************Dumping the fragments*******************\n\n");
-  CUTLASS_PRAGMA_NO_UNROLL
-  for (int tid = 0; tid < N; ++tid) {  // one thread prints per iteration, in thread-index order
-    if (tid == thread_id) {
-      printf("TB%d W%d T%d: ", block_id, tid / 32, tid & 31);  // block, tid / 32 (warp), tid % 32 (lane)
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int i = 0; i < M; i += S) {
-        printf("%.0f ", float(typename Fragment::value_type(frag[i])));  // printed as float with no decimals
-      }
-      printf("\n");
-    }
-    __syncthreads();  // barrier reached by every thread on every iteration, not only the printing one
-  }
-  if (thread_id == 0 && block_id == 0)
-    printf("\n***********************************************************\n\n");
-  __syncthreads();
-  return;
-}
-/******************************************************************************
- * Dump the shared memory
- ******************************************************************************/
-#define SHMEM_ROW_SIZE 128  // bytes per printed row; dump_shmem divides this by sizeof(Element)
-/// Dump the shared memory contents.  ptr is the begin address, size specifies
-/// the number of elements that need to be dumped, and S specifies the stride.
-///
-/// Thread 0 of each block prints the elements as rows of
-/// SHMEM_ROW_SIZE / sizeof(Element) elements, taking every S-th element of a
-/// row.  Reads never go past `ptr + size`, so the last row is shorter when
-/// `size` is not a multiple of the row length.  The function contains
-/// __syncthreads() barriers on every path.
-template <typename Element>
-CUTLASS_DEVICE void dump_shmem(Element const* ptr, size_t size, int S = 1) {
-  int block_id =
-      blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
-  int thread_id = (threadIdx.z * (blockDim.x * blockDim.y)) +
-                  (threadIdx.y * blockDim.x) + threadIdx.x;
-  if (ptr == nullptr) {
-    if (thread_id == 0 && block_id == 0) printf("ptr is null.\n");
-    __syncthreads();
-    return;
-  }
-  if (size == 0) {
-    if (thread_id == 0 && block_id == 0)
-      printf("Element size is less than 1\n");
-    __syncthreads();
-    return;
-  }
-  int row_elements = SHMEM_ROW_SIZE / sizeof(Element);
-  // Also rejects element types wider than a row (row_elements == 0), which
-  // guarantees the row loop below always advances.
-  if (S < 1 || S > row_elements) {
-    if (thread_id == 0 && block_id == 0)
-      printf("Stride S = %d should between [1, %d].\n", S, row_elements);
-    __syncthreads();
-    return;
-  }
-  __syncthreads();
-  if (thread_id == 0) {
-    printf("\n********Dumping the shared memory of TB %d*******\n\n", block_id);
-    size_t const row_length = size_t(row_elements);
-    size_t const stride = size_t(S);
-    for (size_t row = 0; row < size; row += row_length) {
-      // Stop at `size` so a partial final row never reads out of bounds.
-      for (size_t col = 0; col < row_length && row + col < size; col += stride) {
-        printf("%.0f ", float(ptr[row + col]));
-      }
-      printf("\n");
-    }
-    printf("\n***********************************************************\n\n");
-  }
-  __syncthreads();
-  return;
-}
-}  // namespace debug
-}  // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_groupnorm.h DELETED Viewed

@@ -1,402 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * \brief cuda kernels to do group norm on a device memory tensor with NHWC layout. The tensor will be divided into [N, H, W, G, C'] and then we do normalization on [H, W, C'].
- */
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/tensor_ref.h"
-#include "device_utils.h"
-#include <cfloat>
-namespace cutlass {
-/** \brief interface to do group norm on a device memory tensor with NHWC layout.
- * \tparam T: data type
- */
-template <typename T>
-void groupnorm(cutlass::Tensor4DCoord input_size,
-               const int num_groups,
-               const float eps,
-               TensorRef<T, layout::TensorNHWC> ref_output,
-               TensorRef<T, layout::TensorNHWC> ref_input,
-               TensorRef<T, layout::TensorNHWC> ref_gamma,
-               TensorRef<T, layout::TensorNHWC> ref_beta,
-               cudaStream_t stream);
-extern __shared__ char groupnorm_shm[];
-// Two-pass group norm for small prod_dim1_to_last_dim/num_groups: every thread
-// keeps the elements it loaded in dynamic shared memory (groupnorm_shm), so the
-// input is read from global memory only once.
-// grid(num_groups, dim0)
-// block(BLOCKSIZE)
-// Dynamic shared memory must hold BLOCKSIZE * TVecs_PER_THREAD * T_PER_TVec
-// elements of T; loads at or beyond prod_dim1_to_last_dim/num_groups are skipped.
-// TVec is the vector type used for global loads/stores and packs T_PER_TVec
-// elements of the scalar type T.
-template<typename TVec, typename T, int T_PER_TVec>
-__global__ void groupnorm_twopass_store_locally(T*          output,
-                                                const T*    input,
-                                                const T*    gamma,
-                                                const T*    beta,
-                                                int         num_groups,
-                                                int         prod_dim1_to_last_dim,
-                                                int         last_dim,
-                                                const float eps,
-                                                const int   TVecs_PER_THREAD)
-{
-    const int   bid               = blockIdx.y;   // index of batch
-    const int   gid               = blockIdx.x;   // index of group
-    const int   tid               = threadIdx.x;  // index of thread
-    const int   bdimx             = blockDim.x;
-    const int   s_reduce_elements = prod_dim1_to_last_dim / num_groups;
-    const int   s_group_stride    = last_dim / num_groups;
-    const int   v_group_stride    = s_group_stride / T_PER_TVec;
-    // The batch offset is computed in 64 bits: bid * prod_dim1_to_last_dim can
-    // exceed the range of int for large tensors.
-    const long long offset_of_group =
-        (static_cast<long long>(bid) * prod_dim1_to_last_dim + gid * s_group_stride) / T_PER_TVec;
-    const TVec* input_TVec_ptr    = (const TVec*)(input) + offset_of_group;
-    TVec*       output_TVec_ptr   = (TVec*)(output) + offset_of_group;
-    // Each thread owns a private slice of the dynamic shared memory buffer.
-    T*          local_val         = ((T*)groupnorm_shm) + TVecs_PER_THREAD * T_PER_TVec * tid;
-    float       local_sum[1]      = {0.0f};
-// load from global memory into shared memory
-#pragma unroll
-    for (int i = 0; i < TVecs_PER_THREAD; i += 1) {
-        const int current_load_start_idx = (i * bdimx + tid) * T_PER_TVec;
-        if (current_load_start_idx < s_reduce_elements) {
-            // Map the linear index inside the group to its position in the NHWC row.
-            const int offset_in_group =
-                ((current_load_start_idx / s_group_stride) * last_dim + (current_load_start_idx % s_group_stride))
-                / T_PER_TVec;
-            TVec      tmp_vec          = input_TVec_ptr[offset_in_group];
-            T*        tmp_vec_ptr      = (T*)(&tmp_vec);
-            const int local_val_offset = i * T_PER_TVec;
-#pragma unroll
-            for (int j = 0; j < T_PER_TVec; j++) {
-                float tmp = static_cast<float>(tmp_vec_ptr[j]);
-                local_sum[0] += tmp;
-                local_val[local_val_offset + j] = tmp_vec_ptr[j];
-            }
-        }
-    }
-    __shared__ float s_mean, s_variance;
-    // reduction for mean
-    if (bdimx <= 32) {
-        warpReduceSum<float, 1>(local_sum);
-    }
-    else {
-        blockReduceSum<float, 1>(local_sum);
-    }
-    if (tid == 0) {
-        s_mean = local_sum[0] / s_reduce_elements;
-    }
-    __syncthreads();
-    // reduction for std
-    local_sum[0] = 0.0f;
-#pragma unroll
-    for (int i = 0; i < TVecs_PER_THREAD; i += 1) {
-        const int current_load_start_idx = (i * bdimx + tid) * T_PER_TVec;
-        if (current_load_start_idx < s_reduce_elements) {
-            const int local_val_offset = i * T_PER_TVec;
-#pragma unroll
-            for (int j = 0; j < T_PER_TVec; j++) {
-                float tmp = static_cast<float>(local_val[local_val_offset + j]);
-                tmp -= s_mean;
-                local_sum[0] += tmp * tmp;
-            }
-        }
-    }
-    if (bdimx <= 32) {
-        warpReduceSum<float, 1>(local_sum);
-    }
-    else {
-        blockReduceSum<float, 1>(local_sum);
-    }
-    if (tid == 0) {
-        // s_variance holds the reciprocal standard deviation, 1 / sqrt(var + eps).
-        s_variance = rsqrtf(local_sum[0] / s_reduce_elements + eps);
-    }
-    __syncthreads();
-    // normalize
-    const int   gamma_offset_of_group = gid * v_group_stride;
-    const TVec* gamma_TVec_ptr        = (const TVec*)gamma + gamma_offset_of_group;
-    const TVec* beta_TVec_ptr         = (const TVec*)beta + gamma_offset_of_group;
-#pragma unroll
-    for (int i = 0; i < TVecs_PER_THREAD; i += 1) {
-        const int current_load_start_idx = (i * bdimx + tid) * T_PER_TVec;
-        if (current_load_start_idx < s_reduce_elements) {
-            const int offset_in_group =
-                ((current_load_start_idx / s_group_stride) * last_dim + (current_load_start_idx % s_group_stride))
-                / T_PER_TVec;
-            const int gamma_offset_in_group = (current_load_start_idx % s_group_stride) / T_PER_TVec;
-            const int local_val_offset      = i * T_PER_TVec;
-            TVec gamma_val     = gamma_TVec_ptr[gamma_offset_in_group];
-            TVec beta_val      = beta_TVec_ptr[gamma_offset_in_group];
-            T*   gamma_val_ptr = (T*)(&gamma_val);
-            T*   beta_val_ptr  = (T*)(&beta_val);
-            TVec tmp_vec;
-            T*   tmp_vec_ptr = (T*)(&tmp_vec);
-#pragma unroll
-            for (int j = 0; j < T_PER_TVec; j++) {
-                float tmp = (static_cast<float>(local_val[local_val_offset + j]) - s_mean) * s_variance
-                                * static_cast<float>(gamma_val_ptr[j])
-                            + static_cast<float>(beta_val_ptr[j]);
-                // Types with the size of half go through an explicit float->half rounding.
-                if (sizeof(T) == sizeof(half)) {
-                    tmp_vec_ptr[j] = T(__float2half_rn(tmp));
-                }
-                else {
-                    tmp_vec_ptr[j] = T(tmp);
-                }
-            }
-            output_TVec_ptr[offset_in_group] = tmp_vec;
-        }
-    }
-}
-// Two-pass group norm for large prod_dim1_to_last_dim/num_groups, where the
-// data cannot be stored locally: the input is re-read from global memory in
-// each of the three phases (mean, variance, normalize).
-// grid(num_groups, dim0)
-// block(BLOCKSIZE)
-// Each thread visits up to TVecs_PER_THREAD vectors; loads at or beyond
-// prod_dim1_to_last_dim/num_groups are skipped.
-// TVec is the vector type used for global loads/stores and packs T_PER_TVec
-// elements of the scalar type T.
-template<typename TVec, typename T, int T_PER_TVec>
-__global__ void groupnorm_twopass_multiple_load(T*          output,
-                                                const T*    input,
-                                                const T*    gamma,
-                                                const T*    beta,
-                                                int         num_groups,
-                                                int         prod_dim1_to_last_dim,
-                                                int         last_dim,
-                                                const float eps,
-                                                const int   TVecs_PER_THREAD)
-{
-    const int   bid               = blockIdx.y;   // index of batch
-    const int   gid               = blockIdx.x;   // index of group
-    const int   tid               = threadIdx.x;  // index of thread
-    const int   bdimx             = blockDim.x;
-    const int   s_reduce_elements = prod_dim1_to_last_dim / num_groups;
-    const int   s_group_stride    = last_dim / num_groups;
-    const int   v_group_stride    = s_group_stride / T_PER_TVec;
-    // The batch offset is computed in 64 bits: bid * prod_dim1_to_last_dim can
-    // exceed the range of int for large tensors.
-    const long long offset_of_group =
-        (static_cast<long long>(bid) * prod_dim1_to_last_dim + gid * s_group_stride) / T_PER_TVec;
-    const TVec* input_TVec_ptr    = (const TVec*)(input) + offset_of_group;
-    TVec*       output_TVec_ptr   = (TVec*)(output) + offset_of_group;
-    float       local_sum[1]      = {0.0f};
-    // first pass over the input: accumulate the sum for the mean
-#pragma unroll
-    for (int i = 0; i < TVecs_PER_THREAD; i += 1) {
-        const int current_load_start_idx = (i * bdimx + tid) * T_PER_TVec;
-        if (current_load_start_idx < s_reduce_elements) {
-            // Map the linear index inside the group to its position in the NHWC row.
-            const int offset_in_group =
-                ((current_load_start_idx / s_group_stride) * last_dim + (current_load_start_idx % s_group_stride))
-                / T_PER_TVec;
-            TVec tmp_vec     = input_TVec_ptr[offset_in_group];
-            T*   tmp_vec_ptr = (T*)(&tmp_vec);
-#pragma unroll
-            for (int j = 0; j < T_PER_TVec; j++) {
-                float tmp = static_cast<float>(tmp_vec_ptr[j]);
-                local_sum[0] += tmp;
-            }
-        }
-    }
-    __shared__ float s_mean, s_variance;
-    // reduction for mean
-    if (bdimx <= 32) {
-        warpReduceSum<float, 1>(local_sum);
-    }
-    else {
-        blockReduceSum<float, 1>(local_sum);
-    }
-    if (tid == 0) {
-        s_mean = local_sum[0] / s_reduce_elements;
-    }
-    __syncthreads();
-    // reduction for std
-    local_sum[0] = 0.0f;
-#pragma unroll
-    for (int i = 0; i < TVecs_PER_THREAD; i += 1) {
-        const int current_load_start_idx = (i * bdimx + tid) * T_PER_TVec;
-        if (current_load_start_idx < s_reduce_elements) {
-            const int offset_in_group =
-                ((current_load_start_idx / s_group_stride) * last_dim + (current_load_start_idx % s_group_stride))
-                / T_PER_TVec;
-            TVec tmp_vec     = input_TVec_ptr[offset_in_group];
-            T*   tmp_vec_ptr = (T*)(&tmp_vec);
-#pragma unroll
-            for (int j = 0; j < T_PER_TVec; j++) {
-                float tmp = static_cast<float>(tmp_vec_ptr[j]);
-                tmp -= s_mean;
-                local_sum[0] += tmp * tmp;
-            }
-        }
-    }
-    if (bdimx <= 32) {
-        warpReduceSum<float, 1>(local_sum);
-    }
-    else {
-        blockReduceSum<float, 1>(local_sum);
-    }
-    if (tid == 0) {
-        // s_variance holds the reciprocal standard deviation, 1 / sqrt(var + eps).
-        s_variance = rsqrtf(local_sum[0] / s_reduce_elements + eps);
-    }
-    __syncthreads();
-    // normalize
-    const int   gamma_offset_of_group = gid * v_group_stride;
-    const TVec* gamma_TVec_ptr        = (const TVec*)gamma + gamma_offset_of_group;
-    const TVec* beta_TVec_ptr         = (const TVec*)beta + gamma_offset_of_group;
-#pragma unroll
-    for (int i = 0; i < TVecs_PER_THREAD; i += 1) {
-        const int current_load_start_idx = (i * bdimx + tid) * T_PER_TVec;
-        if (current_load_start_idx < s_reduce_elements) {
-            const int offset_in_group =
-                ((current_load_start_idx / s_group_stride) * last_dim + (current_load_start_idx % s_group_stride))
-                / T_PER_TVec;
-            const int gamma_offset_in_group = (current_load_start_idx % s_group_stride) / T_PER_TVec;
-            TVec      gamma_val             = gamma_TVec_ptr[gamma_offset_in_group];
-            TVec      beta_val              = beta_TVec_ptr[gamma_offset_in_group];
-            T*        gamma_val_ptr         = (T*)(&gamma_val);
-            T*        beta_val_ptr          = (T*)(&beta_val);
-            TVec      tmp_vec               = input_TVec_ptr[offset_in_group];
-            T*        tmp_vec_ptr           = (T*)(&tmp_vec);
-            TVec      output_tmp_vec;
-            T*        output_tmp_vec_ptr = (T*)(&output_tmp_vec);
-#pragma unroll
-            for (int j = 0; j < T_PER_TVec; j++) {
-                float tmp =
-                    (static_cast<float>(tmp_vec_ptr[j]) - s_mean) * s_variance * static_cast<float>(gamma_val_ptr[j])
-                    + static_cast<float>(beta_val_ptr[j]);
-                // Types with the size of half go through an explicit float->half rounding.
-                if (sizeof(T) == sizeof(half)) {
-                    output_tmp_vec_ptr[j] = T(__float2half_rn(tmp));
-                }
-                else {
-                    output_tmp_vec_ptr[j] = T(tmp);
-                }
-            }
-            output_TVec_ptr[offset_in_group] = output_tmp_vec;
-        }
-    }
-}
-//ref_input & ref_output should be [N, H, W, C]
-//ref_gamma & ref_beta should be [1, 1, 1, C]
-// Launches a two-pass group norm kernel on `stream`.
-//  - num_groups must be positive and divide C; otherwise an error is printed
-//    and nothing is launched.
-//  - When the per-group channel count is even, data is accessed two elements
-//    at a time (float2 for T == float, half2 for every other T).
-//  - The shared-memory kernel is used while its buffer stays below 48KB,
-//    otherwise the kernel that re-reads global memory is launched.
-template <typename T>
-void groupnorm(cutlass::Tensor4DCoord input_size,
-               const int num_groups,
-               const float eps,
-               TensorRef<T, layout::TensorNHWC> ref_output,
-               TensorRef<T, layout::TensorNHWC> ref_input,
-               TensorRef<T, layout::TensorNHWC> ref_gamma,
-               TensorRef<T, layout::TensorNHWC> ref_beta,
-               cudaStream_t stream){
-  const int N = input_size.n();
-  const int H = input_size.h();
-  const int W = input_size.w();
-  const int C = input_size.c();
-  // Checked first so the modulo and the divisions below never see zero.
-  if (num_groups <= 0){
-    printf("[ERROR] num_groups should be positive.\n");
-    return;
-  }
-  if (C % num_groups != 0){
-    printf("[ERROR] C should be a multiple of num_groups.\n");
-    return;
-  }
-  T* output = ref_output.data();
-  const T* input = ref_input.data();
-  const T* gamma = ref_gamma.data();
-  const T* beta = ref_beta.data();
-  const int dim0 = N;
-  const int last_dim = C;
-  const int prod_dim1_to_last_dim = H*W*C;
-  const int s_reduce_elements = prod_dim1_to_last_dim / num_groups;
-  const int s_group_stride = last_dim / num_groups;
-  // Two elements per access are only possible when a group has an even channel count.
-  const bool use_vec2 = (s_group_stride % 2 == 0);
-  const int  t_per_tvec = use_vec2 ? 2 : 1;
-  const int  v_reduce_elements = s_reduce_elements / t_per_tvec;
-  // Double the block size from 32 up to 1024 until a thread handles at most 8 vectors.
-  int threadblock_size = 32;
-  while (threadblock_size < 1024 && v_reduce_elements / threadblock_size > 8) {
-    threadblock_size *= 2;
-  }
-  dim3      grid(num_groups, dim0);
-  dim3      block(threadblock_size);
-  const int TVec_PER_THREAD = (v_reduce_elements + threadblock_size - 1) / threadblock_size;
-  // size_t keeps the byte count from wrapping around for very large groups.
-  const size_t shm_size = static_cast<size_t>(t_per_tvec) * TVec_PER_THREAD * threadblock_size * sizeof(T);
-  // for small s_reduce_elements, specific case for H=W=22, C=1280, num_groups=32;
-  // the size of grid & block may have better choice for different cases.
-  // ensure shared memory is smaller than 48KB
-  const size_t shm_limit = 48 * 1024;
-  const bool   store_locally = shm_size < shm_limit;
-  if (use_vec2) {
-    if (std::is_same<T, float>::value){
-      if (store_locally) {
-        groupnorm_twopass_store_locally<float2, T, 2><<<grid, block, shm_size, stream>>>(
-          output, input, gamma, beta, num_groups, prod_dim1_to_last_dim, last_dim, eps, TVec_PER_THREAD);
-      }
-      else {
-        groupnorm_twopass_multiple_load<float2, T, 2><<<grid, block, 0, stream>>>(
-          output, input, gamma, beta, num_groups, prod_dim1_to_last_dim, last_dim, eps, TVec_PER_THREAD);
-      }
-    }
-    else{
-      if (store_locally) {
-        groupnorm_twopass_store_locally<half2, T, 2><<<grid, block, shm_size, stream>>>(
-          output, input, gamma, beta, num_groups, prod_dim1_to_last_dim, last_dim, eps, TVec_PER_THREAD);
-      }
-      else {
-        groupnorm_twopass_multiple_load<half2, T, 2><<<grid, block, 0, stream>>>(
-          output, input, gamma, beta, num_groups, prod_dim1_to_last_dim, last_dim, eps, TVec_PER_THREAD);
-      }
-    }
-  }
-  else {
-    if (store_locally) {
-      groupnorm_twopass_store_locally<T, T, 1><<<grid, block, shm_size, stream>>>(
-        output, input, gamma, beta, num_groups, prod_dim1_to_last_dim, last_dim, eps, TVec_PER_THREAD);
-    }
-    else {
-      groupnorm_twopass_multiple_load<T, T, 1><<<grid, block, 0, stream>>>(
-        output, input, gamma, beta, num_groups, prod_dim1_to_last_dim, last_dim, eps, TVec_PER_THREAD);
-    }
-  }
-}
-} //namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_layernorm.h DELETED Viewed

@@ -1,644 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * \brief cuda kernels to do layernorm on a device memory tensor with RowMajor layout.
- */
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/tensor_ref.h"
-#include "device_utils.h"
-#include <cfloat>
-namespace cutlass {
-/** \brief interface to do layernorm on a device memory tensor with RowMajor layout.
- * \tparam T: data type
- */
-template <typename T>
-void layernorm(cutlass::MatrixCoord tensor_size,
-               TensorRef<T, layout::RowMajor> ref_output,
-               TensorRef<T, layout::RowMajor> ref_input,
-               TensorRef<T, layout::RowMajor> ref_gamma,
-               TensorRef<T, layout::RowMajor> ref_beta,
-               cudaStream_t stream);
-/**
- * Two-pass layernorm; each block normalizes one row and every thread keeps the
- * elements it loaded in a local array between the passes.
- * output [m, n] row-major
- * input [m, n] row-major
- * gamma [n]
- * beta [n]
- * grid(m)
- * block(block_size) -- each block deals with n elements ; each thread deals with ITEM_PER_THREAD elements,
- *                      so block_size * ITEM_PER_THREAD has to be >= n for the whole row to be covered
- * The epsilon added to the variance is fixed to 1e-5.
-*/
-template<typename T, int ITEM_PER_THREAD>
-__global__ void layernorm_twoPassAlgo_stored_locally_e1(T* output,
-                                                        const T* input,
-                                                        const T* gamma,
-                                                        const T* beta,
-                                                        const int m,
-                                                        const int n)
-{
-  const int m_idx = blockIdx.x;
-  const int tid = threadIdx.x;
-  const int bdimx = blockDim.x;
-  __shared__ float s_mean, s_variance;
-  T local_val[ITEM_PER_THREAD];
-  float local_sums[1] = {0.0f};
-  // 64-bit row offset: m_idx * n can exceed the range of int for large matrices.
-  const long long offset = static_cast<long long>(m_idx) * n;
-  input += offset;
-  output += offset;
-  const T zero = T(0.0f);
-  // pass 1: load the row (zero-padded past n) and accumulate the sum for the mean
-  #pragma unroll
-  for (int i = 0 ; i < ITEM_PER_THREAD ; i++){
-    int index = tid + i*bdimx;
-    local_val[i] = index < n ? input[index] : zero;
-    local_sums[0] += static_cast<float>(local_val[i]);
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    s_mean = local_sums[0] / n;
-  }
-  __syncthreads();
-  // pass 2: accumulate squared deviations; padded entries are excluded
-  local_sums[0] = 0.0f;
-  #pragma unroll
-  for (int i = 0 ; i < ITEM_PER_THREAD ; i++){
-    int index = tid + i*bdimx;
-    if (index < n){
-      const float tmp = static_cast<float>(local_val[i]) - s_mean;
-      local_sums[0] += tmp * tmp;
-    }
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    // s_variance holds the reciprocal standard deviation.
-    s_variance = rsqrtf(local_sums[0] / n + 1e-5);
-  }
-  __syncthreads();
-  // normalize, then scale by gamma and shift by beta
-  #pragma unroll
-  for (int i = 0 ; i < ITEM_PER_THREAD ; i++){
-    int index = tid + i*bdimx;
-    if (index < n) {
-      const T gamma_val = gamma[index];
-      const T beta_val = beta[index];
-      output[index] = T((static_cast<float>(local_val[i]) - s_mean) * s_variance * static_cast<float>(gamma_val) + static_cast<float>(beta_val));
-    }
-  }
-}
-/**
- * Two-pass layernorm on rows addressed as n/2 two-element vectors (T2); every
- * thread keeps the vectors it loaded in a local array between the passes.
- * output [m, n] row-major
- * input [m, n] row-major
- * gamma [n]
- * beta [n]
- * grid(m)
- * block(block_size) -- each block deals with block_size*ITEM_PER_THREAD*2 elements;
- * Rows are indexed with n/2, so n is expected to be even.
- * The epsilon added to the variance is fixed to 1e-5.
-*/
-template<typename T2, typename T, int ITEM_PER_THREAD>
-__global__ void layernorm_twoPassAlgo_stored_locally_e2(T2* output,
-                                                        const T2* input,
-                                                        const T2* gamma,
-                                                        const T2* beta,
-                                                        const int m,
-                                                        const int n)
-{
-  const int m_idx = blockIdx.x;
-  const int tid = threadIdx.x;
-  const int bdimx = blockDim.x;
-  __shared__ float s_mean, s_variance;
-  float local_sums[1] = {0.0f};
-  T2 local_val[ITEM_PER_THREAD];
-  const int n_2 = n / 2;
-  // 64-bit row offset: m_idx * n_2 can exceed the range of int for large matrices.
-  const long long offset = static_cast<long long>(m_idx) * n_2;
-  input += offset;
-  output += offset;
-  const T2 zero = {T(0.0f), T(0.0f)};
-  // pass 1: load the row (zero-padded past n_2) and accumulate the sum for the mean
-  // (the pragma is spelled in lowercase, the form documented for nvcc)
-  #pragma unroll
-  for (int i = 0; i < ITEM_PER_THREAD; i += 1) {
-    const int index = i*bdimx + tid;
-    local_val[i] = index < n_2 ? input[index] : zero;
-    local_sums[0] += static_cast<float>(local_val[i].x) + static_cast<float>(local_val[i].y);
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    s_mean = local_sums[0] / n;
-  }
-  __syncthreads();
-  // pass 2: accumulate squared deviations; padded entries are excluded
-  local_sums[0] = 0.0f;
-  #pragma unroll
-  for (int i = 0; i < ITEM_PER_THREAD; i += 1) {
-    const int index = i*bdimx + tid;
-    if (index < n_2){
-      const float2 tmp = {static_cast<float>(local_val[i].x) - s_mean,
-                          static_cast<float>(local_val[i].y) - s_mean};
-      local_sums[0] += tmp.x * tmp.x + tmp.y * tmp.y;
-    }
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    // s_variance holds the reciprocal standard deviation.
-    s_variance = rsqrtf(local_sums[0] / n + 1e-5);
-  }
-  __syncthreads();
-  // normalize, then scale by gamma and shift by beta
-  #pragma unroll
-  for (int i = 0; i < ITEM_PER_THREAD; i += 1) {
-    const int index = i*bdimx + tid;
-    if (index < n_2){
-      const T2 gamma_val = gamma[index];
-      const T2 beta_val = beta[index];
-      T2 tmp;
-      tmp.x = T((static_cast<float>(local_val[i].x) - s_mean)*s_variance*static_cast<float>(gamma_val.x) + static_cast<float>(beta_val.x));
-      tmp.y = T((static_cast<float>(local_val[i].y) - s_mean)*s_variance*static_cast<float>(gamma_val.y) + static_cast<float>(beta_val.y));
-      output[index] = tmp;
-    }
-  }
-}
-/**
- * Two-pass layernorm on rows addressed as n/4 four-element vectors (T4); every
- * thread keeps the vectors it loaded in a local array between the passes.
- * output [m, n] row-major
- * input [m, n] row-major
- * gamma [n]
- * beta [n]
- * grid(m)
- * block(block_size) -- each block deals with block_size*ITEM_PER_THREAD*4 elements;
- * Rows are indexed with n/4, so n is expected to be a multiple of 4.
- * The epsilon added to the variance is fixed to 1e-5.
-*/
-template<typename T4, typename T, int ITEM_PER_THREAD>
-__global__ void layernorm_twoPassAlgo_stored_locally_e4(T4* output,
-                                                        const T4* input,
-                                                        const T4* gamma,
-                                                        const T4* beta,
-                                                        const int m,
-                                                        const int n)
-{
-  const int m_idx = blockIdx.x;
-  const int tid = threadIdx.x;
-  const int bdimx = blockDim.x;
-  __shared__ float s_mean, s_variance;
-  float local_sums[1] = {0.0f};
-  T4 local_val[ITEM_PER_THREAD];
-  const int n_4 = n / 4;
-  // 64-bit row offset: m_idx * n_4 can exceed the range of int for large matrices.
-  const long long offset = static_cast<long long>(m_idx) * n_4;
-  input += offset;
-  output += offset;
-  const T4 zero = {T(0.0f), T(0.0f), T(0.0f), T(0.0f)};
-  // pass 1: load the row (zero-padded past n_4) and accumulate the sum for the mean
-  // (the pragma is spelled in lowercase, the form documented for nvcc)
-  #pragma unroll
-  for (int i = 0; i < ITEM_PER_THREAD; i += 1) {
-    const int index = i*bdimx + tid;
-    local_val[i] = index < n_4 ? input[index] : zero;
-    local_sums[0] += static_cast<float>(local_val[i].x) + static_cast<float>(local_val[i].y) +
-                     static_cast<float>(local_val[i].z) + static_cast<float>(local_val[i].w);
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    s_mean = local_sums[0] / n;
-  }
-  __syncthreads();
-  // pass 2: accumulate squared deviations; padded entries are excluded
-  local_sums[0] = 0.0f;
-  #pragma unroll
-  for (int i = 0; i < ITEM_PER_THREAD; i += 1) {
-    const int index = i*bdimx + tid;
-    if (index < n_4){
-      const float4 tmp = {static_cast<float>(local_val[i].x) - s_mean,
-                          static_cast<float>(local_val[i].y) - s_mean,
-                          static_cast<float>(local_val[i].z) - s_mean,
-                          static_cast<float>(local_val[i].w) - s_mean};
-      local_sums[0] += tmp.x * tmp.x + tmp.y * tmp.y + tmp.z * tmp.z + tmp.w * tmp.w;
-    }
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    // s_variance holds the reciprocal standard deviation.
-    s_variance = rsqrtf(local_sums[0] / n + 1e-5);
-  }
-  __syncthreads();
-  // normalize, then scale by gamma and shift by beta
-  #pragma unroll
-  for (int i = 0; i < ITEM_PER_THREAD; i += 1) {
-    const int index = i*bdimx + tid;
-    if (index < n_4){
-      const T4 gamma_val = gamma[index];
-      const T4 beta_val = beta[index];
-      T4 tmp;
-      tmp.x = T((static_cast<float>(local_val[i].x) - s_mean)*s_variance*static_cast<float>(gamma_val.x) + static_cast<float>(beta_val.x));
-      tmp.y = T((static_cast<float>(local_val[i].y) - s_mean)*s_variance*static_cast<float>(gamma_val.y) + static_cast<float>(beta_val.y));
-      tmp.z = T((static_cast<float>(local_val[i].z) - s_mean)*s_variance*static_cast<float>(gamma_val.z) + static_cast<float>(beta_val.z));
-      tmp.w = T((static_cast<float>(local_val[i].w) - s_mean)*s_variance*static_cast<float>(gamma_val.w) + static_cast<float>(beta_val.w));
-      output[index] = tmp;
-    }
-  }
-}
-/**
- * Two-pass layernorm that re-reads the input from memory in every pass, so the
- * row length is not limited by per-thread storage.
- * output [m, n] row-major
- * input [m, n] row-major
- * gamma [n]
- * beta [n]
- * grid(m)
- * block(block_size) -- each block deals with n elements ; each thread strides over the row in steps of block_size
- * The epsilon added to the variance is fixed to 1e-5.
-*/
-template<typename T>
-__global__ void layernorm_twoPassAlgo_e1(T* output,
-                                         const T* input,
-                                         const T* gamma,
-                                         const T* beta,
-                                         const int m,
-                                         const int n)
-{
-  const int m_idx = blockIdx.x;
-  const int tid = threadIdx.x;
-  const int bdimx = blockDim.x;
-  __shared__ float s_mean, s_variance;
-  float local_sums[1] = {0.0f};
-  // 64-bit row offset: m_idx * n can exceed the range of int for large matrices.
-  const long long offset = static_cast<long long>(m_idx) * n;
-  input += offset;
-  output += offset;
-  // pass 1: sum of the row for the mean
-  for (int index = tid ; index < n ; index += bdimx){
-    float local_val = static_cast<float>(input[index]);
-    local_sums[0] += local_val;
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    s_mean = local_sums[0] / n;
-  }
-  __syncthreads();
-  // pass 2: sum of squared deviations from the mean
-  local_sums[0] = 0.0f;
-  for (int index = tid ; index < n ; index += bdimx){
-    float local_val = static_cast<float>(input[index]);
-    local_val = local_val - s_mean;
-    local_sums[0] += local_val * local_val;
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    // s_variance holds the reciprocal standard deviation.
-    s_variance = rsqrtf(local_sums[0] / n + 1e-5);
-  }
-  __syncthreads();
-  // normalize, then scale by gamma and shift by beta
-  for (int index = tid ; index < n ; index += bdimx){
-    const T gamma_val = gamma[index];
-    const T beta_val = beta[index];
-    const T local_val = input[index];
-    output[index] = T((static_cast<float>(local_val) - s_mean) * s_variance * static_cast<float>(gamma_val) + static_cast<float>(beta_val));
-  }
-}
-/**
- * Two-pass layernorm on rows addressed as n/2 two-element vectors (T2); the
- * input is re-read from memory in every pass.
- * output [m, n] row-major
- * input [m, n] row-major
- * gamma [n]
- * beta [n]
- * grid(m)
- * block(block_size) -- each thread strides over the n/2 vectors of a row in steps of block_size
- * Rows are indexed with n/2, so n is expected to be even.
- * The epsilon added to the variance is fixed to 1e-5.
-*/
-template<typename T2, typename T>
-__global__ void layernorm_twoPassAlgo_e2(T2* output,
-                                         const T2* input,
-                                         const T2* gamma,
-                                         const T2* beta,
-                                         const int m,
-                                         const int n)
-{
-  const int m_idx = blockIdx.x;
-  const int tid = threadIdx.x;
-  const int bdimx = blockDim.x;
-  __shared__ float s_mean, s_variance;
-  float local_sums[1] = {0.0f};
-  const int n_2 = n / 2;
-  // 64-bit row offset: m_idx * n_2 can exceed the range of int for large matrices.
-  const long long offset = static_cast<long long>(m_idx) * n_2;
-  input += offset;
-  output += offset;
-  // pass 1: sum of the row for the mean
-  for (int index = tid; index < n_2; index += bdimx) {
-    const T2 local_val = input[index];
-    local_sums[0] += static_cast<float>(local_val.x) + static_cast<float>(local_val.y);
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    s_mean = local_sums[0] / n;
-  }
-  __syncthreads();
-  // pass 2: sum of squared deviations from the mean
-  local_sums[0] = 0.0f;
-  for (int index = tid; index < n_2; index += bdimx) {
-    const T2 local_val = input[index];
-    const float2 tmp = {static_cast<float>(local_val.x) - s_mean,
-                        static_cast<float>(local_val.y) - s_mean};
-    local_sums[0] += tmp.x * tmp.x + tmp.y * tmp.y;
-  }
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    // s_variance holds the reciprocal standard deviation.
-    s_variance = rsqrtf(local_sums[0] / n + 1e-5);
-  }
-  __syncthreads();
-  // normalize, then scale by gamma and shift by beta
-  for (int index = tid; index < n_2; index += bdimx) {
-    const T2 local_val = input[index];
-    const T2 gamma_val = gamma[index];
-    const T2 beta_val = beta[index];
-    T2 tmp;
-    tmp.x = T((static_cast<float>(local_val.x) - s_mean)*s_variance*static_cast<float>(gamma_val.x) + static_cast<float>(beta_val.x));
-    tmp.y = T((static_cast<float>(local_val.y) - s_mean)*s_variance*static_cast<float>(gamma_val.y) + static_cast<float>(beta_val.y));
-    output[index] = tmp;
-  }
-}
-/**
- * Host-side launcher for row-wise layer normalization.
- *
- * Reads the row count m and the column count n from \p tensor_size and launches
- * one thread block per row (grid(m)) on \p stream. The kernel is chosen from n:
- *   - n % 4 == 0 and 128 <= n <= 4096: stored_locally_e4 (4-element vectors)
- *   - n even: stored_locally_e2 with 1 / 4 / 8 / 16 items per thread for
- *     n <= 2048 / 8192 / 16384 / 32768, otherwise twoPassAlgo_e2
- *   - n odd: stored_locally_e1 with 1 / 8 / 16 / 32 items per thread for
- *     n <= 1024 / 8192 / 16384 / 32768, otherwise twoPassAlgo_e1
- * In the vectorized branches every T other than float is reinterpreted as half.
- *
- * Every block size is rounded up to a multiple of 32 and never exceeds 1024.
- *
- * \param tensor_size extent of the tensor: row() is m, column() is n
- * \param ref_output  destination tensor; only data() is used
- * \param ref_input   source tensor; only data() is used
- * \param ref_gamma   scale vector; only data() is used
- * \param ref_beta    shift vector; only data() is used
- * \param stream      CUDA stream the kernel is launched on
- */
-template <typename T>
-void layernorm(cutlass::MatrixCoord tensor_size,
-               TensorRef<T, layout::RowMajor> ref_output,
-               TensorRef<T, layout::RowMajor> ref_input,
-               TensorRef<T, layout::RowMajor> ref_gamma,
-               TensorRef<T, layout::RowMajor> ref_beta,
-               cudaStream_t stream){
-  const int m = tensor_size.row();
-  const int n = tensor_size.column();
-  T* output = ref_output.data();
-  const T* input = ref_input.data();
-  const T* gamma = ref_gamma.data();
-  const T* beta = ref_beta.data();
-  dim3 grid(m);
-  // Default: one thread per element, rounded up to a whole warp, capped at 1024.
-  dim3 block((n + 31)/32*32);
-  if (block.x > 1024){
-    block.x = 1024;
-  }
-  // TODO : There should be better configs for different cases, we only use several samples to show how to use here
-  // TODO : using registers to store values locally can reduce the loads from global memory and speedup the kernels.
-  if ((n % 4 == 0) && (n >= 128) && (n <= 4096)) {
-    block.x = (n/4 + 31)/32*32;
-    if (std::is_same<T, float>::value) {
-      layernorm_twoPassAlgo_stored_locally_e4<float4, float, 1><<<grid, block, 0, stream>>>(
-        (float4*)output, (const float4*)input, (const float4*)gamma, (const float4*)beta, m, n);
-    } // if (std::is_same<T, float>::value)
-    else {
-      layernorm_twoPassAlgo_stored_locally_e4<half4, half, 1><<<grid, block, 0, stream>>>(
-        (half4*)output, (const half4*)input, (const half4*)gamma, (const half4*)beta, m, n);
-    }
-  } //if ((n % 4 == 0) && (n >= 128) && (n <= 4096))
-  else if (n % 2 == 0) {
-    if (n / 2 <= 1024) {
-      block.x = (n/2 + 31)/32*32;
-      if (std::is_same<T, float>::value) {
-        layernorm_twoPassAlgo_stored_locally_e2<float2, float, 1><<<grid, block, 0, stream>>>(
-          (float2*)output, (const float2*)input, (const float2*)gamma, (const float2*)beta, m, n);
-      } //if (std::is_same<T, float>::value)
-      else {
-        layernorm_twoPassAlgo_stored_locally_e2<half2, half, 1><<<grid, block, 0, stream>>>(
-          (half2*)output, (const half2*)input, (const half2*)gamma, (const half2*)beta, m, n);
-      }
-    } // if (n / 2 <= 1024)
-    else if (n <= 8192) {
-      block.x = ((n + 7)/8 + 31)/32*32;
-      if (std::is_same<T, float>::value) {
-        layernorm_twoPassAlgo_stored_locally_e2<float2, float, 4><<<grid, block, 0, stream>>>(
-          (float2*)output, (const float2*)input, (const float2*)gamma, (const float2*)beta, m, n);
-      } // if (std::is_same<T, float>::value)
-      else {
-        layernorm_twoPassAlgo_stored_locally_e2<half2, half, 4><<<grid, block, 0, stream>>>(
-          (half2*)output, (const half2*)input, (const half2*)gamma, (const half2*)beta, m, n);
-      }
-    } // if (n <= 8192)
-    else if (n <= 16384) {
-      block.x = ((n + 15)/ 16 + 31)/32*32;
-      if (std::is_same<T, float>::value) {
-        layernorm_twoPassAlgo_stored_locally_e2<float2, float, 8><<<grid, block, 0, stream>>>(
-          (float2*)output, (const float2*)input, (const float2*)gamma, (const float2*)beta, m, n);
-      } // if (std::is_same<T, float>::value)
-      else {
-        layernorm_twoPassAlgo_stored_locally_e2<half2, half, 8><<<grid, block, 0, stream>>>(
-          (half2*)output, (const half2*)input, (const half2*)gamma, (const half2*)beta, m, n);
-      }
-    } // if (n <= 16384)
-    else if (n <= 32768) {
-      block.x = ((n + 31)/32 + 31)/32*32;
-      if (std::is_same<T, float>::value) {
-        layernorm_twoPassAlgo_stored_locally_e2<float2, float, 16><<<grid, block, 0, stream>>>(
-          (float2*)output, (const float2*)input, (const float2*)gamma, (const float2*)beta, m, n);
-      } // if (std::is_same<T, float>::value)
-      else {
-        layernorm_twoPassAlgo_stored_locally_e2<half2, half, 16><<<grid, block, 0, stream>>>(
-          (half2*)output, (const half2*)input, (const half2*)gamma, (const half2*)beta, m, n);
-      }
-    } // if (n <= 32768)
-    else {
-      if (block.x > 512)
-        block.x = 512;
-      if (std::is_same<T, float>::value) {
-        layernorm_twoPassAlgo_e2<float2, float><<<grid, block, 0, stream>>>(
-          (float2 *)output, (const float2 *)input, (const float2 *)gamma, (const float2 *)beta, m, n);
-      } // if (std::is_same<T, float>::value)
-      else {
-        layernorm_twoPassAlgo_e2<half2, half><<<grid, block, 0, stream>>>(
-          (half2 *)output, (const half2 *)input, (const half2 *)gamma, (const half2 *)beta, m, n);
-      }
-    }
-  } // if (n % 2 == 0)
-  else {
-    if (n <= 1024) {
-      layernorm_twoPassAlgo_stored_locally_e1<T, 1><<<grid, block, 0, stream>>>(
-        output, input, gamma, beta, m, n);
-    } // if (n <= 1024)
-    else if (n <= 8192) {
-      block.x = ((n + 7)/8 + 31)/32*32;
-      layernorm_twoPassAlgo_stored_locally_e1<T, 8><<<grid, block, 0, stream>>>(
-        output, input, gamma, beta, m, n);
-    } // if (n <= 8192)
-    else if (n <= 16384) {
-      // Round up with + 31, not + 32: when (n + 15)/16 is already 1024 the
-      // larger addend would yield 1056 threads, which exceeds the 1024-thread
-      // block limit and makes the launch fail.
-      block.x = ((n + 15)/16 + 31)/32*32;
-      layernorm_twoPassAlgo_stored_locally_e1<T, 16><<<grid, block, 0, stream>>>(
-        output, input, gamma, beta, m, n);
-    } // if (n <= 16384)
-    else if (n <= 32768) {
-      block.x = ((n + 31)/32 + 31)/32*32;
-      layernorm_twoPassAlgo_stored_locally_e1<T, 32><<<grid, block, 0, stream>>>(
-        output, input, gamma, beta, m, n);
-    } // if (n <= 32768)
-    else{
-      if (block.x > 512) {
-        block.x = 512;
-      }
-      layernorm_twoPassAlgo_e1<<<grid, block, 0, stream>>>(
-        output, input, gamma, beta, m, n);
-    }
-  }
-}
-} //namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_memory.h DELETED Viewed

@@ -1,375 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * \brief C++ interface to CUDA device memory management functions.
- */
-#include <memory>
-#include <sstream>
-#include "cutlass/platform/platform.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/trace.h"
-#include "exceptions.h"
-namespace cutlass {
-namespace device_memory {
-/******************************************************************************
- * Allocation lifetime
- ******************************************************************************/
-/// Allocate a buffer of \p count elements of type \p T on the current CUDA device
-///
-/// The size in bytes is derived from sizeof_bits<T> and rounded up to a whole
-/// byte, so element types narrower than a byte never get a buffer that is too
-/// small (or zero-sized) for a non-zero \p count.
-///
-/// \param count number of elements to allocate (defaults to 1)
-/// \return pointer returned by cudaMalloc
-/// \throws cuda_exception when cudaMalloc reports an error
-template <typename T>
-T* allocate(size_t count = 1) {
-  T* ptr = nullptr;
-  // Round the bit count up so a trailing partial byte is still allocated.
-  size_t bytes = (count * sizeof_bits<T>::value + 7) / 8;
-  cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);
-  if (cuda_error != cudaSuccess) {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 0)
-    std::ostringstream os;
-    os << "cutlass::device_memory::allocate: cudaMalloc failed: bytes=" << bytes;
-    CUTLASS_TRACE_HOST(os.str());
-#endif
-    throw cuda_exception("Failed to allocate memory", cuda_error);
-  }
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-  else {
-    std::ostringstream os;
-    os << "cutlass::device_memory::allocate: Successful cudaMalloc: bytes=" << bytes;
-    CUTLASS_TRACE_HOST(os.str());
-  }
-#endif
-  return ptr;
-}
-/// Free the device buffer pointed to by \p ptr; a null pointer is ignored.
-/// \throws cuda_exception when cudaFree reports an error
-template <typename T>
-void free(T* ptr) {
-  if (!ptr) {
-    return;
-  }
-  cudaError_t const status = cudaFree(ptr);
-  if (status == cudaSuccess) {
-    return;
-  }
-  throw cuda_exception("Failed to free device memory", status);
-}
-/******************************************************************************
- * Data movement
- ******************************************************************************/
-/// Copy \p count elements of type \p T from \p src to \p dst with cudaMemcpy.
-///
-/// The byte count is count * sizeof_bits<T> / 8; if that truncates to zero for
-/// a non-zero \p count, one byte is copied instead.
-///
-/// \param kind direction passed through to cudaMemcpy
-/// \throws cuda_exception carrying a message with both pointers, the sizes,
-///         the copy kind and the CUDA error string when cudaMemcpy fails
-template <typename T>
-void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) {
-  size_t bytes = count * sizeof_bits<T>::value / 8;
-  if (count > 0 && bytes == 0) {
-    bytes = 1;
-  }
-  cudaError_t const status = cudaMemcpy(dst, src, bytes, kind);
-  if (status == cudaSuccess) {
-    return;
-  }
-
-  // Spell out the copy direction for the diagnostic.
-  char const* kind_name;
-  switch (kind) {
-    case cudaMemcpyHostToDevice:   kind_name = "cudaMemcpyHostToDevice";   break;
-    case cudaMemcpyDeviceToHost:   kind_name = "cudaMemcpyDeviceToHost";   break;
-    case cudaMemcpyDeviceToDevice: kind_name = "cudaMemcpyDeviceToDevice"; break;
-    case cudaMemcpyHostToHost:     kind_name = "cudaMemcpyHostToHost";     break;
-    case cudaMemcpyDefault:        kind_name = "cudaMemcpyDefault";        break;
-    default:                       kind_name = "Unknown";                  break;
-  }
-
-  std::ostringstream message;
-  message << "cutlass::device_memory::copy: cudaMemcpy() failed: "
-          << "dst=" << dst << ", src=" << src
-          << ", bytes=" << bytes << ", count=" << count
-          << ", kind=" << kind_name
-          << ", error: " << cudaGetErrorString(status);
-  throw cuda_exception(message.str().c_str(), status);
-}
-/// Copies \p count elements from \p src to \p dst using cudaMemcpyHostToDevice.
-template <typename T>
-void copy_to_device(T* dst, T const* src, size_t count = 1) {
-  copy(dst, src, count, cudaMemcpyHostToDevice);
-}
-/// Copies \p count elements from \p src to \p dst using cudaMemcpyDeviceToHost.
-template <typename T>
-void copy_to_host(T* dst, T const* src, size_t count = 1) {
-  copy(dst, src, count, cudaMemcpyDeviceToHost);
-}
-/// Copies \p count elements from \p src to \p dst using cudaMemcpyDeviceToDevice.
-template <typename T>
-void copy_device_to_device(T* dst, T const* src, size_t count = 1) {
-  copy(dst, src, count, cudaMemcpyDeviceToDevice);
-}
-/// Copies \p count elements from \p src to \p dst using cudaMemcpyHostToHost.
-template <typename T>
-void copy_host_to_host(T* dst, T const* src, size_t count = 1) {
-  copy(dst, src, count, cudaMemcpyHostToHost);
-}
-/// Copies elements from device memory to host-side range
-///
-/// The element count is end - begin and the destination is the address of
-/// *begin, so the iterators must support subtraction and refer to contiguous
-/// storage. An empty range is a no-op.
-///
-/// \param begin        first host element to overwrite
-/// \param end          one past the last host element to overwrite
-/// \param device_begin first device element to read
-template <typename OutputIterator, typename T>
-void insert_to_host(OutputIterator begin, OutputIterator end, T const* device_begin) {
-  if (begin == end) {
-    // Nothing to copy; also avoids dereferencing an end iterator below.
-    return;
-  }
-  size_t elements = end - begin;
-  copy_to_host(&*begin, device_begin, elements);
-}
-/// Copies elements to device memory from host-side range
-///
-/// The element count is end - begin and the source is the address of *begin,
-/// so the iterators must support subtraction and refer to contiguous storage.
-/// An empty range is a no-op.
-///
-/// \param device_begin first device element to overwrite
-/// \param begin        first host element to read
-/// \param end          one past the last host element to read
-template <typename T, typename InputIterator>
-void insert_to_device(T* device_begin, InputIterator begin, InputIterator end) {
-  if (begin == end) {
-    // Nothing to copy; also avoids dereferencing an end iterator below.
-    return;
-  }
-  size_t elements = end - begin;
-  copy_to_device(device_begin, &*begin, elements);
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace device_memory
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Owning handle for a device buffer of \p T elements.
-///
-/// Pairs a unique_ptr (whose deleter calls cudaFree) with the element count it
-/// was created for. Copying allocates a new buffer and copies device-to-device;
-/// moving swaps the pointer and the capacity.
-template <typename T>
-class DeviceAllocation {
-public:
-  /// Delete functor for CUDA device memory
-  struct deleter {
-    void operator()(T* ptr) {
-      cudaError_t cuda_error = (cudaFree(ptr));
-      if (cuda_error != cudaSuccess) {
-        // noexcept
-        //                throw cuda_exception("cudaFree() failed", cuda_error);
-        return;
-      }
-    }
-  };
-public:
-  //
-  // Data members
-  //
-  /// Number of elements of T allocated on the current CUDA device
-  size_t capacity;
-  /// Smart pointer
-  platform::unique_ptr<T, deleter> smart_ptr;
-public:
-  //
-  // Static methods
-  //
-  /// Static member to compute the number of bytes needed for a given number of elements
-  // NOTE(review): for sub-byte types the division truncates, so an element
-  // count that does not fill a whole byte reports one byte too few - confirm
-  // whether callers rely on this before changing it.
-  static size_t bytes(size_t elements) {
-    if (sizeof_bits<T>::value < 8) {
-      size_t const kElementsPerByte = 8 / sizeof_bits<T>::value;
-      return elements / kElementsPerByte;
-    }
-    else {
-      size_t const kBytesPerElement = sizeof_bits<T>::value / 8;
-      return elements * kBytesPerElement;
-    }
-  }
-public:
-  //
-  // Methods
-  //
-  // Initializer lists below follow the declaration order (capacity, then
-  // smart_ptr), which is the order the members are actually initialized in.
-  /// Constructor: allocates no memory
-  DeviceAllocation() : capacity(0) {}
-  /// Constructor: allocates \p capacity elements on the current CUDA device
-  DeviceAllocation(size_t _capacity) :
-    capacity(_capacity), smart_ptr(device_memory::allocate<T>(_capacity)) {}
-  /// Constructor: takes ownership of an existing allocation of \p _capacity elements
-  DeviceAllocation(T *ptr, size_t _capacity) : capacity(_capacity), smart_ptr(ptr) {}
-  /// Copy constructor: allocates a new buffer and copies the contents device-to-device
-  DeviceAllocation(DeviceAllocation const &p):
-    capacity(p.capacity), smart_ptr(device_memory::allocate<T>(p.capacity)) {
-    device_memory::copy_device_to_device(smart_ptr.get(), p.get(), capacity);
-  }
-  /// Move constructor: takes over the buffer of \p p and leaves \p p empty
-  DeviceAllocation(DeviceAllocation &&p): capacity(0) {
-    std::swap(smart_ptr, p.smart_ptr);
-    std::swap(capacity, p.capacity);
-  }
-  /// Destructor
-  ~DeviceAllocation() { reset(); }
-  /// Returns a pointer to the managed object
-  T* get() const { return smart_ptr.get(); }
-  /// Releases the ownership of the managed object (without deleting) and resets capacity to zero
-  T* release() {
-    capacity = 0;
-    return smart_ptr.release();
-  }
-  /// Deletes the managed object and resets capacity to zero
-  void reset() {
-    capacity = 0;
-    smart_ptr.reset();
-  }
-  /// Deletes managed object, if owned, and allocates a new object
-  void reset(size_t _capacity) {
-    reset(device_memory::allocate<T>(_capacity), _capacity);
-  }
-  /// Deletes managed object, if owned, and replaces its reference with a given pointer and capacity
-  void reset(T* _ptr, size_t _capacity) {
-    smart_ptr.reset(_ptr);
-    capacity = _capacity;
-  }
-  /// Allocates a new buffer and copies the old buffer into it. The old buffer is then released.
-  /// Only min(new_capacity, capacity) elements are carried over.
-  void reallocate(size_t new_capacity) {
-    platform::unique_ptr<T, deleter> new_allocation(device_memory::allocate<T>(new_capacity));
-    device_memory::copy_device_to_device(
-      new_allocation.get(),
-      smart_ptr.get(),
-      std::min(new_capacity, capacity));
-    // After the swaps new_allocation owns the old buffer and frees it on scope exit.
-    std::swap(smart_ptr, new_allocation);
-    std::swap(new_capacity, capacity);
-  }
-  /// Returns the number of elements
-  size_t size() const {
-    return capacity;
-  }
-  /// Returns the number of bytes needed to store the allocation
-  size_t bytes() const {
-    return bytes(capacity);
-  }
-  /// Returns a pointer to the object owned by *this
-  T* operator->() const { return smart_ptr.get(); }
-  /// Returns the deleter object which would be used for destruction of the managed object.
-  deleter& get_deleter() { return smart_ptr.get_deleter(); }
-  /// Returns the deleter object which would be used for destruction of the managed object (const)
-  const deleter& get_deleter() const { return smart_ptr.get_deleter(); }
-  /// Copies a device-side memory allocation
-  ///
-  /// The buffer is reallocated only when the capacities differ; the contents
-  /// of \p p are then copied device-to-device.
-  DeviceAllocation & operator=(DeviceAllocation const &p) {
-    if (this == &p) {
-      // Self-assignment: skip the copy of a buffer onto itself.
-      return *this;
-    }
-    if (capacity != p.capacity) {
-      smart_ptr.reset(device_memory::allocate<T>(p.capacity));
-      capacity = p.capacity;
-    }
-    device_memory::copy_device_to_device(smart_ptr.get(), p.get(), capacity);
-    return *this;
-  }
-  /// Move assignment: swaps buffer and capacity with \p p
-  DeviceAllocation & operator=(DeviceAllocation && p) {
-    std::swap(smart_ptr, p.smart_ptr);
-    std::swap(capacity, p.capacity);
-    return *this;
-  }
-  /// Copies the entire allocation from another location in device memory.
-  void copy_from_device(T const *ptr) const {
-    copy_from_device(ptr, capacity);
-  }
-  /// Copies a given number of elements from device memory
-  void copy_from_device(T const *ptr, size_t elements) const {
-    device_memory::copy_device_to_device(get(), ptr, elements);
-  }
-  /// Copies the entire allocation to another location in device memory.
-  void copy_to_device(T *ptr) const {
-    copy_to_device(ptr, capacity);
-  }
-  /// Copies a given number of elements to another location in device memory
-  void copy_to_device(T *ptr, size_t elements) const {
-    device_memory::copy_device_to_device(ptr, get(), elements);
-  }
-  /// Fills the entire allocation from host memory.
-  void copy_from_host(T const *ptr) const {
-    copy_from_host(ptr, capacity);
-  }
-  /// Copies a given number of elements from host memory into the allocation
-  void copy_from_host(T const *ptr, size_t elements) const {
-    device_memory::copy_to_device(get(), ptr, elements);
-  }
-  /// Copies the entire allocation to host memory.
-  void copy_to_host(T *ptr) const {
-    copy_to_host(ptr, capacity);
-  }
-  /// Copies a given number of elements from the allocation to host memory
-  void copy_to_host(T *ptr, size_t elements) const {
-    device_memory::copy_to_host(ptr, get(), elements);
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace device_memory {
-/// Alias that exposes cutlass::DeviceAllocation<T> (an owning device buffer that
-/// records its element capacity) under the device_memory namespace.
-template <typename T>
-using allocation = cutlass::DeviceAllocation<T>;
-}  // namespace device_memory
-/////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_nchw_to_nhwc.h DELETED Viewed

@@ -1,141 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * \brief cuda kernels to transform a device memory tensor from NCHW layout to NHWC layout.
- */
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/tensor_ref.h"
-namespace cutlass {
-/** \brief interface to transform a device memory tensor from NCHW layout to NHWC layout.
- * \tparam T: data type
- * \param input_tensor_size  extent of the input; the definition asserts that its
- *        n/c/h/w accessors equal the output's n/h/w/c accessors respectively
- * \param output_tensor_size extent of the output; n, h, w and c are read from it
- * \param ref_input  NCHW source tensor; only data() is used
- * \param ref_output NHWC destination tensor; only data() is used
- * \param stream     CUDA stream the kernel is launched on
- */
-template <typename T>
-void nchw_to_nhwc(cutlass::Tensor4DCoord input_tensor_size,
-                  cutlass::Tensor4DCoord output_tensor_size,
-                  TensorRef<T, layout::TensorNCHW> ref_input,
-                  TensorRef<T, layout::TensorNHWC> ref_output,
-                  cudaStream_t stream);
-/// Transposes one 32 (channels) x 32 (spatial positions) tile of an NCHW tensor
-/// into NHWC order through shared memory.
-///
-/// blockIdx.z selects the image, blockIdx.y the 32-channel tile and blockIdx.x
-/// the 32-position tile of the flattened h*w plane. The loops step by 8 four
-/// times, i.e. the kernel is written for 8 warps of 32 threads (the host
-/// wrapper launches it with a 32x8 block).
-///
-/// Global offsets are computed in size_t so tensors with 2^31 or more elements
-/// do not overflow 32-bit arithmetic. The parameter n is not read.
-template <typename T>
-__global__ void nchw_to_nhwc_kernel(T *output,
-                                    const T *input,
-                                    const int n,
-                                    const int h,
-                                    const int w,
-                                    const int c) {
-  const int hw = h*w;
-  // Elements per image, widened before the multiply.
-  const size_t chw = static_cast<size_t>(c) * hw;
-  // Tile rows are 33 elements apart rather than 32 (presumably to avoid
-  // shared-memory bank conflicts on the transposed access).
-  __shared__ T shbuf[32 * (32 + 1)];
-  const int32_t tid  = threadIdx.y*blockDim.x + threadIdx.x;
-  const int32_t wid  = tid / 32;
-  const int32_t lid  = tid % 32;
-  const int32_t ni   = blockIdx.z;
-  const int32_t ci0  = blockIdx.y * 32;
-  const int32_t hwi0 = blockIdx.x * 32;
-  const size_t input_idx = ni * chw + static_cast<size_t>(ci0 + wid) * hw + hwi0;
-  const T *A = input + input_idx;
-  // Load phase: lane = spatial position, warp = channel (stepping by 8).
-  if (hwi0 + lid < hw) {
-    const int lid_x_33 = lid * 33;
-    if ((ci0 + 32) <= c) {
-      // Full channel tile: no per-channel bounds check needed.
-      int ci = wid;  // between 0 and 7
-      CUTLASS_PRAGMA_UNROLL
-      for (int cLoopIdx = 0; cLoopIdx < 4; cLoopIdx++) {
-        shbuf[lid_x_33 + ci] = A[lid];
-        A                    = &A[8 * hw];
-        ci += 8;
-      }
-    } else {
-      // Partial channel tile: skip channels past c.
-      for (int ci = wid; ci < 32; ci += 8) {
-        if ((ci + ci0) < c) {
-          shbuf[lid_x_33 + ci] = A[lid];
-        }
-        A = &A[8 * hw];
-      }
-    }
-  }
-  __syncthreads();
-  // Store phase: lane = channel, warp = spatial position (stepping by 8).
-  const int32_t ciOut = ci0 + lid;
-  output = &output[ni * chw + ciOut];
-  if (ciOut < c) {
-    // <= so that a tile ending exactly at hw also takes the unchecked path,
-    // matching the channel-tile test above.
-    if (hwi0 + 32 <= hw) {
-      int hwI = wid;
-      CUTLASS_PRAGMA_UNROLL
-      for (int hwLoopIdx = 0; hwLoopIdx < 4; ++hwLoopIdx) {
-        output[static_cast<size_t>(hwi0 + hwI) * c] = shbuf[(hwI)*33 + lid];
-        hwI += 8;
-      }
-    } else {
-      for (int hwI = wid; hwI < 32; hwI += 8) {
-        if (hwi0 + hwI < hw) {
-          output[static_cast<size_t>(hwi0 + hwI) * c] = shbuf[(hwI)*33 + lid];
-        }
-      }
-    }
-  }
-}
-/// Launches nchw_to_nhwc_kernel on \p stream for the whole tensor.
-///
-/// The extents n, h, w and c are taken from \p output_tensor_size. The grid has
-/// one block per 32-position tile of h*w (x), per 32-channel tile (y) and per
-/// image (z); each block is 32x8 threads.
-template <typename T>
-void nchw_to_nhwc(cutlass::Tensor4DCoord input_tensor_size,
-                  cutlass::Tensor4DCoord output_tensor_size,
-                  TensorRef<T, layout::TensorNCHW> ref_input,
-                  TensorRef<T, layout::TensorNHWC> ref_output,
-                  cudaStream_t stream) {
-  // The input extent is compared accessor-by-accessor against the permuted
-  // output extent.
-  assert(
-    input_tensor_size.n() == output_tensor_size.n() &&
-    input_tensor_size.c() == output_tensor_size.h() &&
-    input_tensor_size.h() == output_tensor_size.w() &&
-    input_tensor_size.w() == output_tensor_size.c());
-
-  int const batch    = output_tensor_size.n();
-  int const height   = output_tensor_size.h();
-  int const width    = output_tensor_size.w();
-  int const channels = output_tensor_size.c();
-
-  int const spatial_tiles = (height*width + 31)/32;
-  int const channel_tiles = (channels + 31)/32;
-
-  dim3 block(32, 8);
-  dim3 grid(spatial_tiles, channel_tiles, batch);
-  nchw_to_nhwc_kernel<<<grid, block, 0, stream>>>(
-    ref_output.data(), ref_input.data(), batch, height, width, channels);
-}

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_nhwc_padding.h DELETED Viewed

@@ -1,276 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * \brief cuda kernels for padding in device memory with NHWC layout.
- */
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/tensor_ref.h"
-namespace cutlass {
-/** \brief interface for padding in a device memory tensor with NHWC layout
- * \tparam T: data type
- * \param input_tensor_size  extent of the input; the definition asserts that n, h
- *        and w match the output and that its channel count does not exceed the output's
- * \param output_tensor_size extent of the padded output
- * \param ref_input  NHWC source tensor
- * \param ref_output NHWC destination tensor
- * \param stream     CUDA stream used for the kernel launch
- */
-template <typename T>
-void nhwc_padding(cutlass::Tensor4DCoord input_tensor_size,
-                  cutlass::Tensor4DCoord output_tensor_size,
-                  TensorRef<T, layout::TensorNHWC> ref_input,
-                  TensorRef<T, layout::TensorNHWC> ref_output,
-                  cudaStream_t stream);
-/// Generic channel-padding kernel: writes an n*h*w*c_out NHWC tensor whose first
-/// c_in channels of every pixel come from \p input and whose remaining channels
-/// are set to \p zero.
-///
-/// Each thread starts at its global index and advances by the total number of
-/// launched threads (blockDim.x * gridDim.x) until all output elements are done.
-///
-/// NOTE(review): every index is int32_t, so an output with 2^31 or more
-/// elements would overflow - confirm callers stay below that.
-template <typename T>
-__global__ void nhwc_padding_kernel(const int32_t n,
-                                    const int32_t h,
-                                    const int32_t w,
-                                    const int32_t c_in,
-                                    const int32_t c_out,
-                                    const T zero,
-                                    const T *input,
-                                    T *output){
-  const int32_t total_elements = n * h * w * c_out;
-  const int32_t stride         = blockDim.x * gridDim.x;
-  const int32_t first          = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int32_t out_idx = first; out_idx < total_elements; out_idx += stride) {
-    const int32_t channel = out_idx%c_out;
-    if (channel < c_in){
-      // Recover (n, h, w) from the output pixel index and re-linearize it with
-      // the input's channel count.
-      const int32_t pixel = out_idx/c_out;
-      const int32_t col   = pixel%w;
-      const int32_t rows  = pixel/w;
-      const int32_t row   = rows%h;
-      const int32_t batch = rows/h;
-      const int32_t in_idx = ((batch * h + row) * w + col) * c_in + channel;
-      output[out_idx] = input[in_idx];
-    }
-    else{
-      // Padding channel.
-      output[out_idx] = zero;
-    }
-  }
-}
-// fast kernel for c_in = 3 & c_out = 4
-//
-// Tio is the vector type used for global loads/stores and holds element_in_Tio
-// values of type Telement. Each block stages 192 input vectors in shared memory
-// and writes up to 256 output vectors (192 * 4 / 3), inserting zero_element as
-// every fourth element.
-//
-// shm is indexed by threadIdx.x, so the block must not have more than 192
-// threads. The parameters n, h and w are not read.
-//   max_input_element  - number of Tio vectors in the input; reads past it yield zero_io
-//   max_output_element - number of Tio vectors in the output; writes past it are skipped
-template <typename Tio, typename Telement, int element_in_Tio>
-__global__ void nhwc_padding_channel_3To4_kernel(const int32_t n,
-                                                 const int32_t h,
-                                                 const int32_t w,
-                                                 const Tio *input,
-                                                 Tio *output,
-                                                 const int32_t max_output_element,
-                                                 const int32_t max_input_element,
-                                                 const Tio zero_io,
-                                                 const Telement zero_element){
-  __shared__ Tio shm[192];
-  const int tidx = blockIdx.x * 192 + threadIdx.x;
-  const int threadidx = threadIdx.x;
-  // Stage this block's slice of the input; out-of-range slots are zero-filled.
-  shm[threadIdx.x] = tidx >= max_input_element ? zero_io : input[tidx];
-  __syncthreads();
-  const int output_offset = blockIdx.x * 256;
-  // Exclusive upper bound of the output vectors this block may write.
-  const int lower_bound = max_output_element < output_offset + 256 ? max_output_element : output_offset + 256;
-  // i walks the global output vectors, j the same vectors relative to the block.
-  for (int i = output_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192)
-  {
-    // Output vector j holds 3/4 * element_in_Tio real values, so its source
-    // starts at that element offset in the staged input.
-    const Telement* shm_element = (const Telement*)shm + j*3*element_in_Tio/4;
-    Telement array[element_in_Tio];
-    CUTLASS_PRAGMA_UNROLL
-    for (int k = 0 ; k < element_in_Tio ; k++)
-      // Positions 3, 7, ... are padding; positions after the first padding slot
-      // read one source element earlier.
-      array[k] = ((k+1)%4 == 0) ? zero_element : shm_element[(k > 3) ? (k - 1) : k];
-    output[i] = *((const Tio *)array);
-  }
-}
-// fast kernel for c_in = 3 & c_out = 8
-//
-// Tio is the vector type used for global loads/stores and holds element_in_Tio
-// values of type Telement. Each block stages 192 input vectors in shared memory
-// and writes up to 512 output vectors.
-//
-// shm is indexed by threadIdx.x, so the block must not have more than 192
-// threads. The parameters n, h and w are not read.
-//   max_input_element  - number of Tio vectors in the input; reads past it yield zero_io
-//   max_output_element - number of Tio vectors in the output; writes past it are skipped
-template <typename Tio, typename Telement, int element_in_Tio>
-__global__ void nhwc_padding_channel_3To8_kernel(const int32_t n,
-                                                 const int32_t h,
-                                                 const int32_t w,
-                                                 const Tio *input,
-                                                 Tio *output,
-                                                 const int32_t max_output_element,
-                                                 const int32_t max_input_element,
-                                                 const Tio zero_io,
-                                                 const Telement zero_element){
-  __shared__ Tio shm[192];
-  const int tidx = blockIdx.x * 192 + threadIdx.x;
-  const int threadidx = threadIdx.x;
-  // Stage this block's slice of the input; out-of-range slots are zero-filled.
-  shm[threadIdx.x] = tidx >= max_input_element ? zero_io : input[tidx];
-  __syncthreads();
-  const int output_offset = blockIdx.x * 512;
-  // Exclusive upper bound of the output vectors this block may write.
-  const int lower_bound = max_output_element < output_offset + 512 ? max_output_element : output_offset + 512;
-  // i walks the global output vectors, j the same vectors relative to the block.
-  for (int i = output_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192)
-  {
-    // With 4 elements per vector two output vectors share one source triple
-    // (j/2); otherwise every output vector has its own triple.
-    const Telement* shm_element = (const Telement*)shm + (element_in_Tio == 4 ? j/2 : j)*3;
-    Telement array[element_in_Tio];
-    //float
-    if (element_in_Tio == 4){
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0 ; k < element_in_Tio ; k++)
-        // Even vectors carry the 3 values plus one zero; odd vectors are all zero.
-        array[k] = ((j % 2) == 1) ? zero_element : ((k >= 3) ? zero_element : shm_element[k]);
-    }
-    //half
-    else{
-      CUTLASS_PRAGMA_UNROLL
-      for (int k = 0 ; k < element_in_Tio ; k++)
-        // First 3 elements are the values, the rest is zero padding.
-        array[k] = (k >= 3) ? zero_element : shm_element[k];
-    }
-    output[i] = *((const Tio *)array);
-  }
-}
-template <typename T>
-void nhwc_padding(cutlass::Tensor4DCoord input_tensor_size,
-                  cutlass::Tensor4DCoord output_tensor_size,
-                  TensorRef<T, layout::TensorNHWC> ref_input,
-                  TensorRef<T, layout::TensorNHWC> ref_output,
-                  cudaStream_t stream){
-  assert(
-    input_tensor_size.n() == output_tensor_size.n() &&
-    input_tensor_size.h() == output_tensor_size.h() &&
-    input_tensor_size.w() == output_tensor_size.w() &&
-    input_tensor_size.c() <= output_tensor_size.c());
-  int n = input_tensor_size.n();
-  int h = input_tensor_size.h();
-  int w = input_tensor_size.w();
-  int c_in = input_tensor_size.c();
-  int c_out = output_tensor_size.c();
-  //case 1 : channel == 3 padding to 4 or 8
-  if ((c_out == 4 || c_out == 8) && c_in == 3 && (n*h*w % 8 == 0)){
-    dim3 block(192);
-    const int nhw = n*h*w;
-    const int nhwc = nhw*c_in;
-    //for half_t
-    if (cutlass::sizeof_bits<T>::value == 16){
-      const int element_in_Tio = 8;
-      const int max_input_element = nhwc/element_in_Tio;
-      const int max_output_element = nhw*c_out/element_in_Tio;
-      const int4 zero_io = {0, 0, 0, 0};
-      const half_t zero_element = static_cast<half_t>(0.0f);
-      dim3 grid((nhwc + 192*element_in_Tio - 1)/(192*element_in_Tio));
-      if (c_out == 4){
-        nhwc_padding_channel_3To4_kernel<int4, half_t, element_in_Tio><<<grid, block, 0, stream>>>
-          (n, h, w,
-          (const int4 *)ref_input.data(),
-          (int4 *)ref_output.data(),
-          max_output_element,
-          max_input_element,
-          zero_io,
-          zero_element);
-      }
-      else if (c_out == 8){
-        nhwc_padding_channel_3To8_kernel<int4, half_t, element_in_Tio><<<grid, block, 0, stream>>>
-          (n, h, w,
-          (const int4 *)ref_input.data(),
-          (int4 *)ref_output.data(),
-          max_output_element,
-          max_input_element,
-          zero_io,
-          zero_element);
-      }
-    }
-    //for float
-    else{
-      const int element_in_Tio = 4;
-      const int max_input_element = nhwc/element_in_Tio;
-      const int max_output_element = nhw*c_out/element_in_Tio;
-      const float4 zero_io = {0.0f, 0.0f, 0.0f, 0.0f};
-      const float zero_element = 0.0f;
-      dim3 grid((nhwc + 192*element_in_Tio - 1)/(192*element_in_Tio));
-      if (c_out == 4){
-        nhwc_padding_channel_3To4_kernel<float4, float, element_in_Tio><<<grid, block, 0, stream>>>
-          (n, h, w,
-          (const float4 *)ref_input.data(),
-          (float4 *)ref_output.data(),
-          max_output_element,
-          max_input_element,
-          zero_io,
-          zero_element);
-      }
-      else if (c_out == 8){
-        nhwc_padding_channel_3To8_kernel<float4, float, element_in_Tio><<<grid, block, 0, stream>>>
-          (n, h, w,
-          (const float4 *)ref_input.data(),
-          (float4 *)ref_output.data(),
-          max_output_element,
-          max_input_element,
-          zero_io,
-          zero_element);
-      }
-    }
-  }
-  //case 2 : even channel
-  else if ((c_out % 2) == 0 && (c_in % 2) == 0){
-    int32_t total_elements = n * h * w * c_out / 2;
-    int block_size = 256;
-    dim3 grid((total_elements + 255)/256);
-    dim3 block(block_size);
-    //for half_t
-    if (cutlass::sizeof_bits<T>::value == 16){
-      const __half2 zero  = {0.0f, 0.0f};
-      nhwc_padding_kernel<<<grid, block, 0, stream>>>(n, h, w, c_in/2, c_out/2, zero, (const __half2*)ref_input.data(), (__half2*)ref_output.data());
-    }
-    //for float
-    else{
-      const float2 zero  = {0.0f, 0.0f};
-      nhwc_padding_kernel<<<grid, block, 0, stream>>>(n, h, w, c_in/2, c_out/2, zero, (const float2*)ref_input.data(), (float2*)ref_output.data());
-    }
-  }
-  //case 3 : odd channel
-  else{
-    int32_t total_elements = n * h * w * c_out;
-    int block_size = 256;
-    dim3 grid((total_elements + 255)/256);
-    dim3 block(block_size);
-    const T zero = static_cast<T>(0.0f);
-    nhwc_padding_kernel<<<grid, block, 0, stream>>>(n, h, w, c_in, c_out, zero, ref_input.data(), ref_output.data());
-  }
-}
-} //namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_nhwc_pooling.h DELETED Viewed

@@ -1,573 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * \brief cuda kernels to do avg/max pooling on a device memory tensor with NHWC layout.
- */
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/tensor_ref.h"
-#include "device_utils.h"
-#include <cfloat>
-namespace cutlass {
-/** \brief interface to do avg/max pooling on a device memory tensor with NHWC layout.
- * \tparam T: data type
- */
-template <typename T>
-void pooling_nhwc(cutlass::Tensor4DCoord input_tensor_size,
-                  cutlass::Tensor4DCoord filter_tensor_size,
-                  cutlass::Tensor4DCoord output_tensor_size,
-                  cutlass::MatrixCoord padding,
-                  cutlass::MatrixCoord stride,
-                  TensorRef<T, layout::TensorNHWC> ref_input,
-                  TensorRef<T, layout::TensorNHWC> ref_output,
-                  int poolingType, //0 for avg pooling ; 1 for max pooling
-                  cudaStream_t stream);
-/** get the output size of pooling
- */
-inline int getOutputSize(int H_W, int padding, int kernel_size, int stride)
-{
-    return (H_W + 2 * padding - kernel_size) / stride + 1;
-}
-/**
- * input is [N, H, W, C]
- * assume stride == kernel_size
- * output_h = (H + 2*padding_H - kernel_H)/stride_H
- * output_w = (W + 2*padding_W - kernel_W)/stride_W
- * output is [N, output_h, output_w, C]
- * grid(N, output_h, output_w)
- * block(min(C, 256)) :
- * each block deals with C elements of output when each thread deals with ((C + 255)/256 element of output)
-*/
-template<typename T, bool IS_AVG_POOLING>
-__global__ void pooling_nhwc_element1_kernel(T* output,
-                                             const T* input,
-                                             const int N,
-                                             const int H,
-                                             const int W,
-                                             const int C,
-                                             const int output_H,
-                                             const int output_W,
-                                             const int kernel_H,
-                                             const int kernel_W,
-                                             const int stride_H,
-                                             const int stride_W,
-                                             const int padding_H,
-                                             const int padding_W)
-{
-  const int tid = threadIdx.x;
-  const int n_idx = blockIdx.x;
-  const int output_h_idx = blockIdx.y;
-  const int output_w_idx = blockIdx.z;
-  int h_start_idx = output_h_idx * stride_H - padding_H;
-  int h_end_idx = h_start_idx + kernel_H;
-  h_start_idx = (h_start_idx < 0) ? 0 : h_start_idx;
-  h_end_idx = h_end_idx > H ? H : h_end_idx;
-  int w_start_idx = output_w_idx * stride_W - padding_W;
-  int w_end_idx = w_start_idx + kernel_W;
-  w_start_idx = (w_start_idx < 0) ? 0 : w_start_idx;
-  w_end_idx = w_end_idx > W ? W : w_end_idx;
-  input += n_idx * H * W * C;
-  output += ((n_idx * output_H + output_h_idx) * output_W + output_w_idx) * C;
-  const int kernel_size2 = kernel_H * kernel_W;
-  for (int c_idx = tid; c_idx < C; c_idx += blockDim.x) {
-    float pooling;
-    if (IS_AVG_POOLING){
-      pooling = 0.0f;
-    }
-    else{
-      pooling = -FLT_MAX;
-    }
-    for (int h = h_start_idx; h < h_end_idx; h++) {
-      for (int w = w_start_idx; w < w_end_idx; w++) {
-        const int idx = (h * W + w) * C;
-        const float tmp = static_cast<float>(input[idx + c_idx]);
-        if (IS_AVG_POOLING){
-          pooling = pooling + tmp;
-        }
-        else{
-          pooling = pooling > tmp ? pooling : tmp;
-        }
-      }
-    }
-    T output_val;
-    if (IS_AVG_POOLING){
-      output_val = T(pooling/kernel_size2);
-    }
-    else{
-      output_val = T(pooling);
-    }
-    output[c_idx] = output_val;
-  }
-}
-template<typename T2, typename T, bool IS_AVG_POOLING>
-__global__ void pooling_nhwc_element2_kernel(T2* output,
-                                             const T2* input,
-                                             const int N,
-                                             const int H,
-                                             const int W,
-                                             const int C,
-                                             const int output_H,
-                                             const int output_W,
-                                             const int kernel_H,
-                                             const int kernel_W,
-                                             const int stride_H,
-                                             const int stride_W,
-                                             const int padding_H,
-                                             const int padding_W)
-{
-  const int tid = threadIdx.x;
-  const int n_idx = blockIdx.x;
-  const int output_h_idx = blockIdx.y;
-  const int output_w_idx = blockIdx.z;
-  int h_start_idx = output_h_idx * stride_H - padding_H;
-  int h_end_idx = h_start_idx + kernel_H;
-  h_start_idx = (h_start_idx < 0) ? 0 : h_start_idx;
-  h_end_idx = h_end_idx > H ? H : h_end_idx;
-  int w_start_idx = output_w_idx * stride_W - padding_W;
-  int w_end_idx = w_start_idx + kernel_W;
-  w_start_idx = (w_start_idx < 0) ? 0 : w_start_idx;
-  w_end_idx = w_end_idx > W ? W : w_end_idx;
-  input += n_idx * H * W * C;
-  output += ((n_idx * output_H + output_h_idx) * output_W + output_w_idx) * C;
-  const int kernel_size2 = kernel_H * kernel_W;
-  for (int c_idx = tid; c_idx < C; c_idx += blockDim.x) {
-    float2 pooling;
-    if (IS_AVG_POOLING) {
-      pooling = {0.0f, 0.0f};
-    }
-    else {
-      pooling = {-FLT_MAX, -FLT_MAX};
-    }
-    for (int h = h_start_idx; h < h_end_idx; h++) {
-      for (int w = w_start_idx; w < w_end_idx; w++) {
-        const int idx = (h * W + w) * C;
-        const T2 tmp = input[idx + c_idx];
-        const float2 tmp_flt2 = {static_cast<float>(tmp.x), static_cast<float>(tmp.y)};
-        if (IS_AVG_POOLING) {
-          pooling.x += tmp_flt2.x;
-          pooling.y += tmp_flt2.y;
-        }
-        else {
-          pooling.x = pooling.x > tmp_flt2.x ? pooling.x : tmp_flt2.x;
-          pooling.y = pooling.y > tmp_flt2.y ? pooling.y : tmp_flt2.y;
-        }
-      }
-    }
-    T2 output_val;
-    if (IS_AVG_POOLING) {
-      output_val.x = T(pooling.x/kernel_size2);
-      output_val.y = T(pooling.y/kernel_size2);
-    }
-    else {
-      output_val.x = T(pooling.x);
-      output_val.y = T(pooling.y);
-    }
-    output[c_idx] = output_val;
-  }
-}
-/**
- * output [N, 1, 1, C]
- * input [N, H, W, C]
- * grid(C, N)
- * block(block_size) -- each block deals with H*W/block_size elements;
-*/
-template<typename T, bool IS_AVG_POOLING>
-__global__ void pooling_nxhTo1x1_element1_kernel(
-    T* output, const T* input, const int N, const int HW, const int C)
-{
-    const int c_idx = blockIdx.x;
-    const int n_idx = blockIdx.y;
-    float pooling[1];
-    if (IS_AVG_POOLING) {
-      pooling[0] = 0.0f;
-    }
-    else {
-      pooling[0] = -FLT_MAX;
-    }
-    const size_t input_offset = n_idx * HW * C + c_idx;
-    input += input_offset;
-    const size_t output_offset = n_idx * C + c_idx;
-    output += output_offset;
-    int tid = threadIdx.x;
-    for (int index = tid; index < HW; index += blockDim.x) {
-        float val = static_cast<float>(input[index * C]);
-        if (IS_AVG_POOLING) {
-          pooling[0] += val;
-        }
-        else {
-          pooling[0] = pooling[0] > val ? pooling[0] : val;
-        }
-    }
-    if (blockDim.x <= 32) {
-        if (IS_AVG_POOLING) {
-          warpReduceSum<float, 1>(pooling);
-        }
-        else {
-          warpReduceMax<float, 1>(pooling);
-        }
-    }
-    else {
-        if (IS_AVG_POOLING) {
-          blockReduceSum<float, 1>(pooling);
-        }
-        else {
-          blockReduceMax<float, 1>(pooling);
-        }
-    }
-    __syncthreads();
-    if (threadIdx.x == 0) {
-        T output_val;
-        if (IS_AVG_POOLING) {
-          output_val = T(pooling[0] / HW);
-        }
-        else {
-          output_val = T(pooling[0]);
-        }
-        output[0] = output_val;
-    }
-}
-/**
- * output [N, 1, 1, C]
- * input [N, H, W, C]
- * grid(C/2, N)
- * block(block_size) -- each thread deals with H*W/block_size * 2 elements;
-*/
-template<typename T2, typename T, bool IS_AVG_POOLING>
-__global__ void pooling_nxhTo1x1_element2_kernel(
-    T2* output, const T2* input, const int N, const int HW, const int C)
-{
-    const int c_idx = blockIdx.x;
-    const int n_idx = blockIdx.y;
-    float pooling[2];
-    if (IS_AVG_POOLING) {
-      pooling[0] = pooling[1] = 0.0f;
-    }
-    else {
-      pooling[0] = pooling[1] = -FLT_MAX;
-    }
-    const int C_2 = C / 2;
-    const size_t input_offset = n_idx * HW * C_2 + c_idx;
-    input += input_offset;
-    const size_t output_offset = n_idx * C_2 + c_idx;
-    output += output_offset;
-    int tid = threadIdx.x;
-    for (int index = tid; index < HW; index += blockDim.x) {
-        T2 val = input[index * C_2];
-        float2 val_flt2 = {static_cast<float>(val.x), static_cast<float>(val.y)};
-        if (IS_AVG_POOLING) {
-          pooling[0] += val_flt2.x;
-          pooling[1] += val_flt2.y;
-        }
-        else {
-          pooling[0] = pooling[0] > val_flt2.x ? pooling[0] : val_flt2.x;
-          pooling[1] = pooling[1] > val_flt2.y ? pooling[1] : val_flt2.y;
-        }
-    }
-    if (blockDim.x <= 32) {
-        if (IS_AVG_POOLING) {
-          warpReduceSum<float, 2>(pooling);
-        }
-        else {
-          warpReduceMax<float, 2>(pooling);
-        }
-    }
-    else {
-        if (IS_AVG_POOLING) {
-          blockReduceSum<float, 2>(pooling);
-        }
-        else {
-          blockReduceMax<float, 2>(pooling);
-        }
-    }
-    __syncthreads();
-    if (threadIdx.x == 0) {
-        T2 output_val;
-        if (IS_AVG_POOLING) {
-          output_val.x = T(pooling[0] / HW);
-          output_val.y = T(pooling[1] / HW);
-        }
-        else {
-          output_val.x = T(pooling[0]);
-          output_val.y = T(pooling[1]);
-        }
-        output[0] = output_val;
-    }
-}
-template <typename T>
-void pooling_nhwc(cutlass::Tensor4DCoord input_tensor_size,
-                  cutlass::Tensor4DCoord filter_tensor_size,
-                  cutlass::Tensor4DCoord output_tensor_size,
-                  cutlass::Tensor4DCoord padding,
-                  cutlass::MatrixCoord stride,
-                  TensorRef<T, layout::TensorNHWC> ref_input,
-                  TensorRef<T, layout::TensorNHWC> ref_output,
-                  int poolingType, //0 for avg pooling ; 1 for max pooling
-                  cudaStream_t stream) {
-  assert(input_tensor_size.n() == output_tensor_size.n() &&
-         input_tensor_size.c() == output_tensor_size.c());
-  const int N = input_tensor_size.n();
-  const int H = input_tensor_size.h();
-  const int W = input_tensor_size.w();
-  const int C = input_tensor_size.c();
-  const int padding_H = padding.h();
-  const int padding_W = padding.w();
-  const int kernel_H = filter_tensor_size.h();
-  const int kernel_W = filter_tensor_size.w();
-  const int stride_H = stride.row();
-  const int stride_W = stride.column();
-  const int output_H = getOutputSize(H, padding_H, kernel_H, stride_H);
-  const int output_W = getOutputSize(W, padding_W, kernel_W, stride_W);
-  assert(output_tensor_size.h() == output_H &&
-         output_tensor_size.w() == output_W);
-  if (C % 2 != 0) {
-    if ((H == kernel_H && padding_H == 0) && (W == kernel_W && padding_W == 0)) {
-      dim3 grid(C, N);
-      dim3 block(256);
-      if (H*W < block.x){
-        block.x = (H*W + 31)/32*32;
-      }
-      if (poolingType == 0) {
-        pooling_nxhTo1x1_element1_kernel<T, true><<<grid, block, 0, stream>>>(
-          ref_output.data(),
-          ref_input.data(),
-          N,
-          H*W,
-          C);
-      } // if (poolingType == 0)
-      else {
-        pooling_nxhTo1x1_element1_kernel<T, false><<<grid, block, 0, stream>>>(
-          ref_output.data(),
-          ref_input.data(),
-          N,
-          H*W,
-          C);
-      }
-    } // if ((H == kernel_H && padding_H == 0) && (W == kernel_W && padding_W == 0))
-    else {
-      dim3 grid(N, output_H, output_W);
-      dim3 block(256);
-      if (C < block.x) {
-        block.x = C;
-      }
-      if (poolingType == 0) {
-        pooling_nhwc_element1_kernel<T, true><<<grid, block, 0, stream>>>(
-          ref_output.data(),
-          ref_input.data(),
-          N,
-          H,
-          W,
-          C,
-          output_H,
-          output_W,
-          kernel_H,
-          kernel_W,
-          stride_H,
-          stride_W,
-          padding_H,
-          padding_W);
-      } // if (poolingType == 0)
-      else {
-        pooling_nhwc_element1_kernel<T, false><<<grid, block, 0, stream>>>(
-          ref_output.data(),
-          ref_input.data(),
-          N,
-          H,
-          W,
-          C,
-          output_H,
-          output_W,
-          kernel_H,
-          kernel_W,
-          stride_H,
-          stride_W,
-          padding_H,
-          padding_W);
-      }
-    }
-  } // if (C % 2 != 0))
-  else {
-    if ((H == kernel_H && padding_H == 0) && (W == kernel_W && padding_W == 0)) {
-      dim3 grid(C/2, N);
-      dim3 block(256);
-      if (H*W < block.x){
-        block.x = (H*W + 31)/32*32;
-      }
-      if (poolingType == 0) {
-        if (std::is_same<T, float>::value) {
-          pooling_nxhTo1x1_element2_kernel<float2, float, true><<<grid, block, 0, stream>>>(
-            (float2*)(ref_output.data()),
-            (const float2*)(ref_input.data()),
-            N,
-            H*W,
-            C);
-        } // if (std::is_same<T, float>::value)
-        else {
-          pooling_nxhTo1x1_element2_kernel<half2, half, true><<<grid, block, 0, stream>>>(
-            (half2*)(ref_output.data()),
-            (const half2*)(ref_input.data()),
-            N,
-            H*W,
-            C);
-        }
-      } // if (poolingType == 0)
-      else {
-        if (std::is_same<T, float>::value) {
-          pooling_nxhTo1x1_element2_kernel<float2, float, false><<<grid, block, 0, stream>>>(
-            (float2*)(ref_output.data()),
-            (const float2*)(ref_input.data()),
-            N,
-            H*W,
-            C);
-        } // if (std::is_same<T, float>::value)
-        else {
-          pooling_nxhTo1x1_element2_kernel<half2, half, false><<<grid, block, 0, stream>>>(
-            (half2*)(ref_output.data()),
-            (const half2*)(ref_input.data()),
-            N,
-            H*W,
-            C);
-        }
-      }
-    } // if ((H == kernel_H && padding_H == 0) && (W == kernel_W && padding_W == 0))
-    else {
-      dim3 grid(N, output_H, output_W);
-      dim3 block(256);
-      if (C/2 < block.x) {
-        block.x = C/2;
-      }
-      if (poolingType == 0) {
-        if (std::is_same<T, float>::value) {
-          pooling_nhwc_element2_kernel<float2, float, true><<<grid, block, 0, stream>>>(
-            (float2*)(ref_output.data()),
-            (const float2*)(ref_input.data()),
-            N,
-            H,
-            W,
-            C/2,
-            output_H,
-            output_W,
-            kernel_H,
-            kernel_W,
-            stride_H,
-            stride_W,
-            padding_H,
-            padding_W);
-        } // if (std::is_same<T, float>::value)
-        else {
-          pooling_nhwc_element2_kernel<half2, half, true><<<grid, block, 0, stream>>>(
-            (half2*)(ref_output.data()),
-            (const half2*)(ref_input.data()),
-            N,
-            H,
-            W,
-            C/2,
-            output_H,
-            output_W,
-            kernel_H,
-            kernel_W,
-            stride_H,
-            stride_W,
-            padding_H,
-            padding_W);
-        }
-      } // if (poolingType == 0)
-      else {
-        if (std::is_same<T, float>::value) {
-          pooling_nhwc_element2_kernel<float2, float, false><<<grid, block, 0, stream>>>(
-            (float2*)(ref_output.data()),
-            (const float2*)(ref_input.data()),
-            N,
-            H,
-            W,
-            C/2,
-            output_H,
-            output_W,
-            kernel_H,
-            kernel_W,
-            stride_H,
-            stride_W,
-            padding_H,
-            padding_W);
-        } // if (std::is_same<T, float>::value)
-        else {
-          pooling_nhwc_element2_kernel<half2, half, false><<<grid, block, 0, stream>>>(
-            (half2*)(ref_output.data()),
-            (const half2*)(ref_input.data()),
-            N,
-            H,
-            W,
-            C/2,
-            output_H,
-            output_W,
-            kernel_H,
-            kernel_W,
-            stride_H,
-            stride_W,
-            padding_H,
-            padding_W);
-        }
-      }
-    }
-  }
-}
-} //namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_nhwc_to_nchw.h DELETED Viewed

@@ -1,144 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * \brief cuda kernels to transform a device memory tensor from NHWC layout to NCHW layout.
- */
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/tensor_ref.h"
-namespace cutlass {
-/** \brief interface to transform a device memory tensor from NHWC layout to NCHW layout.
- * \tparam T: data type
- */
-template <typename T>
-void nhwc_to_nchw(cutlass::Tensor4DCoord input_tensor_size,
-                  cutlass::Tensor4DCoord output_tensor_size,
-                  TensorRef<T, layout::TensorNHWC> ref_input,
-                  TensorRef<T, layout::TensorNCHW> ref_output,
-                  cudaStream_t stream);
-template <typename T>
-__global__ void nhwc_to_nchw_kernel(T *output,
-                                    const T *input,
-                                    const int n,
-                                    const int h,
-                                    const int w,
-                                    const int c) {
-  const int hw = h*w;
-  const int hwc = hw*c;
-  __shared__ T shbuf[32 * (32 + 1)];
-  const int32_t tid  = threadIdx.y*blockDim.x + threadIdx.x;
-  const int32_t wid  = tid / 32;
-  const int32_t lid  = tid % 32;
-  const int32_t ni   = blockIdx.z;
-  const int32_t hwi0  = blockIdx.y * 32;
-  const int32_t ci0 = blockIdx.x * 32;
-  const size_t input_idx = ni * hwc + (hwi0 + wid) * c + ci0;
-  const T *A = input + input_idx;
-  if (ci0 + lid < c) {
-    const int lid_x_33 = lid * 33;
-    if ((hwi0 + 32) <= hw) {
-      int hwi = wid;  // between 0 and 7
-      CUTLASS_PRAGMA_UNROLL
-      for (int cLoopIdx = 0; cLoopIdx < 4; cLoopIdx++) {
-        shbuf[lid_x_33 + hwi] = A[lid];
-        A                     = &A[8 * c];
-        hwi += 8;
-      }
-    } else {
-      for (int hwi = wid; hwi < 32; hwi += 8) {
-        if ((hwi + hwi0) < hw) {
-          shbuf[lid_x_33 + hwi] = A[lid];
-        }
-        A = &A[8 * c];
-      }
-    }
-  }
-  __syncthreads();
-  const int32_t hwiOut = hwi0 + lid;
-  output = &output[ni * hwc + hwiOut];
-  if (hwiOut < hw) {
-    if (ci0 + 32 < c) {
-      int cI = wid;
-      CUTLASS_PRAGMA_UNROLL
-      for (int hwLoopIdx = 0; hwLoopIdx < 4; ++hwLoopIdx) {
-        output[(ci0 + cI) * hw] = shbuf[(cI)*33 + lid];
-        cI += 8;
-      }
-    } else {
-      for (int cI = wid; cI < 32; cI += 8) {
-        if (ci0 + cI < c) {
-          output[(ci0 + cI) * hw] = shbuf[(cI)*33 + lid];
-        }
-      }
-    }
-  }
-}
-template <typename T>
-void nhwc_to_nchw(cutlass::Tensor4DCoord input_tensor_size,
-                  cutlass::Tensor4DCoord output_tensor_size,
-                  TensorRef<T, layout::TensorNHWC> ref_input,
-                  TensorRef<T, layout::TensorNCHW> ref_output,
-                  cudaStream_t stream) {
-  assert(
-    input_tensor_size.n() == output_tensor_size.n() &&
-    input_tensor_size.h() == output_tensor_size.c() &&
-    input_tensor_size.w() == output_tensor_size.h() &&
-    input_tensor_size.c() == output_tensor_size.w());
-  int n = input_tensor_size.n();
-  int h = input_tensor_size.h();
-  int w = input_tensor_size.w();
-  int c = input_tensor_size.c();
-  dim3 grid((c + 31)/32, (h*w + 31)/32, n);
-  dim3 block(32, 8);
-  nhwc_to_nchw_kernel<<<grid, block, 0, stream>>>(ref_output.data(), ref_input.data(),
-                                                  n, h, w, c);
-}
-} //namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_rmsnorm.h DELETED Viewed

@@ -1,186 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_coord.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/util/device_utils.h"
-#include <cfloat>
-namespace cutlass {
-/// RMS-normalizes one row per thread block, processing 8 half values per float4 load.
-///
-/// Each float4 is reinterpreted as four half2 pairs. Pass 1 accumulates the sum of
-/// squares of the row in float; pass 2 re-reads the row and writes
-/// input * rsqrt(sum / n + epsilon) * weight, converted back to half.
-///
-/// \param output   destination rows, viewed as float4 (n / 8 float4 per row)
-/// \param input    source rows, viewed as float4 (n / 8 float4 per row)
-/// \param weight   per-column scale, viewed as float4; indexed by column only
-/// \param m        number of rows (unused here; the row comes from blockIdx.x)
-/// \param n        number of half elements per row; the caller dispatches here only when n % 8 == 0
-/// \param epsilon  value added to the mean square before the reciprocal square root
-__global__ void rmsnorm_twoPassAlgo_e8(float4 *output, const float4 *input,
-                                       const float4 *weight,
-                                       const int m, const int n, float epsilon) {
-  const int m_idx = blockIdx.x;
-  const int tid = threadIdx.x;
-  const int bdimx = blockDim.x;
-  // Despite its name this holds rsqrt(mean(x^2) + epsilon), broadcast to the block.
-  __shared__ float s_mean;
-  float local_sums[1] = {0.0f};
-  const int n_8 = n / 8;
-  // Compute the row offset in 64 bits: m_idx * n_8 can exceed INT_MAX for large tensors.
-  const long long offset = static_cast<long long>(m_idx) * n_8;
-  input += offset;
-  output += offset;
-  // Pass 1: per-thread partial sum of squares over the row.
-  for (int index = tid; index < n_8; index += bdimx) {
-    const float4 local_val = input[index];
-    const half2 *h1 = (half2 *)&local_val.x;
-    const half2 *h2 = (half2 *)&local_val.y;
-    const half2 *h3 = (half2 *)&local_val.z;
-    const half2 *h4 = (half2 *)&local_val.w;
-    local_sums[0] += static_cast<float>(h1->x) * static_cast<float>(h1->x) +
-                     static_cast<float>(h1->y) * static_cast<float>(h1->y) +
-                     static_cast<float>(h2->x) * static_cast<float>(h2->x) +
-                     static_cast<float>(h2->y) * static_cast<float>(h2->y) +
-                     static_cast<float>(h3->x) * static_cast<float>(h3->x) +
-                     static_cast<float>(h3->y) * static_cast<float>(h3->y) +
-                     static_cast<float>(h4->x) * static_cast<float>(h4->x) +
-                     static_cast<float>(h4->y) * static_cast<float>(h4->y);
-  }
-  // Combine the partial sums; the reducers work in place on local_sums.
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  } else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    s_mean = rsqrtf(local_sums[0] / n + epsilon);
-  }
-  // Publish s_mean to every thread before pass 2.
-  __syncthreads();
-  // Pass 2: scale each element by the normalization factor and the column weight.
-  for (int index = tid; index < n_8; index += bdimx) {
-    const float4 local_val = input[index];
-    const float4 weight_val = weight[index];
-    const half2 *l1 = (half2 *)&local_val.x;
-    const half2 *l2 = (half2 *)&local_val.y;
-    const half2 *l3 = (half2 *)&local_val.z;
-    const half2 *l4 = (half2 *)&local_val.w;
-    const half2 *g1 = (half2 *)&weight_val.x;
-    const half2 *g2 = (half2 *)&weight_val.y;
-    const half2 *g3 = (half2 *)&weight_val.z;
-    const half2 *g4 = (half2 *)&weight_val.w;
-    float4 tmp;
-    half2 *h1 = (half2 *)&tmp.x;
-    half2 *h2 = (half2 *)&tmp.y;
-    half2 *h3 = (half2 *)&tmp.z;
-    half2 *h4 = (half2 *)&tmp.w;
-    h1->x = half(static_cast<float>(l1->x) * s_mean * static_cast<float>(g1->x));
-    h1->y = half(static_cast<float>(l1->y) * s_mean * static_cast<float>(g1->y));
-    h2->x = half(static_cast<float>(l2->x) * s_mean * static_cast<float>(g2->x));
-    h2->y = half(static_cast<float>(l2->y) * s_mean * static_cast<float>(g2->y));
-    h3->x = half(static_cast<float>(l3->x) * s_mean * static_cast<float>(g3->x));
-    h3->y = half(static_cast<float>(l3->y) * s_mean * static_cast<float>(g3->y));
-    h4->x = half(static_cast<float>(l4->x) * s_mean * static_cast<float>(g4->x));
-    h4->y = half(static_cast<float>(l4->y) * s_mean * static_cast<float>(g4->y));
-    output[index] = tmp;
-  }
-}
-/// RMS-normalizes one row per thread block, one element at a time (generic fallback).
-///
-/// Pass 1 accumulates the sum of squares of the row in float; pass 2 re-reads the
-/// row and writes input * rsqrt(sum / n + epsilon) * weight, converted back to T.
-///
-/// \param output   destination rows, n elements each
-/// \param input    source rows, n elements each
-/// \param weight   per-column scale; indexed by column only
-/// \param m        number of rows (unused here; the row comes from blockIdx.x)
-/// \param n        number of elements per row
-/// \param epsilon  value added to the mean square before the reciprocal square root
-template<typename T>
-__global__ void rmsnorm_twoPassAlgo_e1(T* output,
-                                       const T* input,
-                                       const T* weight,
-                                       const int m, const int n,
-                                       float epsilon)
-{
-  const int m_idx = blockIdx.x;
-  const int tid = threadIdx.x;
-  const int bdimx = blockDim.x;
-  // Despite its name this holds rsqrt(mean(x^2) + epsilon), broadcast to the block.
-  __shared__ float s_mean;
-  float local_sums[1] = {0.0f};
-  // Compute the row offset in 64 bits: m_idx * n exceeds INT_MAX once the tensor
-  // holds 2^31 or more elements, which overflowed the former int offset.
-  const long long offset = static_cast<long long>(m_idx) * n;
-  input += offset;
-  output += offset;
-  // Pass 1: per-thread partial sum of squares over the row.
-  for (int index = tid ; index < n ; index += bdimx){
-    float local_val = static_cast<float>(input[index]);
-    local_sums[0] += local_val * local_val;
-  }
-  // Combine the partial sums; the reducers work in place on local_sums.
-  if (blockDim.x <= 32) {
-    warpReduceSum<float, 1>(local_sums);
-  }
-  else {
-    blockReduceSum<float, 1>(local_sums);
-  }
-  if (threadIdx.x == 0) {
-    s_mean = rsqrtf(local_sums[0] / n + epsilon);
-  }
-  // Publish s_mean to every thread before pass 2.
-  __syncthreads();
-  // Pass 2: scale each element by the normalization factor and the column weight.
-  for (int index = tid ; index < n ; index += bdimx){
-    const T weight_val = weight[index];
-    const T local_val = input[index];
-    output[index] = T(static_cast<float>(local_val) * s_mean * static_cast<float>(weight_val));
-  }
-}
-/// Host-side launcher for row-wise RMS normalization of an m x n matrix.
-///
-/// Only the data() pointers of the tensor references are used; rows are addressed
-/// by the kernels with a stride of n elements, and the weight is indexed by column.
-///
-/// \param tensor_size  row() gives m (rows), column() gives n (elements per row)
-/// \param ref_output   destination tensor
-/// \param ref_input    source tensor
-/// \param ref_weight   per-column scale
-/// \param stream       CUDA stream the kernel is enqueued on
-/// \param epsilon      value added to the mean square before the reciprocal square root
-///
-/// Empty shapes (m <= 0 or n <= 0) are a no-op. Any launch error is printed to
-/// std::cerr and the process is aborted.
-template <typename T>
-void rmsnorm(cutlass::MatrixCoord tensor_size,
-             TensorRef<T, layout::RowMajor> ref_output,
-             TensorRef<T, layout::RowMajor> ref_input,
-             TensorRef<T, layout::RowMajor> ref_weight,
-             cudaStream_t stream, float epsilon = 1e-5f){
-  const int m = tensor_size.row();
-  const int n = tensor_size.column();
-  // Nothing to normalize. Without this guard a zero-sized grid or block makes the
-  // launch fail and the error path below aborts the whole process.
-  if (m <= 0 || n <= 0) {
-    return;
-  }
-  T* output = ref_output.data();
-  const T* input = ref_input.data();
-  const T* weight = ref_weight.data();
-  // One thread block per row.
-  dim3 grid(m);
-  if (n % 8 == 0 && std::is_same<T, cutlass::half_t>::value) {
-    // Vectorized path: 8 halves per float4, block size rounded up to a warp multiple.
-    dim3 block(cutlass::platform::min(1024, (n / 8 + 31) / 32 * 32));
-    rmsnorm_twoPassAlgo_e8<<<grid, block, 0, stream>>>(
-        (float4 *)output, (const float4 *)input, (const float4 *)weight, m, n, epsilon);
-  } else {
-    // Generic path: block size is ceil(n / 32) rounded up to a warp multiple, capped at 1024.
-    dim3 block(cutlass::platform::min(1024, ((n + 31)/32 + 31)/32*32));
-    rmsnorm_twoPassAlgo_e1<<<grid, block, 0, stream>>>(
-        output, input, weight, m, n, epsilon);
-  }
-  auto result = cudaGetLastError();
-  if (result != cudaSuccess) {
-    std::cerr << "CUDA error: " << cudaGetErrorString(result) << std::endl;
-    abort();
-  }
-}
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/device_utils.h DELETED Viewed

@@ -1,127 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Device-side utility code (warp- and block-level reductions) for CUTLASS kernels
-*/
-#pragma once
-#include <cuda_fp16.h>
-#include <cfloat>
-#define FINAL_MASK 0xffffffff
-/// Plain aggregate of four half-precision values, in x, y, z, w order.
-struct half4 {
-    half x;
-    half y;
-    half z;
-    half w;
-};
-/// In-place warp-wide sum of NUM independent values.
-///
-/// After the call every lane's val[k] holds the sum of val[k] across the warp
-/// (XOR-shuffle exchange with strides 16, 8, 4, 2, 1 over the full lane mask).
-/// The return value is always zero; read the result from val.
-template<typename T, int NUM>
-__inline__ __device__ T warpReduceSum(T* val)
-{
-#pragma unroll
-    for (int slot = 0; slot < NUM; ++slot) {
-        T running = val[slot];
-#pragma unroll
-        for (int stride = 16; stride >= 1; stride /= 2) {
-            running += __shfl_xor_sync(FINAL_MASK, running, stride, 32);
-        }
-        val[slot] = running;
-    }
-    return static_cast<T>(0.0f);
-}
-template<typename T, int NUM>  // T: element type; NUM: number of independent sums reduced together
-__inline__ __device__ T blockReduceSum(T* val)  // in-place block-wide sum; read the result from val[], not from the return value
-{
-    __shared__ T shared[NUM][33];  // one slot per warp for each of the NUM sums; 33 columns rather than 32 (presumably padding - TODO confirm)
-    int lane = threadIdx.x & 0x1f;  // index within the warp (threadIdx.x % 32)
-    int wid = threadIdx.x >> 5;  // warp index within the block (threadIdx.x / 32)
-    warpReduceSum<T, NUM>(val);  // stage 1: every lane now holds its own warp's partial sums
-    if (lane == 0) {  // lane 0 publishes its warp's partial sums
-#pragma unroll
-        for (int i = 0; i < NUM; i++) {
-            shared[i][wid] = val[i];
-        }
-    }
-    __syncthreads();  // all per-warp partials must be written before they are read back
-    bool is_mask = threadIdx.x < (blockDim.x / 32.f);  // float division so a trailing partial warp is still counted
-#pragma unroll
-    for (int i = 0; i < NUM; i++) {
-        val[i] = is_mask ? shared[i][lane] : (T)(0.0f);  // threads without a matching warp slot contribute 0
-    }
-    warpReduceSum<T, NUM>(val);  // stage 2: combine the per-warp partials; the first warp ends up with the block total
-    return (T)0.0f;  // the return value carries no information
-}
-/// In-place warp-wide maximum of NUM independent values.
-///
-/// After the call every lane's val[k] holds the maximum of val[k] across the warp
-/// (XOR-shuffle exchange with strides 16, 8, 4, 2, 1 over the full lane mask).
-/// The return value is always zero; read the result from val.
-template<typename T, int NUM>
-__inline__ __device__ T warpReduceMax(T* val)
-{
-#pragma unroll
-    for (int slot = 0; slot < NUM; ++slot) {
-        T best = val[slot];
-#pragma unroll
-        for (int stride = 16; stride >= 1; stride /= 2) {
-            best = max(best, __shfl_xor_sync(FINAL_MASK, best, stride, 32));
-        }
-        val[slot] = best;
-    }
-    return static_cast<T>(0.0f);
-}
-template<typename T, int NUM>  // T: element type; NUM: number of independent maxima reduced together
-__inline__ __device__ T blockReduceMax(T* val)  // in-place block-wide max; read the result from val[], not from the return value
-{
-    static __shared__ T shared[32][NUM];  // one row per warp, one column per reduced value
-    int lane = threadIdx.x & 0x1f;  // index within the warp
-    int wid = threadIdx.x >> 5;     // warp index within the block
-    warpReduceMax<T, NUM>(val);  // stage 1: every lane now holds its own warp's maxima
-    if (lane == 0)  // lane 0 records its warp's maxima under the warp index
-    {
-#pragma unroll
-        for (int i = 0; i < NUM; i++) {
-            shared[wid][i] = val[i];
-        }
-    }
-    __syncthreads();  // all per-warp maxima must be written before they are read back
-    // Uses blockDim.x / 32.f (floating-point division) rather than blockDim.x >> 5 so that
-    // a trailing partial warp is still counted when blockDim.x is not a multiple of 32.
-    bool is_mask = threadIdx.x < (blockDim.x / 32.f);
-#pragma unroll
-    for (int i = 0; i < NUM; i++) {
-        val[i] = is_mask ? shared[lane][i] : (T)(-FLT_MAX);  // threads without a matching warp row contribute -FLT_MAX
-    }
-    warpReduceMax<T, NUM>(val);  // stage 2: combine the per-warp maxima; the first warp ends up with the block maximum
-    return (T)0.0f;  // the return value carries no information
-}

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/distribution.h DELETED Viewed

@@ -1,157 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-/*! \file
-    \brief This header contains a class to parametrize a statistical distribution function.
-*/
-#include <ostream>
-namespace cutlass {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Tagged description of how tensor elements are generated: `kind` selects which union member is meaningful
-struct Distribution {
-  /// Variant types
-  enum Kind { Invalid, Uniform, Gaussian, Identity, Sequential, AllZeros, AllOnes };
-  /// Distribution state; none of the setters below writes it for Identity
-  union {
-    /// Uniform distribution
-    struct {
-      double min;
-      double max;
-      // Percent elements set to NaN
-      double pnan;
-    } uniform;
-    /// Gaussian distribution
-    struct {
-      double mean;
-      double stddev;
-      double pnz;  // set_gaussian() stores the same value in pnz, pnzA, pnzB and pnzC
-      double pnzA;
-      double pnzB;
-      double pnzC;
-    } gaussian;
-    /// Sequential distribution parameters (start value and increment)
-    struct {
-      double start;
-      double delta;
-    } sequential;
-  };
-  /// Active variant kind
-  Kind kind;
-  /// Random values are cast to integer after scaling by this power of two
-  int int_scale;
-  //
-  // Methods
-  //
-  Distribution() : kind(Invalid), int_scale(0) {}  // the union members are left uninitialized
-  /// Configures distribution as uniform random over [_min, _max]; returns *this for chaining
-  Distribution &set_uniform(double _min, double _max, int _int_scale = 0, double _pnan = 0) {
-    kind = Uniform;
-    uniform.min = _min;
-    uniform.max = _max;
-    int_scale = _int_scale;
-    uniform.pnan = _pnan;
-    return *this;
-  }
-  /// Configures distribution as Gaussian distribution; returns *this for chaining
-  Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 1.0) {
-    kind = Gaussian;
-    gaussian.mean = _mean;
-    gaussian.stddev = _stddev;
-    gaussian.pnz = _pnz;
-    gaussian.pnzA = _pnz;
-    gaussian.pnzB = _pnz;
-    gaussian.pnzC = _pnz;
-    int_scale = _int_scale;
-    return *this;
-  }
-  /// Sets identity; only `kind` changes, int_scale keeps its previous value
-  Distribution &set_identity() {
-    kind = Identity;
-    return *this;
-  }
-  /// Sets sequential with the given start value and increment; returns *this for chaining
-  Distribution &set_sequential(double start, double delta, int _int_scale = 0) {
-    kind = Sequential;
-    sequential.start = start;
-    sequential.delta = delta;
-    int_scale = _int_scale;
-    return *this;
-  }
-};
-}  // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Prints a Distribution to ostream: the kind-specific parameters (none for
-/// identity/sequential, "unknown" for every other kind) followed by int_scale.
-inline std::ostream &operator<<(std::ostream &out, cutlass::Distribution const &dist) {
-  using Dist = cutlass::Distribution;
-  if (dist.kind == Dist::Uniform) {
-    out << "uniform, min: " << dist.uniform.min;
-    out << ", max: " << dist.uniform.max;
-    out << ", pnan: " << dist.uniform.pnan;
-  } else if (dist.kind == Dist::Gaussian) {
-    out << "gaussian, mean: " << dist.gaussian.mean;
-    out << ", stddev: " << dist.gaussian.stddev;
-    out << ", pnzA: " << dist.gaussian.pnzA;
-    out << ", pnzB: " << dist.gaussian.pnzB;
-    out << ", pnzC: " << dist.gaussian.pnzC;
-  } else if (dist.kind == Dist::Identity) {
-    out << "identity";
-  } else if (dist.kind == Dist::Sequential) {
-    out << "sequential";
-  } else {
-    out << "unknown";
-  }
-  return out << ", int_scale: " << dist.int_scale;
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/exceptions.h DELETED Viewed

@@ -1,69 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-/**
- * \file
- * \brief C++ exception semantics for CUDA error codes
- */
-#include <cuda_runtime.h>
-#include <iosfwd>
-#include <stdexcept>
-#include "cutlass/platform/platform.h"
-namespace cutlass {
-/// C++ exception wrapper for CUDA \p cudaError_t
-///
-/// Stores a caller-supplied message pointer together with the CUDA error code.
-/// The message is not copied, so the pointed-to string must outlive the exception
-/// (string literals are the safe choice).
-class cuda_exception : public std::exception {
- public:
-  /// Constructor
-  /// \param msg  explanatory string returned by what(); stored by pointer, not copied
-  /// \param err  CUDA error code reported by cudaError()
-  cuda_exception(const char* msg = "", cudaError_t err = cudaErrorUnknown) : msg(msg), err(err) {}
-  /// Returns the explanatory string passed to the constructor.
-  /// Without this override std::exception::what() would be used and the stored
-  /// message would never reach the caller.
-  const char* what() const noexcept override { return msg != nullptr ? msg : ""; }
-  /// Returns the underlying CUDA \p cudaError_t
-  cudaError_t cudaError() const { return err; }
- protected:
-  /// Explanatory string
-  const char* msg;
-  /// Underlying CUDA \p cudaError_t
-  cudaError_t err;
-};
-/// Writes a cuda_exception instance to an output stream as
-/// "<what()>: <CUDA error string>".
-inline std::ostream& operator<<(std::ostream& out, cuda_exception const& e) {
-  out << e.what();
-  out << ": ";
-  out << cudaGetErrorString(e.cudaError());
-  return out;
-}
-}  // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/gett_commandline.hpp DELETED Viewed

@@ -1,369 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief GETT command line parser to gather semantic modes, their stride order, and extents.
-*/
-#pragma once
-#include <iostream>
-#include <iomanip>
-#include <utility>
-#include <type_traits>
-#include <vector>
-#include <map>
-#include <algorithm>
-#include <numeric>
-#include "cutlass/util/command_line.h"
-namespace cutlass {
-// Output shortcuts
-/// Streams the characters of `data` back to back, with no separator.
-/// Declared inline because this definition lives in a header: a non-inline
-/// definition is emitted in every including translation unit and fails at link time.
-/// The vector is taken by const reference to avoid copying it on every print.
-inline std::ostream& operator<<(std::ostream& os, std::vector<char> const& data) {
-  for (char a : data) os << a;
-  return os;
-}
-/// Streams every element of `data` followed by a single space.
-/// The vector is taken by const reference so that printing does not copy it.
-template <class T>
-std::ostream& operator<<(std::ostream& os, std::vector<T> const& data) {
-  for (auto const& a : data) os << a << " ";
-  return os;
-}
-struct GettCommandLine {
-  struct GettProblem {  // result of parse(); member order matters because parse() brace-initializes it positionally
-    using extent_type = int;  // type of a mode's extent (size)
-    using stride_type = int64_t;  // type of a mode's stride
-    // Row modes: appear in A and C/D (extents, then strides in A and in C)
-    std::vector<extent_type> M;
-    std::vector<stride_type> ldAm;
-    std::vector<stride_type> ldCm;
-    // Column modes: appear in B and C/D (extents, then strides in B and in C)
-    std::vector<extent_type> N;
-    std::vector<stride_type> ldBn;
-    std::vector<stride_type> ldCn;
-    // Reduction modes: appear in A and B (extents, then strides in A and in B)
-    std::vector<extent_type> K;
-    std::vector<stride_type> ldAk;
-    std::vector<stride_type> ldBk;
-    // Batch modes: appear in all in/out tensors (extents, then strides in A, B and C)
-    std::vector<extent_type> L;
-    std::vector<stride_type> ldAl;
-    std::vector<stride_type> ldBl;
-    std::vector<stride_type> ldCl;
-  };
-  static GettProblem  // parses --modeA/--modeB/--modeC/--extents into per-category extents and strides
-  parse(int argc, char const* argv[], bool parse_verbose = false) {  // parse_verbose: also print the derived problem to std::cout
-    using extent_type = typename GettProblem::extent_type;
-    using stride_type = typename GettProblem::stride_type;
-    cutlass::CommandLine cmd(argc, argv);
-    // modeA: symbolic mode labels of tensor A, in the order given on the command line
-    std::vector<char> a_mode;
-    cmd.get_cmd_line_arguments("modeA", a_mode);
-    // modeB: symbolic mode labels of tensor B
-    std::vector<char> b_mode;
-    cmd.get_cmd_line_arguments("modeB", b_mode);
-    // modeC: symbolic mode labels of tensor C
-    std::vector<char> c_mode;
-    cmd.get_cmd_line_arguments("modeC", c_mode);
-    // mode_sizes: extent of every symbolic mode, keyed by its label
-    std::map<char,extent_type> mode_size;
-    // First, initialize all modes in a, b, c to make sure they're in map
-    for (char a : a_mode) mode_size[a] = 1;
-    for (char b : b_mode) mode_size[b] = 1;
-    for (char c : c_mode) mode_size[c] = 1;
-    // Then, overwrite the ones given in --extents
-    std::vector<std::pair<std::string, std::string> > extent_tokens;
-    cmd.get_cmd_line_argument_pairs("extents", extent_tokens);
-    for (auto e : extent_tokens) {
-      if (std::get<0>(e).size() > 1) {
-        std::cerr << "ERROR: Mode name must only be 1 character long.\n";
-        print_usage();
-        exit(1);
-      }
-      char label = std::get<0>(e)[0];
-      int  size  = std::stoi(std::get<1>(e));  // std::stoi throws on non-numeric text; not caught here
-      mode_size[label] = size;  // also adds labels that appear in none of the mode lists
-    }
-    // Print out symbolic modes and their extents
-    if (parse_verbose) {
-      std::cout << "C_" << c_mode << " = A_" << a_mode << " * B_" << b_mode << "\n";
-      for (auto e : mode_size) std::cout << "     " << std::get<0>(e) << " : " << std::get<1>(e) << "\n";
-    }
-    //
-    // Collect/Compute strides
-    //
-    std::map<char,stride_type> mode_ldA;
-    std::map<char,stride_type> mode_ldB;
-    std::map<char,stride_type> mode_ldC;
-    {
-      stride_type current;  // running product of extents: the first listed mode gets stride 1
-      current = 1;
-      for (char a : a_mode) { mode_ldA[a] = current; current *= mode_size[a]; }
-      current = 1;
-      for (char b : b_mode) { mode_ldB[b] = current; current *= mode_size[b]; }
-      current = 1;
-      for (char c : c_mode) { mode_ldC[c] = current; current *= mode_size[c]; }
-    }
-    //
-    // Collect mode categories
-    //
-    std::vector<char> row_mode;  // rows
-    std::vector<char> col_mode;  // columns
-    std::vector<char> red_mode;  // reductions
-    std::vector<char> bat_mode;  // batches
-    {
-      std::vector<char> a_label = a_mode;  // sorted copies: std::set_intersection requires sorted ranges
-      std::vector<char> b_label = b_mode;
-      std::vector<char> c_label = c_mode;
-      std::sort(std::begin(a_label), std::end(a_label));
-      std::sort(std::begin(b_label), std::end(b_label));
-      std::sort(std::begin(c_label), std::end(c_label));
-      // std::set_intersection to find the semantic category of each symbolic mode
-      std::set_intersection(std::begin(a_label), std::end(a_label),
-                            std::begin(c_label), std::end(c_label),
-                            std::back_inserter(row_mode));
-      std::set_intersection(std::begin(b_label), std::end(b_label),
-                            std::begin(c_label), std::end(c_label),
-                            std::back_inserter(col_mode));
-      std::set_intersection(std::begin(a_label), std::end(a_label),
-                            std::begin(b_label), std::end(b_label),
-                            std::back_inserter(red_mode));
-      std::set_intersection(std::begin(row_mode), std::end(row_mode),
-                            std::begin(col_mode), std::end(col_mode),
-                            std::back_inserter(bat_mode));
-      // Erase-remove each batch mode from the row, column and reduction lists
-      for (char l : bat_mode) {
-        row_mode.erase(std::remove(std::begin(row_mode), std::end(row_mode), l), std::end(row_mode));
-        col_mode.erase(std::remove(std::begin(col_mode), std::end(col_mode), l), std::end(col_mode));
-        red_mode.erase(std::remove(std::begin(red_mode), std::end(red_mode), l), std::end(red_mode));
-      }
-    }
-    // Print out the semantic association of each symbolic mode
-    if (parse_verbose) {
-      std::cout << "  rows : " << row_mode << '\n';
-      std::cout << "  cols : " << col_mode << '\n';
-      std::cout << "  reds : " << red_mode << '\n';
-      std::cout << "  bats : " << bat_mode << '\n';
-    }
-    //
-    // Permute modes
-    //
-    // Permute the batched modes to promote coalescing
-    // Sort the batched modes by min(ldAl,ldBl) and in case of a tie by the size
-    std::sort(std::begin(bat_mode), std::end(bat_mode), [&](char l1, char l2) {
-        return std::tie(std::min(mode_ldA[l1],mode_ldB[l1]),mode_size[l1])
-             < std::tie(std::min(mode_ldA[l2],mode_ldB[l2]),mode_size[l2]);
-      });
-    // Compute sizes and strides of ordered batch modes
-    std::vector<extent_type> L;
-    std::vector<stride_type> ldAl;
-    std::vector<stride_type> ldBl;
-    std::vector<stride_type> ldCl;
-    for (char l : bat_mode) {
-      L.push_back(mode_size[l]);
-      ldAl.push_back(mode_ldA[l]);
-      ldBl.push_back(mode_ldB[l]);
-      ldCl.push_back(mode_ldC[l]);
-    }
-    // Permute the reduction modes to promote coalescing
-    // Sort the reduction modes by min(ldAk,ldBk) and in case of a tie by the size
-    std::sort(std::begin(red_mode), std::end(red_mode), [&](char k1, char k2) {
-        return std::tie(std::min(mode_ldA[k1],mode_ldB[k1]),mode_size[k1])
-             < std::tie(std::min(mode_ldA[k2],mode_ldB[k2]),mode_size[k2]);
-      });
-    // Compute sizes and strides of ordered reduction modes
-    std::vector<extent_type> K;
-    std::vector<stride_type> ldAk;
-    std::vector<stride_type> ldBk;
-    for (char k : red_mode) {
-      K.push_back(mode_size[k]);
-      ldAk.push_back(mode_ldA[k]);
-      ldBk.push_back(mode_ldB[k]);
-    }
-    // Permute the row modes to promote coalescing
-    // Sort the row modes by min(ldAm,ldCm) and in case of a tie by ldAm
-    std::sort(std::begin(row_mode), std::end(row_mode), [&](char m1, char m2) {
-        return std::tie(std::min(mode_ldA[m1],mode_ldC[m1]),mode_ldA[m1])
-             < std::tie(std::min(mode_ldA[m2],mode_ldC[m2]),mode_ldA[m2]);
-      });
-    // Compute sizes and strides of ordered row modes
-    std::vector<extent_type> M;
-    std::vector<stride_type> ldAm;
-    std::vector<stride_type> ldCm;
-    for (char m : row_mode) {
-      M.push_back(mode_size[m]);
-      ldAm.push_back(mode_ldA[m]);
-      ldCm.push_back(mode_ldC[m]);
-    }
-    // Permute the col modes to promote coalescing
-    // Sort the col modes by min(ldBn,ldCn) and in case of a tie by ldBn
-    std::sort(std::begin(col_mode), std::end(col_mode), [&](char n1, char n2) {
-        return std::tie(std::min(mode_ldB[n1],mode_ldC[n1]),mode_ldB[n1])
-             < std::tie(std::min(mode_ldB[n2],mode_ldC[n2]),mode_ldB[n2]);
-      });
-    // Compute sizes and strides of ordered col modes
-    std::vector<extent_type> N;
-    std::vector<stride_type> ldBn;
-    std::vector<stride_type> ldCn;
-    for (char n : col_mode) {
-      N.push_back(mode_size[n]);
-      ldBn.push_back(mode_ldB[n]);
-      ldCn.push_back(mode_ldC[n]);
-    }
-    if (parse_verbose) {  // print the grouped contraction, total extents per category, and all strides
-      std::cout << "C_";
-      if (! row_mode.empty()) {
-        std::cout << "(" << row_mode << ")";
-      }
-      if (! col_mode.empty()) {
-        std::cout << "(" << col_mode << ")";
-      }
-      if (! bat_mode.empty()) {
-        std::cout << "(" << bat_mode << ")";
-      }
-      std::cout << " = A_";
-      if (! row_mode.empty()) {
-        std::cout << "(" << row_mode << ")";
-      }
-      if (! red_mode.empty()) {
-        std::cout << "(" << red_mode << ")";
-      }
-      if (! bat_mode.empty()) {
-        std::cout << "(" << bat_mode << ")";
-      }
-      std::cout << " * B_";
-      if (! col_mode.empty()) {
-        std::cout << "(" << col_mode << ")";
-      }
-      if (! red_mode.empty()) {
-        std::cout << "(" << red_mode << ")";
-      }
-      if (! bat_mode.empty()) {
-        std::cout << "(" << bat_mode << ")";
-      }
-      std::cout << '\n';
-      int M_size = std::accumulate(std::begin(M), std::end(M), 1, std::multiplies<>{});  // product of extents, accumulated in int
-      int N_size = std::accumulate(std::begin(N), std::end(N), 1, std::multiplies<>{});
-      int K_size = std::accumulate(std::begin(K), std::end(K), 1, std::multiplies<>{});
-      int L_size = std::accumulate(std::begin(L), std::end(L), 1, std::multiplies<>{});
-      std::cout << "     M : (" << M_size << ") ";
-      for (char m : row_mode) std::cout << m << ":" << mode_size[m] << " ";
-      std::cout << '\n';
-      std::cout << "     N : (" << N_size << ") ";
-      for (char n : col_mode) std::cout << n << ":" << mode_size[n] << " ";
-      std::cout << '\n';
-      std::cout << "     K : (" << K_size << ") ";
-      for (char k : red_mode) std::cout << k << ":" << mode_size[k] << " ";
-      std::cout << '\n';
-      std::cout << "     L : (" << L_size << ") ";
-      for (char l : bat_mode) std::cout << l << ":" << mode_size[l] << " ";
-      std::cout << '\n';
-      std::cout << "  ldAm : " << ldAm << '\n';
-      std::cout << "  ldAk : " << ldAk << '\n';
-      std::cout << "  ldAl : " << ldAl << '\n';
-      std::cout << "  ldBn : " << ldBn << '\n';
-      std::cout << "  ldBk : " << ldBk << '\n';
-      std::cout << "  ldBl : " << ldBl << '\n';
-      std::cout << "  ldCm : " << ldCm << '\n';
-      std::cout << "  ldCn : " << ldCn << '\n';
-      std::cout << "  ldCl : " << ldCl << '\n';
-    }
-    return {M, ldAm, ldCm,  // positional init: must follow GettProblem's member order
-            N, ldBn, ldCn,
-            K, ldAk, ldBk,
-            L, ldAl, ldBl, ldCl};
-  }
-  /// Prints the command line help for the GETT problem parser to std::cout.
-  /// The text documents --modeA, --modeB, --modeC and --extents and ends with an example.
-  static void
-  print_usage() {
-    std::cout <<
-      "GETT problem command line parser:\n"
-      "  --modeA=<m0,...>\n"
-      "    A comma delimited list of characters that correspond to the row, reduction, and batch modes in A tensor.\n"
-      "    The semantic association of each symbolic mode is determined automatically.\n\n"
-      "  --modeB=<m0,...>\n"
-      "    A comma delimited list of characters that correspond to the column, reduction, and batch modes in B tensor.\n"
-      "    The semantic association of each symbolic mode is determined automatically.\n\n"
-      "  --modeC=<m0,...>\n"
-      "    A comma delimited list of characters that correspond to the row, column, and batch modes in C tensor.\n"
-      "    The semantic association of each symbolic mode is determined automatically.\n\n"
-      "  --extents=<mode:extent,....>\n"
-      "    A comma delimited list of symbolic modes and their corresponding extents.\n"
-      "    Extents are defaulted to 1 if any are not provided.\n\n"
-      "Example usage: gett.exe --modeC=m,n,l --modeA=m,k,l --modeB=k,n,l --extents=m:4096,n:4096,k:4096\n";
-  }
-};
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/helper_cuda.hpp DELETED Viewed

@@ -1,116 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <cuda.h>
-#include <cute/util/debug.hpp>
-namespace cute
-{
-/// Makes CUDA device `device_id` the current device and, unless `quiet` is true, prints its
-/// name, SM version (major * 10 + minor), and multiprocessor count to stdout.
-///
-/// Exits the process with status 1 when the device reports a major compute capability below 1.
-/// Every CUDA runtime call is wrapped in CUTE_CHECK_ERROR (defined elsewhere; presumably it
-/// reports and aborts on failure -- confirm against cute/util/debug.hpp).
-///
-/// Marked `inline` because the function is defined in a header: without it, including this
-/// header from more than one translation unit yields duplicate definitions at link time.
-inline void
-device_init(int device_id, bool quiet = false)
-{
-  cudaDeviceProp device_prop;
-  // Free/total memory are queried but not otherwise used by this function.
-  std::size_t    device_free_physmem;
-  std::size_t    device_total_physmem;
-  CUTE_CHECK_ERROR(cudaSetDevice(device_id));
-  CUTE_CHECK_ERROR(cudaMemGetInfo(&device_free_physmem, &device_total_physmem));
-  CUTE_CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device_id));
-  if (device_prop.major < 1) {
-    fprintf(stderr, "Device does not support CUDA.\n");
-    exit(1);
-  }
-  //float device_giga_bandwidth = float(device_prop.memoryBusWidth) * device_prop.memoryClockRate * 2 / 8 / 1000 / 1000;
-  if (!quiet) {
-    printf("Using device %d: %s  (SM%d, %d SMs)\n",
-           device_id, device_prop.name,
-           device_prop.major * 10 + device_prop.minor,
-           device_prop.multiProcessorCount);
-    fflush(stdout);
-  }
-}
-/**
- * Maps an SM version (major.minor, e.g. 7.0 or 7.5) to the number of cores per SM.
- *
- * Versions that are not in the table produce a message on stdout and fall back to the
- * core count of the last table entry.
- */
-inline int
-_ConvertSMVer2Cores(int major, int minor)
-{
-  // One table row: an SM version encoded as 0xMm (M = major, m = minor) and its cores per SM.
-  struct ArchCores {
-    int sm;
-    int cores;
-  };
-  const ArchCores kCoresPerSM[] = {
-      {0x30, 192},
-      {0x32, 192},
-      {0x35, 192},
-      {0x37, 192},
-      {0x50, 128},
-      {0x52, 128},
-      {0x53, 128},
-      {0x60,  64},
-      {0x61, 128},
-      {0x62, 128},
-      {0x70,  64},
-      {0x72,  64},
-      {0x75,  64}};
-  const int kNumEntries = static_cast<int>(sizeof(kCoresPerSM) / sizeof(kCoresPerSM[0]));
-  const int encoded_version = (major << 4) + minor;
-  for (const ArchCores& entry : kCoresPerSM) {
-    if (entry.sm == encoded_version) {
-      return entry.cores;
-    }
-  }
-  // Unknown version: report it and reuse the last known entry so callers can keep running.
-  const int fallback_cores = kCoresPerSM[kNumEntries - 1].cores;
-  printf("MapSMtoCores for SM %d.%d is undefined."
-         "  Default to use %d Cores/SM\n",
-         major, minor, fallback_cores);
-  return fallback_cores;
-}
-} // end namespace cute

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_reorder.h DELETED Viewed

@@ -1,111 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief reorder data from the host side
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/util/tensor_view_io.h"
-#include "cutlass/util/reference/host/gemm.h"
-namespace cutlass {
-/// Copies `src` into `dest` with its columns permuted. This is needed for the interleaved
-/// integer tensor core kernels; the purpose is to let the epilogue skip its shared memory part.
-///
-/// Iterates n over [0, problem_size.n()) and k over [0, problem_size.k()), writing
-/// src(k, n) to dest(k, permuted n).
-template <int Interleaved, typename Element, typename Layout>
-void reorder_column(TensorRef<Element, Layout> dest,
-                    TensorRef<Element, Layout> src,
-                    cutlass::gemm::GemmCoord problem_size) {
-  const int kInstructionCols = 8;
-  // 4 threads per Quad
-  const int kColsPerThread = kInstructionCols / 4;
-  // 4 threads per Quad
-  const int kReorderedColsPerThread = Interleaved / 4;
-  for (int col = 0; col < problem_size.n(); ++col) {
-    for (int row = 0; row < problem_size.k(); ++row) {
-      // Start of the Interleaved-wide group that contains this column.
-      const int group_base = (col / Interleaved) * Interleaved;
-      const int tile_offset =
-          ((col % kReorderedColsPerThread) / kColsPerThread) * kInstructionCols;
-      const int thread_offset =
-          ((col % Interleaved) / kReorderedColsPerThread) * kColsPerThread;
-      const int lane_offset = col % kColsPerThread;
-      const int dest_col = group_base + tile_offset + thread_offset + lane_offset;
-      dest.at({row, dest_col}) = src.at({row, col});
-    }
-  }
-}
-/// Rewraps `dest` and `src` as RowMajorInterleaved<LayoutInterleaved> tensor references
-/// (same data pointer, stride taken from stride(0)) and forwards them to
-/// reorder_column<ColumnInterleaved>.
-template <int ColumnInterleaved, int LayoutInterleaved = ColumnInterleaved, typename Element, typename Layout>
-void reorder_convK(TensorRef<Element, Layout> dest,
-                    TensorRef<Element, Layout> src,
-                    cutlass::gemm::GemmCoord problem_size) {
-    using InterleavedRef = TensorRef<Element, layout::RowMajorInterleaved<LayoutInterleaved>>;
-    InterleavedRef interleaved_dest(dest.data(), dest.stride(0));
-    InterleavedRef interleaved_src(src.data(), src.stride(0));
-    reorder_column<ColumnInterleaved>(interleaved_dest, interleaved_src, problem_size);
-}
-/// Copies `src` into `dest` with rows permuted and 2x2 blocks swizzled. This is needed for
-/// the sparse tensor core kernels; the purpose is to use ldmatrix to load from shared memory
-/// to the register file.
-///
-/// Visits every (m, k) with m in [0, problem_size.m()) and k in [0, problem_size.k()).
-template <typename Element, typename LayoutDest, typename LayoutSrc>
-void reorder_meta(TensorRef<Element, LayoutDest> dest,
-                  TensorRef<Element, LayoutSrc> src,
-                  cutlass::gemm::GemmCoord problem_size) {
-  // 2-byte elements use groups of 32 rows interleaved by 4; every other element size uses
-  // groups of 16 rows interleaved by 2.
-  const bool is_two_byte = (sizeof(Element) == 2);
-  const int rows_per_group = is_two_byte ? 32 : 16;
-  const int row_interleave = is_two_byte ? 4 : 2;
-  for (int row = 0; row < problem_size.m(); ++row) {
-    // First reorder the rows; the result does not depend on the column.
-    const int reordered_row =
-        row / rows_per_group * rows_per_group + (row % 8) * row_interleave + (row % rows_per_group) / 8;
-    for (int col = 0; col < problem_size.k(); ++col) {
-      int dest_row = reordered_row;
-      int dest_col = col;
-      // Next swizzle the 2x2 blocks from Z to N: the two off-diagonal entries trade places.
-      const bool row_is_odd = (dest_row % 2) == 1;
-      const bool col_is_odd = (dest_col % 2) == 1;
-      if (!row_is_odd && col_is_odd) {
-        dest_row += 1;
-        dest_col -= 1;
-      } else if (row_is_odd && !col_is_odd) {
-        dest_row -= 1;
-        dest_col += 1;
-      }
-      dest.at({dest_row, dest_col}) = src.at({row, col});
-    }
-  }
-}
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_tensor.h DELETED Viewed

@@ -1,541 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-/*! \file
-  \brief HostTensor provides management for both host and device memory.
-  HostTensor allocates host and device memory upon construction. Basic element-wise operations on
-  host memory synchronize device memory automatically. Explicit copy operations provide abstractions
-  for CUDA memcpy operations.
-  Call {host, device}_{data, ref, view}() for accessing host or device memory.
-  See cutlass/tensor_ref.h and cutlass/tensor_view.h for more details.
-*/
-#include <vector>
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/fast_math.h"
-#include "device_memory.h"
-namespace cutlass {
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host tensor: owns a host-side buffer (std::vector) and, optionally, a device-side
-/// allocation of the same number of storage units, plus the extent and layout that map
-/// logical coordinates onto them.
-template <
-  /// Data type of element stored within tensor (concept: NumericType)
-  typename Element_,
-  /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
-  typename Layout_
->
-class HostTensor {
-public:
-  /// Data type of individual access
-  using Element = Element_;
-  /// Mapping function from logical coordinate to linear memory
-  using Layout = Layout_;
-  /// Logical rank of tensor index space
-  static int const kRank = Layout::kRank;
-  /// Index type
-  using Index = typename Layout::Index;
-  /// Long index used for pointer offsets
-  using LongIndex = typename Layout::LongIndex;
-  /// Coordinate in logical tensor space
-  using TensorCoord = typename Layout::TensorCoord;
-  /// Layout's stride vector
-  using Stride = typename Layout::Stride;
-  /// Tensor reference type (pointer and layout); used for both host and device memory
-  using TensorRef = TensorRef<Element, Layout>;
-  /// Tensor reference type to constant data
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-  /// Tensor view type (pointer, layout, and extent); used for both host and device memory
-  using TensorView = TensorView<Element, Layout>;
-  /// Tensor view type to constant data
-  using ConstTensorView = typename TensorView::ConstTensorView;
-  /// Reference to element in tensor
-  using Reference = typename TensorRef::Reference;
-  /// Constant reference to element in tensor
-  using ConstReference = typename ConstTensorRef::Reference;
-private:
-  using StorageUnit = typename platform::conditional_t<std::is_same_v<Element, bool>, uint8_t,            // Avoid the std::vector<bool> specialization
-                                  typename platform::conditional_t<sizeof_bits<Element>::value % 8 == 0,  // Handle subbyte types
-                                      Element, uint8_t>>;
-  using StorageContainerCalculator = cutlass::detail::StorageContainerCalculator<Element, StorageUnit>;
-  static constexpr int kContainerTypeNumBits = StorageContainerCalculator::kContainerTypeNumBits;
-  static constexpr int kContainerTypeNumLogicalElements = StorageContainerCalculator::kContainerTypeNumLogicalElements;
-  static constexpr int kContainerTypeNumBytes = StorageContainerCalculator::kContainerTypeNumBytes;
-  static constexpr int kContainerTypeNumStorageUnit = StorageContainerCalculator::kContainerTypeNumStorageUnit;
-  //
-  // Data members
-  //
-  /// Extent of tensor in logical dimensions
-  TensorCoord extent_;
-  /// Layout object
-  Layout layout_;
-  /// Host-side memory allocation
-  std::vector<StorageUnit> host_;
-  /// Device-side memory
-  device_memory::allocation<StorageUnit> device_;
-  /// Converts a logical element count into the number of StorageUnit entries needed to hold
-  /// it, rounding up to whole containers. Const so the const copy_out_*() methods can call it.
-  size_t count_to_container_storage_unit_count(size_t count) const {
-    return (count + kContainerTypeNumLogicalElements - 1) / kContainerTypeNumLogicalElements * kContainerTypeNumStorageUnit;
-  }
-public:
-  //
-  // Device and Host Methods
-  //
-  /// Default constructor
-  HostTensor() {}
-  /// Constructs a tensor given an extent. Assumes a packed layout
-  HostTensor(
-    TensorCoord const &extent,
-    bool device_backed = true
-  ) {
-    this->reset(extent, Layout::packed(extent), device_backed);
-  }
-  /// Constructs a tensor given an extent and layout
-  HostTensor(
-    TensorCoord const &extent,
-    Layout const &layout,
-    bool device_backed = true
-  ) {
-    this->reset(extent, layout, device_backed);
-  }
-  ~HostTensor() { }
-  /// Clears the HostTensor allocation to size/capacity = 0
-  void reset() {
-    extent_ = TensorCoord();
-    layout_ = Layout::packed(extent_);
-    host_.clear();
-    device_.reset();
-  }
-  /// Resizes internal memory allocations without affecting layout or extent.
-  /// Existing host and device contents are discarded before the new buffers are created.
-  void reserve(
-    size_t count,                                        ///< size of tensor in elements
-    bool device_backed_ = true) {                        ///< if true, device memory is also allocated
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-    CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve(count=" << count << ", device_backed_=" << (device_backed_ ? "true" : "false") << ")");
-#endif
-    device_.reset();
-    host_.clear();
-    size_t count_container = count_to_container_storage_unit_count(count);
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-    CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: host_.resize(" << count_container << ")");
-#endif
-    host_.resize(count_container);
-    // Allocate memory
-    StorageUnit* device_memory = nullptr;
-    if (device_backed_) {
-#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
-      CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: device_memory::allocate(" << count_container << ")");
-#endif
-      device_memory = device_memory::allocate<StorageUnit>(count_container);
-    }
-    device_.reset(device_memory, device_backed_ ? count_container : 0);
-  }
-  /// Updates the extent and layout of the HostTensor. Allocates memory according to the new
-  /// extent and layout.
-  void reset(
-    TensorCoord const &extent,                           ///< extent of logical tensor
-    Layout const &layout,                                ///< layout object of tensor
-    bool device_backed_ = true) {                        ///< if true, device memory is also allocated.
-    extent_ = extent;
-    layout_ = layout;
-    reserve(size_t(layout_.capacity(extent_)), device_backed_);
-  }
-  /// Updates the extent and layout of the HostTensor. Allocates memory according to the new
-  /// extent and layout. Assumes a packed tensor configuration.
-  void reset(
-    TensorCoord const &extent,                           ///< extent of logical tensor
-    bool device_backed_ = true) {                        ///< if true, device memory is also allocated.
-    reset(extent, Layout::packed(extent), device_backed_);
-  }
-  /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity.
-  /// To force allocation, call reset().
-  void resize(
-    TensorCoord const &extent,                           ///< extent of logical tensor
-    Layout const &layout,                                ///< layout object of tensor
-    bool device_backed_ = true) {                        ///< if true, device memory is also allocated.
-    extent_ = extent;
-    layout_ = layout;
-    LongIndex new_size = size_t(layout_.capacity(extent_));
-    LongIndex new_size_container = count_to_container_storage_unit_count((layout_.capacity(extent_)));
-    // Compare in storage units, since that is what host_ holds.
-    if (static_cast<decltype(host_.size())>(new_size_container) > host_.size()) {
-      reserve(new_size, device_backed_);
-    }
-  }
-  /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity.
-  /// To force allocation, call reset(). Note, this form of resize() assumes a packed tensor configuration.
-  void resize(
-    TensorCoord const &extent,                           ///< extent of logical tensor
-    bool device_backed_ = true) {                        ///< if true, device memory is also allocated.
-    resize(extent, Layout::packed(extent), device_backed_);
-  }
-  /// Returns the logical number of elements stored in the host tensor
-  size_t size() const {
-    return layout_.capacity(extent_);
-  }
-  /// Returns the logical capacity in terms of number of elements. May be larger than the size().
-  LongIndex capacity() const {
-    return host_.size() / kContainerTypeNumStorageUnit * kContainerTypeNumLogicalElements;
-  }
-  /// Gets pointer to host data
-  Element * host_data() { return reinterpret_cast<Element *>(host_.data()); }
-  /// Gets pointer to host data with a pointer offset
-  Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory<Element>::get(host_data(), ptr_element_offset); }
-  /// Gets a reference to an element in host memory
-  Reference host_data(LongIndex idx) {
-    return ReferenceFactory<Element>::get(host_data(), idx);
-  }
-  /// Gets pointer to host data
-  Element const * host_data() const { return reinterpret_cast<Element const *>(host_.data()); }
-  /// Gets pointer to host data with a pointer offset
-  Element const * host_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory<Element>::get(host_data(), ptr_element_offset); }
-  /// Gets a constant reference to an element in host memory
-  ConstReference host_data(LongIndex idx) const {
-    return ReferenceFactory<Element const>::get(host_data(), idx);
-  }
-  /// Gets pointer to device data
-  Element * device_data() { return reinterpret_cast<Element *>(device_.get()); }
-  /// Gets pointer to device data
-  Element const * device_data() const { return reinterpret_cast<Element const *>(device_.get()); }
-  /// Gets pointer to device data with a pointer offset
-  Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory<Element>::get(device_data(), ptr_element_offset); }
-  /// Gets pointer to device data with a pointer offset
-  Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory<Element>::get(device_data(), ptr_element_offset); }
-  /// Accesses the tensor reference pointing to host data
-  TensorRef host_ref(LongIndex ptr_element_offset=0) { return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_); }
-  /// Accesses the constant tensor reference pointing to host data
-  ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const { return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_); }
-  /// Accesses the tensor reference pointing to device data
-  TensorRef device_ref(LongIndex ptr_element_offset=0) {
-    return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_);
-  }
-  /// Accesses the constant tensor reference pointing to device data
-  ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const {
-    // The const overload of device_data_ptr_offset() yields `Element const *`, so build the
-    // const reference type directly, mirroring host_ref() const.
-    return ConstTensorRef(device_data_ptr_offset(ptr_element_offset), layout_);
-  }
-  /// Accesses the tensor view pointing to host data
-  TensorView host_view(LongIndex ptr_element_offset=0) {
-    return TensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_);
-  }
-  /// Accesses the constant tensor view pointing to host data
-  ConstTensorView host_view(LongIndex ptr_element_offset=0) const {
-    return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_);
-  }
-  /// Accesses the tensor view pointing to device data
-  TensorView device_view(LongIndex ptr_element_offset=0) {
-    return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_);
-  }
-  /// Accesses the constant tensor view pointing to device data
-  ConstTensorView device_view(LongIndex ptr_element_offset=0) const {
-    return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_);
-  }
-  /// Returns true if device memory is allocated
-  bool device_backed() const {
-    return (device_.get() == nullptr) ? false : true;
-  }
-  /// Returns the layout object
-  Layout & layout() {
-    return layout_;
-  }
-  /// Returns the layout object
-  Layout layout() const {
-    return layout_;
-  }
-  /// Returns the layout object's stride vector
-  Stride stride() const {
-    return layout_.stride();
-  }
-  /// Returns the layout object's stride vector
-  Stride & stride() {
-    return layout_.stride();
-  }
-  /// Returns the layout object's stride in a given physical dimension
-  LongIndex stride(int dim) const {
-    return layout_.stride().at(dim);
-  }
-  /// Returns the layout object's stride in a given physical dimension
-  LongIndex & stride(int dim) {
-    return layout_.stride().at(dim);
-  }
-  /// Computes the offset of an index from the origin of the tensor
-  LongIndex offset(TensorCoord const& coord) const {
-    return layout_(coord);
-  }
-  /// Returns a reference to the element at the logical Coord in host memory
-  Reference at(TensorCoord const& coord) {
-    return host_data(offset(coord));
-  }
-  /// Returns a const reference to the element at the logical Coord in host memory
-  ConstReference at(TensorCoord const& coord) const {
-    return host_data(offset(coord));
-  }
-  /// Returns the extent of the tensor
-  TensorCoord extent() const {
-    return extent_;
-  }
-  /// Returns the extent of the tensor
-  TensorCoord & extent() {
-    return extent_;
-  }
-  /// Copies data from device to host. Does nothing when no device memory is allocated.
-  void sync_host() {
-    if (device_backed()) {
-      device_memory::copy_to_host(
-          host_.data(), device_.get(), device_.size());
-    }
-  }
-  /// Copies data from host to device. Does nothing when no device memory is allocated.
-  void sync_device() {
-    if (device_backed()) {
-      device_memory::copy_to_device(
-          device_.get(), host_.data(), host_.size());
-    }
-  }
-  /// Copies data from a caller-supplied device pointer into this tensor's host memory.
-  void copy_in_device_to_host(
-    Element const* ptr_device,        ///< source device memory
-    LongIndex count = -1) {           ///< number of elements to transfer, clamped to capacity(); if negative, capacity() elements are transferred.
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    size_t container_count = count_to_container_storage_unit_count(count);
-    device_memory::copy_to_host(
-      host_.data(), reinterpret_cast<StorageUnit const *>(ptr_device), container_count);
-  }
-  /// Copies data from a caller-supplied device pointer into this tensor's device memory.
-  void copy_in_device_to_device(
-    Element const* ptr_device,        ///< source device memory
-    LongIndex count = -1) {           ///< number of elements to transfer, clamped to capacity(); if negative, capacity() elements are transferred.
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    size_t container_count = count_to_container_storage_unit_count(count);
-    device_memory::copy_device_to_device(
-      device_.get(), reinterpret_cast<StorageUnit const *>(ptr_device), container_count);
-  }
-  /// Copies data from a caller-supplied host pointer into this tensor's device memory.
-  void copy_in_host_to_device(
-    Element const* ptr_host,          ///< source host memory
-    LongIndex count = -1) {           ///< number of elements to transfer, clamped to capacity(); if negative, capacity() elements are transferred.
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    size_t container_count = count_to_container_storage_unit_count(count);
-    device_memory::copy_to_device(
-      device_.get(), reinterpret_cast<StorageUnit const *>(ptr_host), container_count);
-  }
-  /// Copies data from a caller-supplied host pointer into this tensor's host memory.
-  void copy_in_host_to_host(
-    Element const* ptr_host,          ///< source host memory
-    LongIndex count = -1) {           ///< number of elements to transfer, clamped to capacity(); if negative, capacity() elements are transferred.
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    size_t container_count = count_to_container_storage_unit_count(count);
-    device_memory::copy_host_to_host(
-      host_.data(), reinterpret_cast<StorageUnit const *>(ptr_host), container_count);
-  }
-  /// Copies this tensor's device memory out to a caller-supplied host pointer.
-  void copy_out_device_to_host(
-    Element * ptr_host,               ///< destination host memory
-    LongIndex count = -1) const {     ///< number of elements to transfer, clamped to capacity(); if negative, capacity() elements are transferred.
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    size_t container_count = count_to_container_storage_unit_count(count);
-    device_memory::copy_to_host(
-      reinterpret_cast<StorageUnit *>(ptr_host), device_.get(), container_count);
-  }
-  /// Copies this tensor's device memory out to a caller-supplied device pointer.
-  void copy_out_device_to_device(
-    Element * ptr_device,             ///< destination device memory
-    LongIndex count = -1) const {     ///< number of elements to transfer, clamped to capacity(); if negative, capacity() elements are transferred.
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    size_t container_count = count_to_container_storage_unit_count(count);
-    device_memory::copy_device_to_device(
-      reinterpret_cast<StorageUnit *>(ptr_device), device_.get(), container_count);
-  }
-  /// Copies this tensor's host memory out to a caller-supplied device pointer.
-  void copy_out_host_to_device(
-    Element * ptr_device,             ///< destination device memory
-    LongIndex count = -1) const {     ///< number of elements to transfer, clamped to capacity(); if negative, capacity() elements are transferred.
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    size_t container_count = count_to_container_storage_unit_count(count);
-    device_memory::copy_to_device(
-      reinterpret_cast<StorageUnit *>(ptr_device), host_.data(), container_count);
-  }
-  /// Copies this tensor's host memory out to a caller-supplied host pointer.
-  void copy_out_host_to_host(
-    Element * ptr_host,               ///< destination host memory
-    LongIndex count = -1) const {     ///< number of elements to transfer, clamped to capacity(); if negative, capacity() elements are transferred.
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    size_t container_count = count_to_container_storage_unit_count(count);
-    device_memory::copy_host_to_host(
-      reinterpret_cast<StorageUnit *>(ptr_host), host_.data(), container_count);
-  }
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_tensor_planar_complex.h DELETED Viewed

@@ -1,591 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-/*! \file
-  \brief HostTensorPlanarComplex provides management for both host and device memory.
-  HostTensor allocates host and device memory upon construction. Basic element-wise operations on
-  host memory synchronize device memory automatically. Explicit copy operations provide abstractions
-  for CUDA memcpy operations.
-  Call {host, device}_{data, ref, view}() for accessing host or device memory.
-  See cutlass/tensor_ref.h and cutlass/tensor_view.h for more details.
-*/
-#include <vector>
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_ref_planar_complex.h"
-#include "cutlass/tensor_view_planar_complex.h"
-#include "device_memory.h"
-namespace cutlass {
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host tensor for planar complex data: owns a host std::vector and (optionally) a device allocation, each laid out as a real plane followed by an imaginary plane.
-template <
-  /// Data type of element stored within tensor (concept: NumericType)
-  typename Element_,
-  /// Defines a mapping from logical coordinate to linear memory (concept: Layout)
-  typename Layout_
->
-class HostTensorPlanarComplex {
-public:
-  /// Data type of individual access
-  using Element = Element_;
-  /// Mapping function from logical coordinate to linear memory
-  using Layout = Layout_;
-  /// Logical rank of tensor index space
-  static int const kRank = Layout::kRank;
-  /// Index type
-  using Index = typename Layout::Index;
-  /// Long index used for pointer offsets
-  using LongIndex = typename Layout::LongIndex;
-  /// Coordinate in logical tensor space
-  using TensorCoord = typename Layout::TensorCoord;
-  /// Layout's stride vector
-  using Stride = typename Layout::Stride;
-  /// Planar complex tensor reference (returned for both host and device memory)
-  using TensorRef = TensorRefPlanarComplex<Element, Layout>;
-  /// Planar complex tensor reference to constant data
-  using ConstTensorRef = typename TensorRef::ConstTensorRef;
-  /// Planar complex tensor view (returned for both host and device memory)
-  using TensorView = TensorViewPlanarComplex<Element, Layout>;
-  /// Planar complex tensor view of constant data
-  using ConstTensorView = typename TensorView::ConstTensorView;
-  /// Reference to element in tensor
-  using Reference = typename TensorRef::Reference;
-  /// Constant reference to element in tensor
-  using ConstReference = typename ConstTensorRef::Reference;
- private:
-  //
-  // Data members
-  //
-  /// Extent of tensor in logical dimensions
-  TensorCoord extent_;
-  /// Layout object
-  Layout layout_;
-  /// Host-side memory allocation; reserve() sizes it to 2 * count Elements (real plane, then imaginary plane)
-  std::vector<Element> host_;
-  /// Device-side memory; reset to a null pointer with size 0 when the tensor is not device backed
-  device_memory::allocation<Element> device_;
- public:
-  //
-  // Device and Host Methods
-  //
-  /// Default constructor
-  HostTensorPlanarComplex() {}
-  /// Constructs a tensor given an extent. Assumes a packed layout; device memory is allocated only if device_backed is true
-  HostTensorPlanarComplex(
-    TensorCoord const &extent,
-    bool device_backed = true
-  ) {
-    this->reset(extent, Layout::packed(extent), device_backed);
-  }
-  /// Constructs a tensor given an extent and layout; device memory is allocated only if device_backed is true
-  HostTensorPlanarComplex(
-    TensorCoord const &extent,
-    Layout const &layout,
-    bool device_backed = true
-  ) {
-    this->reset(extent, layout, device_backed);
-  }
-  ~HostTensorPlanarComplex() { }
-  /// Clears the HostTensor allocation to size/capacity = 0
-  void reset() {
-    extent_ = TensorCoord();
-    layout_ = Layout::packed(extent_);
-    host_.clear();
-    device_.reset();
-  }
-  /// Resizes internal memory allocations without affecting layout or extent. Previous host and device contents are discarded.
-  void reserve(
-    size_t count,                                        ///< size of tensor in complex elements; 2 * count Elements are allocated
-    bool device_backed_ = true) {                        ///< if true, device memory is also allocated
-    device_.reset();
-    host_.clear();
-    host_.resize(count * 2);
-    // Allocate device memory for both planes (left null when not device backed)
-    Element* device_memory = nullptr;
-    if (device_backed_) {
-      device_memory = device_memory::allocate<Element>(count * 2);
-    }
-    device_.reset(device_memory, device_backed_ ? count * 2 : 0);
-  }
-  /// Updates the extent and layout of the HostTensor. Allocates memory according to the new
-  /// extent and layout.
-  void reset(
-    TensorCoord const &extent,                           ///< extent of logical tensor
-    Layout const &layout,                                ///< layout object of tensor
-    bool device_backed_ = true) {                        ///< if true, device memory is also allocated.
-    extent_ = extent;
-    layout_ = layout;
-    reserve(size_t(layout_.capacity(extent_)), device_backed_);
-  }
-  /// Updates the extent and layout of the HostTensor. Allocates memory according to the new
-  /// extent and layout. Assumes a packed tensor configuration.
-  void reset(
-    TensorCoord const &extent,                           ///< extent of logical tensor
-    bool device_backed_ = true) {                        ///< if true, device memory is also allocated.
-    reset(extent, Layout::packed(extent), device_backed_);
-  }
-  /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity.
-  /// To force allocation, call reset().
-  void resize(
-    TensorCoord const &extent,                           ///< extent of logical tensor
-    Layout const &layout,                                ///< layout object of tensor
-    bool device_backed_ = true) {                        ///< not read by this overload's body; see the note on the reserve() call below
-    extent_ = extent;
-    layout_ = layout;
-    LongIndex new_size = size_t(layout_.capacity(extent_));
-    if (static_cast<decltype(host_.size())>(new_size * 2) > host_.size()) {
-      reserve(new_size);  // NOTE(review): device_backed_ is not forwarded, so reserve() uses its default (true) - confirm intended
-    }
-  }
-  /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity.
-  /// To force allocation, call reset(). Note, this form of resize() assumes a packed tensor configuration.
-  void resize(
-    TensorCoord const &extent,                           ///< extent of logical tensor
-    bool device_backed_ = true) {                        ///< forwarded to the layout-taking resize() overload
-    resize(extent, Layout::packed(extent), device_backed_);
-  }
-  /// Returns the number of complex elements the host allocation holds (half of the underlying Element count)
-  size_t size() const {
-    return host_.size() / 2;
-  }
-  /// Returns the logical capacity based on extent and layout. May differ from size().
-  LongIndex capacity() const {
-    return layout_.capacity(extent_);
-  }
-  /// Stride between real and imaginary parts, in Elements (half of the host allocation size)
-  LongIndex imaginary_stride() const {
-    return host_.size() / 2;
-  }
-  /// Gets pointer to host data
-  Element * host_data() { return host_.data(); }
-  /// Gets pointer to host data imaginary part
-  Element * host_data_imag() { return host_.data() + imaginary_stride(); }
-  /// Gets pointer to host data with a pointer offset
-  Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return host_data() + ptr_element_offset; }
-  /// Gets pointer to host data imaginary part with a pointer offset
-  Element * host_data_imag_ptr_offset(LongIndex ptr_element_offset) { return host_data_imag() + ptr_element_offset; }
-  /// Gets a reference to an element in host memory
-  Reference host_data(LongIndex idx) {
-    return PlanarComplexReference<Element>(host_data() + idx, host_data_imag() + idx);
-  }
-  /// Gets pointer to host data
-  Element const * host_data() const { return host_.data(); }
-  /// Gets pointer to host data imaginary part
-  Element const * host_data_imag() const { return host_.data() + imaginary_stride(); }
-  /// Gets a constant reference to an element in host memory
-  ConstReference host_data(LongIndex idx) const {
-    return PlanarComplexReference<Element const>(host_data() + idx, host_data_imag() + idx);
-  }
-  /// Gets pointer to device data
-  Element * device_data() { return device_.get(); }
-  /// Gets pointer to device data with a pointer offset
-  Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return device_.get() + ptr_element_offset; }
-  /// Gets pointer to device data
-  Element const * device_data() const { return device_.get(); }
-  /// Gets pointer to device data with a pointer offset
-  Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return device_.get() + ptr_element_offset; }
-  /// Gets a pointer to the device data imaginary part (no const overload is declared in this class)
-  Element * device_data_imag() { return device_.get() + imaginary_stride(); }
-  /// Returns a planar complex tensor reference to host data
-  TensorRef host_ref(LongIndex ptr_element_offset=0) {
-    return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
-  }
-  /// Returns a tensor reference to the real part of the tensor in host memory
-  cutlass::TensorRef<Element, Layout> host_ref_real() {
-    return cutlass::TensorRef<Element, Layout>(host_data(), layout_);
-  }
-  /// Returns a tensor reference to the imaginary part of the tensor in host memory
-  cutlass::TensorRef<Element, Layout> host_ref_imag() {
-    return cutlass::TensorRef<Element, Layout>(host_data_ptr_offset(imaginary_stride()), layout_);
-  }
-  /// Returns a constant planar complex tensor reference to host data. NOTE(review): host_data_ptr_offset() has no const overload here - confirm this compiles when instantiated
-  ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const {
-    return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
-  }
-  /// Returns a planar complex tensor reference to device data
-  TensorRef device_ref(LongIndex ptr_element_offset=0) {
-    return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
-  }
-  /// Returns a constant planar complex tensor reference to device data. NOTE(review): the body builds a TensorRef (not ConstTensorRef) from a pointer-to-const - confirm the conversion is valid
-  ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const {
-    return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride());
-  }
-  /// Returns a tensor reference to the real part of the tensor in device memory
-  cutlass::TensorRef<Element, Layout> device_ref_real() {
-    return cutlass::TensorRef<Element, Layout>(device_data(), layout_);
-  }
-  /// Returns a tensor reference to the imaginary part of the tensor in device memory
-  cutlass::TensorRef<Element, Layout> device_ref_imag() {
-    return cutlass::TensorRef<Element, Layout>(device_data_ptr_offset(imaginary_stride()), layout_);
-  }
-  /// Returns a planar complex tensor view of host data
-  TensorView host_view(LongIndex ptr_element_offset=0) {
-    return TensorView(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride(), extent_);
-  }
-  /// Returns a constant planar complex tensor view of host data. NOTE(review): host_data_ptr_offset() has no const overload here - confirm this compiles when instantiated
-  ConstTensorView host_view(LongIndex ptr_element_offset=0) const {
-    return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride(), extent_);
-  }
-  /// Returns a tensor view of the real part of the tensor in host memory
-  cutlass::TensorView<Element, Layout> host_view_real() {
-    return cutlass::TensorView<Element, Layout>(host_data(), layout_, extent_);
-  }
-  /// Returns a tensor view of the imaginary part of the tensor in host memory
-  cutlass::TensorView<Element, Layout> host_view_imag() {
-    return cutlass::TensorView<Element, Layout>(host_data_ptr_offset(imaginary_stride()), layout_, extent_);
-  }
-  /// Returns a planar complex tensor view of device data
-  TensorView device_view(LongIndex ptr_element_offset=0) {
-    return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride(), extent_);
-  }
-  /// Returns a constant planar complex tensor view of device data
-  ConstTensorView device_view(LongIndex ptr_element_offset=0) const {
-    return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, imaginary_stride(), extent_);
-  }
-  /// Returns a tensor view of the real part of the tensor in device memory
-  cutlass::TensorView<Element, Layout> device_view_real() {
-    return cutlass::TensorView<Element, Layout>(device_data(), layout_, extent_);
-  }
-  /// Returns a tensor view of the imaginary part of the tensor in device memory
-  cutlass::TensorView<Element, Layout> device_view_imag() {
-    return cutlass::TensorView<Element, Layout>(device_data_ptr_offset(imaginary_stride()), layout_, extent_);
-  }
-  /// Returns true if device memory is allocated
-  bool device_backed() const {
-    return (device_.get() == nullptr) ? false : true;
-  }
-  /// Returns the layout object
-  Layout layout() const {
-    return layout_;
-  }
-  /// Returns the layout object's stride vector
-  Stride stride() const {
-    return layout_.stride();
-  }
-  /// Returns the layout object's stride in a given physical dimension
-  Index stride(int dim) const {
-    return layout_.stride().at(dim);
-  }
-  /// Computes the offset of an index from the origin of the tensor
-  LongIndex offset(TensorCoord const& coord) const {
-    return layout_(coord);
-  }
-  /// Returns a reference to the element at the logical Coord in host memory
-  Reference at(TensorCoord const& coord) {
-    return host_data(offset(coord));
-  }
-  /// Returns a const reference to the element at the logical Coord in host memory
-  ConstReference at(TensorCoord const& coord) const {
-    return host_data(offset(coord));
-  }
-  /// Returns the extent of the tensor
-  TensorCoord extent() const {
-    return extent_;
-  }
-  /// Returns a mutable reference to the extent of the tensor
-  TensorCoord & extent() {
-    return extent_;
-  }
-  /// Copies both planes (2 * imaginary_stride() Elements) from device to host; does nothing when not device backed
-  void sync_host() {
-    if (device_backed()) {
-      device_memory::copy_to_host(
-          host_data(), device_data(), imaginary_stride() * 2);
-    }
-  }
-  /// Copies both planes (2 * imaginary_stride() Elements) from host to device; does nothing when not device backed
-  void sync_device() {
-    if (device_backed()) {
-      device_memory::copy_to_device(
-          device_data(), host_data(), imaginary_stride() * 2);
-    }
-  }
-  /// Copy real and imaginary data from caller-supplied device pointers into this tensor's host memory.
-  void copy_in_device_to_host(
-    Element const* ptr_device_real,   ///< source device memory (real part)
-    Element const* ptr_device_imag,   ///< source device memory (imaginary part)
-    LongIndex count = -1) {           ///< number of elements to transfer per part; if negative, capacity() is used, otherwise clamped to capacity().
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    device_memory::copy_to_host(
-      host_data(), ptr_device_real, count);
-    device_memory::copy_to_host(
-      host_data_imag(), ptr_device_imag, count);
-  }
-  /// Copy real and imaginary data from caller-supplied device pointers into this tensor's device memory.
-  void copy_in_device_to_device(
-    Element const* ptr_device_real,   ///< source device memory (real part)
-    Element const* ptr_device_imag,   ///< source device memory (imaginary part)
-    LongIndex count = -1) {           ///< number of elements to transfer per part; if negative, capacity() is used, otherwise clamped to capacity().
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    device_memory::copy_device_to_device(
-      device_data(), ptr_device_real, count);
-    device_memory::copy_device_to_device(
-      device_data_imag(), ptr_device_imag, count);
-  }
-  /// Copy real and imaginary data from caller-supplied host pointers into this tensor's device memory.
-  void copy_in_host_to_device(
-    Element const* ptr_host_real,      ///< source host memory (real part)
-    Element const* ptr_host_imag,      ///< source host memory (imaginary part)
-    LongIndex count = -1) {            ///< number of elements to transfer per part; if negative, capacity() is used, otherwise clamped to capacity().
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    device_memory::copy_to_device(
-      device_data(), ptr_host_real, count);
-    device_memory::copy_to_device(
-      device_data_imag(), ptr_host_imag, count);
-  }
-  /// Copy real and imaginary data from caller-supplied host pointers into this tensor's host memory.
-  void copy_in_host_to_host(
-    Element const* ptr_host_real,     ///< source host memory (real part)
-    Element const* ptr_host_imag,     ///< source host memory (imaginary part)
-    LongIndex count = -1) {           ///< number of elements to transfer per part; if negative, capacity() is used, otherwise clamped to capacity().
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    device_memory::copy_host_to_host(
-      host_data(), ptr_host_real, count);
-    device_memory::copy_host_to_host(
-      host_data_imag(), ptr_host_imag, count);
-  }
-  /// Copy real and imaginary data from this tensor's device memory out to caller-supplied host pointers.
-  void copy_out_device_to_host(
-    Element * ptr_host_real,           ///< destination host memory (real part)
-    Element * ptr_host_imag,           ///< destination host memory (imaginary part)
-    LongIndex count = -1) const {      ///< number of elements to transfer per part; if negative, capacity() is used, otherwise clamped to capacity().
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    device_memory::copy_to_host(
-      ptr_host_real, device_data(), count);
-    device_memory::copy_to_host(
-      ptr_host_imag, device_data_imag(), count);  // NOTE(review): device_data_imag() is non-const but this method is const - confirm this compiles when instantiated
-  }
-  /// Copy real and imaginary data from this tensor's device memory out to caller-supplied device pointers.
-  void copy_out_device_to_device(
-    Element * ptr_device_real,        ///< destination device memory (real part)
-    Element * ptr_device_imag,        ///< destination device memory (imaginary part)
-    LongIndex count = -1) const {     ///< number of elements to transfer per part; if negative, capacity() is used, otherwise clamped to capacity().
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    device_memory::copy_device_to_device(
-      ptr_device_real, device_data(), count);
-    device_memory::copy_device_to_device(
-      ptr_device_imag, device_data_imag(), count);  // NOTE(review): device_data_imag() is non-const but this method is const - confirm this compiles when instantiated
-  }
-  /// Copy real and imaginary data from this tensor's host memory out to caller-supplied device pointers.
-  void copy_out_host_to_device(
-    Element * ptr_device_real,        ///< destination device memory (real part)
-    Element * ptr_device_imag,        ///< destination device memory (imaginary part)
-    LongIndex count = -1) const {     ///< number of elements to transfer per part; if negative, capacity() is used, otherwise clamped to capacity().
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    device_memory::copy_to_device(
-      ptr_device_real, host_data(), count);
-    device_memory::copy_to_device(
-      ptr_device_imag, host_data_imag(), count);
-  }
-  /// Copy real and imaginary data from this tensor's host memory out to caller-supplied host pointers.
-  void copy_out_host_to_host(
-    Element * ptr_host_real,          ///< destination host memory (real part)
-    Element * ptr_host_imag,          ///< destination host memory (imaginary part)
-    LongIndex count = -1) const {     ///< number of elements to transfer per part; if negative, capacity() is used, otherwise clamped to capacity().
-    if (count < 0) {
-      count = capacity();
-    }
-    else {
-      count = __NV_STD_MIN(capacity(), count);
-    }
-    device_memory::copy_host_to_host(
-      ptr_host_real, host_data(), count);
-    device_memory::copy_host_to_host(
-      ptr_host_imag, host_data_imag(), count);
-  }
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/host_uncompress.h DELETED Viewed

@@ -1,157 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief uncompress sparse matrix from the host side
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/util/host_tensor.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/util/tensor_view_io.h"
-#include "cutlass/util/reference/host/gemm.h"
-namespace cutlass {
-// Expands a compressed sparse tensor core A matrix (tensor_a) using metadata (tensor_e) into uncompressed_tensor_a; positions not selected by the metadata are written as ElementA(0).
-template <typename ElementA, typename LayoutA, typename ElementE,
-          typename LayoutE>
-void uncompress(TensorRef<ElementA, LayoutA> uncompressed_tensor_a,
-                TensorRef<ElementA, LayoutA> tensor_a,
-                TensorRef<ElementE, LayoutE> tensor_e, int row, int col) {
-  // Number of uncompressed ElementA values described by one ElementE metadata word (256 bits of uncompressed data per word)
-  int DecompressedElementsPerElementE =
-      256 / cutlass::sizeof_bits<ElementA>::value;
-  // Number of uncompressed elements covered by each 4-bit metadata chunk
-  int step;
-  // Sparsity ratio a:b (1:2 or 2:4 or 4:8). NOTE(review): step, a and b stay uninitialized if sizeof_bits<ElementA> is not 4, 8, 16 or 32
-  int a, b;
-  if (cutlass::sizeof_bits<ElementA>::value == 4) {
-    step = 8;
-    a = 4;
-    b = 8;
-  } else if (cutlass::sizeof_bits<ElementA>::value == 8) {
-    step = 4;
-    a = 2;
-    b = 4;
-  } else if (cutlass::sizeof_bits<ElementA>::value == 16) {
-    step = 4;
-    a = 2;
-    b = 4;
-  } else if (cutlass::sizeof_bits<ElementA>::value == 32) {
-    step = 2;
-    a = 1;
-    b = 2;
-  }
-  int ElementsPerE = (cutlass::sizeof_bits<ElementA>::value == 4) ? 2 : 1;  // 4-bit elements are moved in pairs
-  for (int r = 0; r < row; ++r) {
-    for (int c = 0; c < (col / DecompressedElementsPerElementE); ++c) {
-      ElementE meta = tensor_e.at(MatrixCoord(r, c));
-      for (int i = 0; i < DecompressedElementsPerElementE; i += step) {
-        int e = (meta >> (i / step * 4)) & 0xf;  // 4-bit metadata chunk for this group of `step` elements
-        int idx0 = e & 0x3;                      // low 2 bits: position of the first kept element
-        int idx1 = e >> 2;                       // high 2 bits: position of the second kept element
-        if (a == 1) idx0 = idx0 / 2;             // 1:2 case: halve the index so it lands in the 2-element group
-        for (int ii = 0; ii < step; ii += ElementsPerE) {
-          int real_col =
-              c * DecompressedElementsPerElementE + i + ii;
-          int compressed_col = (real_col / b) * a;  // first compressed column of the group containing real_col
-          if (ii == (idx0 * ElementsPerE)) {
-            uncompressed_tensor_a.at(MatrixCoord(r, real_col)) =
-                tensor_a.at(MatrixCoord(r, compressed_col));
-            if (ElementsPerE == 2)
-              uncompressed_tensor_a.at(MatrixCoord(r, real_col + 1)) =
-                  tensor_a.at(MatrixCoord(r, compressed_col + 1));
-          } else if ((ii == (idx1 * ElementsPerE)) && (a != 1)) {  // second kept element; skipped for 1:2
-            uncompressed_tensor_a.at(MatrixCoord(r, real_col)) =
-                tensor_a.at(MatrixCoord(r, compressed_col + ElementsPerE));
-            if (ElementsPerE == 2)
-              uncompressed_tensor_a.at(MatrixCoord(r, real_col + 1)) =
-                  tensor_a.at(
-                      MatrixCoord(r, compressed_col + ElementsPerE + 1));
-          } else {  // position not selected by the metadata: fill with zero
-            uncompressed_tensor_a.at(MatrixCoord(r, real_col)) =
-                ElementA(0);
-            if (ElementsPerE == 2)
-              uncompressed_tensor_a.at(MatrixCoord(r, real_col + 1)) =
-                  ElementA(0);
-          }
-        }
-      }
-    }
-  }
-}
-// Expands an ELL block sparse matrix: for each block (r, c) whose ell_idx entry is not -1, copies the ell_blocksize x ell_blocksize block at block column c of tensor_a to block column idx of uncompressed_tensor_a. Other destination elements are not written.
-template <typename ElementA, typename LayoutA,
-          typename ElementE, typename LayoutE>
-void uncompress_ell_block_sparse(
-                TensorRef<ElementA, LayoutA> uncompressed_tensor_a,
-                TensorRef<ElementA, LayoutA> tensor_a,
-                TensorRef<ElementE, LayoutE> ell_idx,
-                int rows, int cols,  // cols is not read in this function body
-                int ell_num_cols, int ell_blocksize) {
-  for (int r = 0; r < rows / ell_blocksize; ++r) {
-    for (int c = 0; c < ell_num_cols / ell_blocksize; ++c) {
-      ElementE idx = ell_idx.at(MatrixCoord(r, c));
-      if (idx != -1) {  // entries equal to -1 are skipped
-        int row_begin = r * ell_blocksize;
-        int col_begin_real = idx * ell_blocksize;  // destination column in the uncompressed matrix
-        int col_begin = c * ell_blocksize;         // source column in the compressed matrix
-        for (int i = 0; i < ell_blocksize; ++i) {
-          for (int j = 0; j < ell_blocksize; ++j) {
-            uncompressed_tensor_a.at(MatrixCoord(row_begin + i, col_begin_real + j)) =
-                tensor_a.at(
-                    MatrixCoord(row_begin + i, col_begin +j));
-          }
-        }
-      }
-    }
-  }
-}
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/index_sequence.h DELETED Viewed

@@ -1,38 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/numeric_types.h"
-// integer_sequence moved to cutlass/numeric_types.h

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/mixed_dtype_utils.hpp DELETED Viewed

@@ -1,472 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Utilities for mixed input data type kernels.
-*/
-#pragma once
-#include <cuda.h>
-#include "cute/layout.hpp"
-#include "cute/tensor.hpp"
-#include "cute/arch/mma_sm90.hpp"
-#include "cutlass/cutlass.h"
-#include "cutlass/util/device_memory.h"
-#include "cutlass/util/reference/device/tensor_fill.h"
-#include "cute/util/type_traits.hpp"
-namespace cutlass {
-#define CUDA_CHECK(status)                                              \
-  {                                                                     \
-    cudaError_t error = status;                                         \
-    if (error != cudaSuccess) {                                         \
-      std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \
-                << " at line: " << __LINE__ << std::endl;               \
-      exit(EXIT_FAILURE);                                               \
-    }                                                                   \
-  }
-template <
-  class QuantizedElement,
-  class DequantizedElement,
-  class OperandLayout,
-  class ElementScale,
-  class ElementZero,
-  class ScaleBroadCastLayout,
-  class ThrLayout>
-__global__ void dequantize_kernel(DequantizedElement* dq_buffer,
-                                  QuantizedElement const* q_buffer,
-                                  OperandLayout const operand_layout,
-                                  ElementScale const* scale_buffer,
-                                  ElementZero const* zero_buffer,
-                                  ScaleBroadCastLayout const broadcasted_scale_layout,
-                                  ThrLayout thr_layout) {
-  using namespace cute;
-  // Represent the full tensors to gmem elements.
-  // These are expected to have shape [MN, K, L]
-  cute::Tensor gmem_op_dq = cute::make_tensor(cute::make_gmem_ptr(dq_buffer), operand_layout);
-  cute::Tensor gmem_op_q  = cute::make_tensor(cute::make_gmem_ptr<QuantizedElement const>(q_buffer), operand_layout);
-  // While the scales are expected to have shape [MN, G, L] but with a stride to allow broadcasting
-  // It is expected that K % G == 0
-  cute::Tensor gmem_scale_broadcasted = cute::make_tensor(make_gmem_ptr(scale_buffer), broadcasted_scale_layout);
-  cute::Tensor gmem_zero_broadcasted = cute::make_tensor(make_gmem_ptr(zero_buffer), broadcasted_scale_layout);
-  // Assign 1 thread per element in the thread block
-  auto blk_shape = cute::make_shape(size<0>(thr_layout), _1{}, _1{}); //
-  auto blk_coord = cute::make_coord(_, blockIdx.x, blockIdx.y);  // (MN, K, L)
-  // Tile across the block
-  auto gOp_dq = cute::local_tile(gmem_op_dq, blk_shape, blk_coord);
-  auto gScale = cute::local_tile(gmem_scale_broadcasted, blk_shape, blk_coord);
-  auto gZero  = cute::local_tile(gmem_zero_broadcasted,  blk_shape, blk_coord);
-  auto gOp_q  = cute::local_tile(gmem_op_q, blk_shape, blk_coord);
-  auto tOpDq_gOpDq = cute::local_partition(gOp_dq, thr_layout, threadIdx.x);
-  auto tScale_gScale = cute::local_partition(gScale, thr_layout, threadIdx.x);
-  auto tZero_gZero = cute::local_partition(gZero, thr_layout, threadIdx.x);
-  auto tOpQ_gOpQ = cute::local_partition(gOp_q, thr_layout, threadIdx.x);
-  // Make a fragment of registers to hold gmem loads
-  cute::Tensor rmem_op_q = cute::make_fragment_like(tOpQ_gOpQ(_, _, _, 0));
-  cute::Tensor rmem_scale = cute::make_fragment_like(tScale_gScale(_, _, _, 0));
-  cute::Tensor rmem_zero = cute::make_fragment_like(tZero_gZero(_, _, _, 0));
-  cute::Tensor rmem_op_dq = cute::make_fragment_like(tOpDq_gOpDq(_, _, _, 0));
-  cute::Tensor rmem_op_scaled = cute::make_fragment_like<ElementScale>(rmem_op_dq);
-  cute::Tensor rmem_zero_buf = cute::make_fragment_like<ElementScale>(rmem_zero);
-  cute::Tensor pred_id = cute::make_identity_tensor(shape(operand_layout));
-  auto pred_blk_tile = cute::local_tile(pred_id, blk_shape, blk_coord);
-  auto pred_thr_partition = cute::local_partition(pred_blk_tile, thr_layout, threadIdx.x);
-  const auto num_iters = cute::size<3>(tOpDq_gOpDq);
-  for (int ii = 0; ii < num_iters; ++ii) {
-    const auto thread_offset = cute::get<0>(pred_thr_partition(0, 0, 0, ii));
-    if (thread_offset < cute::size<0>(operand_layout)) {
-      cute::copy(tOpQ_gOpQ(_, _, _, ii), rmem_op_q);
-      cute::copy(tScale_gScale(_, _, _, ii), rmem_scale);
-      cute::copy(tZero_gZero(_, _, _, ii), rmem_zero);
-      cute::transform(rmem_op_q, rmem_op_scaled, [] (const QuantizedElement& elt) { return ElementScale(elt); } );
-      cute::transform(rmem_zero, rmem_zero_buf, [] (const ElementZero& elt) { return ElementScale(elt); } );
-      cute::transform(rmem_op_scaled, rmem_scale, rmem_op_scaled, cute::multiplies{});
-      cute::transform(rmem_op_scaled, rmem_zero_buf, rmem_op_scaled, cute::plus{});
-      cute::transform(rmem_op_scaled, rmem_op_dq, [] (const ElementScale& elt) { return DequantizedElement(elt); } );
-      cute::copy(rmem_op_dq, tOpDq_gOpDq(_, _, _, ii));
-    }
-  }
-}
-template <
-  class QuantizedElement,
-  class DequantizedElement,
-  class OperandLayout,
-  class ElementScale,
-  class ElementZero,
-  class ScaleLayout>
-static void dequantize(DequantizedElement* dq_buffer,
-                       QuantizedElement const* q_buffer,
-                       OperandLayout const operand_layout,
-                       ElementScale const* scale_buffer,
-                       ElementZero const* zero_buffer,
-                       ScaleLayout const scale_layout,
-                       int const group_size,
-                       cudaStream_t &stream) {
-  using namespace cute;
-  constexpr int tpb = 128;
-  auto thr_layout = make_layout(make_shape(Int<tpb>{}));
-  const auto num_rows = get<0>(shape(operand_layout));
-  const auto gemm_k = get<1>(shape(operand_layout));   // [MN, K, L]
-  const auto batches = get<2>(shape(operand_layout));  // [MN, K, L]
-  const auto scale_k = get<1>(shape(scale_layout));    // [MN, Scale_K, L]
-  if (num_rows != size<0>(scale_layout)) {
-    std::cerr << "Invalid first dimension for scales. Must match first dim for weights."
-              << " But got shapes " << shape(operand_layout) << " " << shape(scale_layout)
-              << std::endl;
-    exit(-1);
-  }
-  const auto scale_stride0 = get<0>(stride(scale_layout));
-  const auto scale_stride1 = get<1>(stride(scale_layout));
-  const auto scale_stride2 = get<2>(stride(scale_layout));
-  auto scale_shape_bcast = make_shape(num_rows, make_shape(group_size, scale_k), batches);
-  auto scale_stride_bcast = make_stride(scale_stride0, make_stride(0, scale_stride1), scale_stride2);
-  auto scale_layout_bcast = make_layout(scale_shape_bcast, scale_stride_bcast);
-  const auto blocks_x = gemm_k;
-  const auto blocks_y = batches;
-  dim3 blocks(blocks_x, blocks_y, 1);
-  dequantize_kernel<<<blocks, tpb, 0, stream>>>(dq_buffer, q_buffer, operand_layout, scale_buffer, zero_buffer, scale_layout_bcast, thr_layout);
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-}
-template <typename T>
-class packed_scale_t {
-public:
-  static_assert(cute::is_same_v<T, cutlass::int8_t> ||
-                cute::is_same_v<T, cutlass::uint8_t> ||
-                cute::is_same_v<T, cutlass::float_e4m3_t> ||
-                cute::is_same_v<T, cutlass::float_e5m2_t>,
-                "only 8 bit arithmetic types are supported.");
-  CUTLASS_HOST_DEVICE
-  explicit packed_scale_t(T val) {
-    if constexpr (!cute::is_unsigned_v<T>) {
-      // Only pack negative values. The positive values are generated in flight in the mainloop.
-      storage[0] = pack4(T(float(val) * -8.f), T(float(val) * -7.f), T(float(val) * -6.f), T(float(val) * -5.f));
-      storage[1] = pack4(T(float(val) * -4.f), T(float(val) * -3.f), T(float(val) * -2.f), -val);
-    }
-    else {
-      storage[0] = pack4(T(float(val) * 8.f), T(float(val) * 7.f), T(float(val) * 6.f), T(float(val) * 5.f));
-      storage[1] = pack4(T(float(val) * 4.f), T(float(val) * 3.f), T(float(val) * 2.f), val);
-    }
-  }
-  CUTLASS_HOST_DEVICE
-  packed_scale_t() = default;
-  CUTLASS_HOST_DEVICE
-  explicit operator float() const {
-    return float(get());
-  }
-  CUTLASS_HOST_DEVICE
-  bool operator==(packed_scale_t const& rhs) const {
-    return storage[0] == rhs.storage[0] && storage[1] == rhs.storage[1];
-  }
-  CUTLASS_HOST_DEVICE
-  bool operator!=(packed_scale_t const& rhs) const {
-    return !(*this == rhs);
-  }
-  CUTLASS_HOST_DEVICE
-  friend packed_scale_t operator+(packed_scale_t const& lhs, packed_scale_t const& rhs) {
-    return packed_scale_t(lhs.get() + rhs.get());
-  }
-  CUTLASS_HOST_DEVICE
-  friend packed_scale_t operator-(packed_scale_t const& lhs, packed_scale_t const& rhs) {
-    return packed_scale_t(lhs.get() - rhs.get());
-  }
-  CUTLASS_HOST_DEVICE
-  friend packed_scale_t operator*(packed_scale_t const& lhs, packed_scale_t const& rhs) {
-    return packed_scale_t(lhs.get() * rhs.get());
-  }
-  CUTLASS_HOST_DEVICE
-  friend packed_scale_t operator/(packed_scale_t const& lhs, packed_scale_t const& rhs) {
-    return packed_scale_t(lhs.get() / rhs.get());
-  }
-private:
-  using Storage = uint32_t;
-  using Stage = uint8_t;
-  Storage storage[2] {};
-  CUTLASS_HOST_DEVICE
-  static Storage pack4(T c1, T c2, T c3, T c4) {
-    Storage result = 0;
-    result |= (static_cast<Storage>(reinterpret_cast<Stage const&>(c4)) << 24);
-    result |= (static_cast<Storage>(reinterpret_cast<Stage const&>(c3)) << 16);
-    result |= (static_cast<Storage>(reinterpret_cast<Stage const&>(c2)) << 8);
-    result |= static_cast<Storage>(reinterpret_cast<Stage const&>(c1));
-    return result;
-  }
-  CUTLASS_HOST_DEVICE
-  T get() const {
-    auto stage = static_cast<Stage>(storage[0] >> 8);
-    #if defined(__CUDA_ARCH__)
-    return reinterpret_cast<T const&>(stage);
-    #else
-    T tmp;
-    std::memcpy(&tmp, &stage, sizeof(Stage));
-    return tmp;
-    #endif
-  }
-  CUTLASS_HOST_DEVICE
-  T get(int idx) const {
-    Stage stage;
-    if (idx < 4) stage = static_cast<Stage>(storage[0] >> (8 * idx));
-    else         stage = static_cast<Stage>(storage[1] >> (8 * idx - 32));
-    #if defined(__CUDA_ARCH__)
-    return reinterpret_cast<T const&>(stage);
-    #else
-    T tmp;
-    std::memcpy(&tmp, &stage, sizeof(Stage));
-    return tmp;
-    #endif
-  }
-};
-// In the mainloop, PRMT selects 1 byte from only 8 bytes so the sign bit is handled in an extra PRMT.
-// Here the encodings of positive values and negative values are unified (except for the sign bit).
-// For instance, 1 becomes 0b0111, which is the same encoding as -1 (0b1111).
-static bool unified_encode_int4b(cutlass::int4b_t const *block_in, cutlass::int4b_t *block_out, const size_t block_size) {
-  using StorageType = cutlass::int4b_t::Storage;
-  constexpr int pack = cute::sizeof_bits_v<StorageType> / 4;
-  const size_t host_buf_size = block_size / pack;
-  std::vector<StorageType> host_buf(host_buf_size);
-  cutlass::device_memory::copy_to_host(host_buf.data(), (StorageType *) block_in, host_buf_size);
-  for (auto&& d : host_buf) {
-    StorageType out = 0;
-    StorageType mask = 0x0f;
-    for (int i = 0; i < pack; i++) {
-      cutlass::int4b_t curr;
-      curr.storage = (d >> (i * 4)) & 0x0f;
-      switch (curr) {
-        case 1: curr.storage = StorageType(0b0111); break; // 2's complement
-        case 2: curr.storage = StorageType(0b0110); break; // 2's complement
-        case 3: curr.storage = StorageType(0b0101); break; // 2's complement
-        case 4: curr.storage = StorageType(0b0100); break; // 2's complement
-        case 5: curr.storage = StorageType(0b0011); break; // 2's complement
-        case 6: curr.storage = StorageType(0b0010); break; // 2's complement
-        case 7: curr.storage = StorageType(0b0001); break; // 2's complement
-        default: break;
-      }
-      out |= (curr.storage << (4 * i)) & mask;
-      mask <<= 4;
-    }
-    d = out;
-  }
-  cutlass::device_memory::copy_to_device((StorageType*) block_out, host_buf.data(), host_buf_size);
-  return true;
-}
-template <class ElementScale>
-static bool pack_scale_fp8(ElementScale const *block_in, cutlass::Array<ElementScale, 8> *block_out, const size_t block_size) {
-  std::vector<ElementScale> data_in(block_size);
-  std::vector<cutlass::Array<ElementScale, 8>> data_out(block_size);
-  try {
-    cutlass::device_memory::copy_to_host(data_in.data(), block_in, block_size);
-  }
-  catch (cutlass::cuda_exception const& e) {
-    std::cerr << "CUDA Error: " << cudaGetErrorString(e.cudaError()) << std::endl;
-    return false;
-  }
-  for (size_t i = 0; i < block_size; i++) {
-    cutlass::packed_scale_t<ElementScale> tmp(data_in[i]);
-    data_out[i] = reinterpret_cast<cutlass::Array<ElementScale, 8> const&>(tmp);
-  }
-  try {
-    cutlass::device_memory::copy_to_device(block_out, data_out.data(), block_size);
-  }
-  catch (cutlass::cuda_exception const& e) {
-    std::cerr << "CUDA Error: " << cudaGetErrorString(e.cudaError()) << std::endl;
-    return false;
-  }
-  return true;
-}
-template <class T, class = void>
-struct UnderlyingElement {
-  using type = T;
-};
-template <class T>
-struct UnderlyingElement<T, cute::void_t<typename T::Element>> {
-  using type = typename T::Element;
-};
-// Given a type of MMA instruction, compute a memory reordering atom that places all values
-// owned by each thread in contiguous memory locations. This improves smem load vectorization,
-// particularly for mixed dtype GEMMs where a narrow type is loaded in the thread/value order
-// of the wider type and may result in inefficient sub-bank (8-bit or 16-bit) accesses.
-// In addition, we can reorder the values across several MMA instructions to get even wider
-// vectorization (AtomLayout parameter) and permute the values within each instruction to get
-// more optimal conversion instruction sequences (ValLayout parameter).
-template <class ElementMma,
-         class AtomLayout = cute::Layout<cute::_1>,
-         class ValLayout  = cute::Layout<cute::_1>>
-constexpr auto compute_memory_reordering_atom(AtomLayout atom_layout = {}, ValLayout val_layout = {})
-{
-  using namespace cute;
-  static_assert(is_static_v<ValLayout>, "ValLayout must be static");
-  static_assert(is_static_v<AtomLayout>, "AtomLayout must be static");
-  // 1. Choose an MMA atom to access TV layout and MN shape
-  // Note: parameters like GMMA Major, TileShape, ElementC don't affect TV layout of A, use arbitrary
-  using MmaAtom = decltype(SM90::GMMA::rs_op_selector<ElementMma, ElementMma, float, Shape<_64,_16,_32>>());
-  using MmaTraits = MMA_Traits<MmaAtom>;
-  auto mk_shape_mma = select<0,2>(typename MmaTraits::Shape_MNK{});
-  auto tv_layout_mma = typename MmaTraits::ALayout{};
-  static_assert(size<1>(tv_layout_mma) % size(val_layout) == 0, "Value layout must evenly divide the MMA value layout");
-  // 2. Create a single warp's TV layout from that of the whole MMA and invert to get (m,k -> thr,val)
-  // Note: this assumes A is partitioned between warps along M mode
-  auto tv_tiler_warp = make_shape(Int<32>{}, size<1>(tv_layout_mma));
-  auto mk_shape_warp = shape_div(mk_shape_mma, size(typename MmaTraits::ThrID{}) / Int<32>{});
-  auto tv_layout_mma_warp = make_layout_like(composition(tv_layout_mma, tv_tiler_warp));
-  auto mk_layout_mma_warp = right_inverse(tv_layout_mma_warp).with_shape(mk_shape_warp);
-  // 3. Repeat the warp layout NumAtoms times along K mode to get wider vectorization
-  auto mk_layout_mma_trgt = blocked_product(mk_layout_mma_warp, atom_layout);
-  // 4. Compose with a contiguous layout of values in each thread (required for smem vectorization)
-  auto val_to_offset = logical_product(val_layout, size<1>(tv_layout_mma) / size(val_layout) * size(atom_layout));
-  auto thr_to_offset = make_layout(size<0>(tv_layout_mma_warp));
-  auto tv_to_offset = select<1,0>(logical_product(val_to_offset, thr_to_offset));
-  auto layout_atom = composition(tv_to_offset, mk_layout_mma_trgt);
-  return layout_atom;
-}
-template <class TileShape, class EngineSrc, class LayoutSrc, class EngineDst, class LayoutDst, class TiledCopy>
-__global__ void reorder_tensor_kernel(
-  cute::Tensor<EngineSrc, LayoutSrc> S,
-  cute::Tensor<EngineDst, LayoutDst> D,
-  TiledCopy tiled_copy)
-{
-  using namespace cute;
-  using T = typename EngineDst::value_type;
-  Tensor gS = local_tile(S, TileShape{}, make_coord(blockIdx.x, _, blockIdx.z));
-  Tensor gD = local_tile(D, TileShape{}, make_coord(blockIdx.x, _, blockIdx.z));
-  auto thread_copy = tiled_copy.get_slice(threadIdx.x);
-  Tensor tS = thread_copy.partition_S(gS);
-  Tensor tD = thread_copy.partition_D(gD);
-  copy(tiled_copy, tS, tD);
-}
-template <class EngineSrc, class LayoutSrc, class EngineDst, class LayoutDst>
-void reorder_tensor(
-  cute::Tensor<EngineSrc, LayoutSrc> S,
-  cute::Tensor<EngineDst, LayoutDst> D)
-{
-  using namespace cute;
-  using T = typename EngineDst::value_type;
-  static_assert(is_same_v<remove_const_t<typename EngineSrc::value_type>, T>, "Type mismatch");
-  // Construct a value layout that assigns at least 8 bits of contiguous elements in destination tensor to a thread
-  // This avoids a race condition when writing out subbyte types (e.g. int4b_t).
-  auto has_major_mode = [](auto s) {
-    return any_of(flatten(s), [](auto a){ return is_constant<1, decltype(a)>{}; });
-  };
-  static_assert(has_major_mode(stride<0>(LayoutDst{})) ^ has_major_mode(stride<1>(LayoutDst{})),
-                "Could not find stride-1 mode in destination layout");
-  constexpr int N = shape_div(Int<8>{}, Int<sizeof_bits_v<T>>{});
-  auto val_layout = conditional_return<has_major_mode(stride<0>(LayoutDst{}))>(
-    make_layout(make_shape(Int<N>{}, Int<1>{}), GenColMajor{}),
-    make_layout(make_shape(Int<1>{}, Int<N>{}), GenRowMajor{}));
-  // Make a tiled copy with a simple row-major thread order and above layout
-  int constexpr NumThreads = 128;
-  auto const thr_layout = make_layout(make_shape(Int<1>{}, Int<NumThreads>{}));
-  auto tiled_copy = make_tiled_copy(Copy_Atom<DefaultCopy, T>{}, thr_layout, val_layout);
-  // Assign a group of 16 rows to a threadblock; this matches the shuffle atom size for Hopper
-  using TileShape = Shape<_16>;
-  auto tiled_D = group_modes<3,rank_v<LayoutDst>>(tiled_divide(D, TileShape{}));
-  dim3 blocks{unsigned(size<1>(tiled_D)), 1u, unsigned(size<3>(tiled_D))};
-  reorder_tensor_kernel<TileShape><<<blocks, NumThreads>>>(S, D, tiled_copy);
-  CUDA_CHECK(cudaDeviceSynchronize());
-}
-// In-place version
-template <class T, class LayoutSrc, class LayoutDst>
-void reorder_tensor(
-  T const* src,
-  LayoutSrc const& layout_src,
-  T * dst,
-  LayoutDst const& layout_dst)
-{
-  using namespace cute;
-  reorder_tensor(make_tensor(make_gmem_ptr<T>(src), layout_src),
-                 make_tensor(make_gmem_ptr<T>(dst), layout_dst));
-}
-// In-place version
-template <class T, class LayoutSrc, class LayoutDst>
-void reorder_tensor(
-  T * data,
-  LayoutSrc const& layout_src,
-  LayoutDst const& layout_dst)
-{
-  using namespace cute;
-  cutlass::DeviceAllocation<T> temp(size(layout_src));
-  reorder_tensor(data, layout_src, temp.get(), layout_dst);
-  cutlass::device_memory::copy_device_to_device(data, temp.get(), static_cast<size_t>(size(layout_src)));
-}
-#undef CUDA_CHECK
-}  // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/packed_stride.hpp DELETED Viewed

@@ -1,570 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Utilities for packing constructing canonical CuTe stride types for 3.x mainloop params.
-*/
-#pragma once
-#include "cute/layout.hpp"
-#include "cute/container/array.hpp"   // cute::array
-#include "cutlass/conv/convolution.h" // cutlass::conv::Operator
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Strides without batch mode
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Int<1>>
-make_cute_packed_stride(cute::Stride<IntT, cute::Int<1>> s, cute::Shape<int,int,int> shape_MKL) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  auto s_copy = s;
-  cute::get<0>(s_copy) = static_cast<IntT>(cute::get<1>(shape_MKL));
-  return s_copy;
-}
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, IntT>
-make_cute_packed_stride(cute::Stride<cute::Int<1>, IntT> s, cute::Shape<int,int,int> shape_MKL) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  auto s_copy = s;
-  cute::get<1>(s_copy) = static_cast<IntT>(cute::get<0>(shape_MKL));
-  return s_copy;
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Strides with batch mode
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Int<1>, int64_t>
-make_cute_packed_stride(cute::Stride<IntT, cute::Int<1>, int64_t> s, cute::Shape<int,int,int> shape_MKL) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  auto s_copy = s;
-  cute::get<0>(s_copy) = static_cast<IntT>(cute::get<1>(shape_MKL));
-  int batch_count =  cute::get<2>(shape_MKL);
-  if (batch_count > 1) {
-    cute::get<2>(s_copy) = static_cast<IntT>(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL));
-  }
-  else {
-    cute::get<2>(s_copy) = static_cast<IntT>(0);
-  }
-  return s_copy;
-}
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, IntT, int64_t>
-make_cute_packed_stride(cute::Stride<cute::Int<1>, IntT, int64_t> s, cute::Shape<int,int,int> shape_MKL) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  auto s_copy = s;
-  cute::get<1>(s_copy) = static_cast<IntT>(cute::get<0>(shape_MKL));
-  int batch_count =  cute::get<2>(shape_MKL);
-  if (batch_count > 1) {
-    cute::get<2>(s_copy) = static_cast<IntT>(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL));
-  }
-  else {
-    cute::get<2>(s_copy) = static_cast<IntT>(0);
-  }
-  return s_copy;
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Strides with group mode
-template <class StrideIntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<StrideIntT, cute::Int<1>, cute::Int<0>>
-make_cute_packed_stride(cute::Stride<StrideIntT, cute::Int<1>, cute::Int<0>> s, cute::Shape<int,int,int> shape_MKL) {
-  static_assert(std::is_integral_v<StrideIntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  auto s_copy = s;
-  cute::get<0>(s_copy) = static_cast<StrideIntT>(cute::get<1>(shape_MKL));
-  return s_copy;
-}
-template <class StrideIntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, StrideIntT, cute::Int<0>>
-make_cute_packed_stride(cute::Stride<cute::Int<1>, StrideIntT, cute::Int<0>> s, cute::Shape<int,int,int> shape_MKL) {
-  static_assert(std::is_integral_v<StrideIntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  auto s_copy = s;
-  cute::get<1>(s_copy) = static_cast<StrideIntT>(cute::get<0>(shape_MKL));
-  return s_copy;
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Strides for convolutions
-// Output cutlass::layout::TensorNDHWC -> rank-3 stride (InT,_1,_0)
-// Note: For fprop/dgrad kernel, strides are assumed to be layout right in NZPQK/NDHWC order
-// and therefore can be coalesced to just q/w. For wgrad kernel, strides are assumed to be layout
-// right in KTRSC order and can be coalesced to just k.
-// We enforce this condition here with asserts.
-template <class IntT, size_t RankT_>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Int<1>, cute::Int<0>>
-make_cute_packed_stride(
-    cute::Stride<IntT, cute::Int<1>, cute::Int<0>> s,
-    cute::array<int32_t, RankT_> shape_output,
-    cute::array<IntT, RankT_> stride_output,
-    cutlass::conv::Operator conv_op) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  static_assert(RankT_ >= 3u);
-  constexpr static int RankT = static_cast<int>(RankT_);
-  assert(stride_output[RankT-1] == 1);
-  cute::for_each(cute::make_seq<RankT-2>{}, [&](auto i) {
-    assert(stride_output[i] == shape_output[i+1] * stride_output[i+1]);
-  });
-  auto s_copy = s;
-  cute::get<0>(s_copy) = (conv_op == cutlass::conv::Operator::kWgrad) ?
-      stride_output[0] :
-      stride_output[RankT-2];
-  return s_copy;
-}
-//
-// Activation tensor ((w, h, d, n), _1) for fprop kernel
-//
-// Activation cutlass::layout::TensorNWC -> rank-2 stride ((W,N),_1)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Stride<IntT, IntT>, cute::Int<1>>
-make_cute_packed_stride(
-    cute::Stride<cute::Stride<IntT, IntT>, cute::Int<1>> s,
-    cute::array<IntT, 3> stride_nwc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_nwc[2] == 1);
-  auto s_copy = s;
-  cute::get<0,0>(s_copy) = stride_nwc[1];
-  cute::get<0,1>(s_copy) = stride_nwc[0];
-  return s_copy;
-}
-// Activation cutlass::layout::TensorNHWC -> rank-2 stride ((W,H,N),_1)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Stride<IntT, IntT, IntT>, cute::Int<1>>
-make_cute_packed_stride(
-    cute::Stride<cute::Stride<IntT, IntT, IntT>, cute::Int<1>> s,
-    cute::array<IntT, 4> stride_nhwc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_nhwc[3] == 1);
-  auto s_copy = s;
-  cute::for_each(cute::make_seq<3>{}, [&](auto i) {
-    cute::get<0,i>(s_copy) = stride_nhwc[2-i];
-  });
-  return s_copy;
-}
-// Activation cutlass::layout::TensorNDHWC -> rank-2 stride ((W,H,D,N),_1)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Stride<IntT, IntT, IntT, IntT>, cute::Int<1>>
-make_cute_packed_stride(
-    cute::Stride<cute::Stride<IntT, IntT, IntT, IntT>, cute::Int<1>> s,
-    cute::array<IntT, 5> stride_ndhwc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_ndhwc[4] == 1);
-  auto s_copy = s;
-  cute::for_each(cute::make_seq<4>{}, [&](auto i) {
-    cute::get<0,i>(s_copy) = stride_ndhwc[3-i];
-  });
-  return s_copy;
-}
-//
-// Filter tensor (k, (_1, s, r, t)) for fprop kernel
-//
-// Filter cutlass::layout::TensorNWC -> rank-2 stride (k, (_1, s))
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT>>
-make_cute_packed_stride(
-    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT>> s,
-    cute::array<IntT, 3> stride_ksc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_ksc[2] == 1);
-  auto s_copy = s;
-  cute::get<0,0>(s_copy) = stride_ksc[0];
-  cute::get<1,1>(s_copy) = stride_ksc[1];
-  return s_copy;
-}
-// Filter cutlass::layout::TensorNHWC -> rank-2 stride (k, (_1, s, r))
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT>>
-make_cute_packed_stride(
-    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT>> s,
-    cute::array<IntT, 4> stride_krsc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_krsc[3] == 1);
-  auto s_copy = s;
-  cute::get<0,0>(s_copy) = stride_krsc[0];
-  cute::for_each(cute::make_seq<2>{}, [&](auto i) {
-    cute::get<1,2-i>(s_copy) = stride_krsc[i+1];
-  });
-  return s_copy;
-}
-// Filter cutlass::layout::TensorNDHWC -> rank-2 stride (k, (_1, s, r, t))
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT, IntT>>
-make_cute_packed_stride(
-    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT, IntT>> s,
-    cute::array<IntT, 5> stride_ktrsc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_ktrsc[4] == 1);
-  auto s_copy = s;
-  cute::get<0,0>(s_copy) = stride_ktrsc[0];
-  cute::for_each(cute::make_seq<3>{}, [&](auto i) {
-    cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1];
-  });
-  return s_copy;
-}
-//
-// Activation tensor (_1, (w, h, d, n)) for wgrad kernel
-//
-// It is also Filter tensor ((_1), (k, s, r, t)) for dgrad kernel
-//
-// Activation cutlass::layout::TensorNWC -> rank-2 stride (_1, (W,N)) in wgrad
-// Filter cutlass::layout::TensorNWC -> rank-2 stride ((_1), (k, s)) in dgrad
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT>>
-make_cute_packed_stride(
-    cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT>> s,
-    cute::array<IntT, 3> stride_nwc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_nwc[2] == 1);
-  auto s_copy = s;
-  if (ConvOp == cutlass::conv::Operator::kWgrad) {
-    cute::get<1,0>(s_copy) = stride_nwc[1];
-    cute::get<1,1>(s_copy) = stride_nwc[0];
-  }
-  else if (ConvOp == cutlass::conv::Operator::kDgrad) {
-    // stride_nwc in dgrad is ksc.
-    cute::get<1,0>(s_copy) = stride_nwc[0];
-    cute::get<1,1>(s_copy) = stride_nwc[1];
-  }
-  return s_copy;
-}
-// Activation cutlass::layout::TensorNHWC -> rank-2 stride (_1, (W,H,N)) in wgrad
-// Filter cutlass::layout::TensorNHWC -> rank-2 stride ((_1), (k, s, r)) in dgrad
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT, IntT>>
-make_cute_packed_stride(
-    cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT, IntT>> s,
-    cute::array<IntT, 4> stride_nhwc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_nhwc[3] == 1);
-  auto s_copy = s;
-  if (ConvOp == cutlass::conv::Operator::kWgrad) {
-    cute::for_each(cute::make_seq<3>{}, [&](auto i) {
-      cute::get<1,i>(s_copy) = stride_nhwc[2-i];
-    });
-  }
-  else if (ConvOp == cutlass::conv::Operator::kDgrad) {
-    // stride_nhwc in dgrad is krsc.
-    cute::get<1,0>(s_copy) = stride_nhwc[0];
-    cute::for_each(cute::make_seq<2>{}, [&](auto i) {
-      cute::get<1,2-i>(s_copy) = stride_nhwc[i+1];
-    });
-  }
-  return s_copy;
-}
-// Activation cutlass::layout::TensorNDHWC -> rank-2 stride (_1, (W,H,D,N)) in wgrad
-// Filter cutlass::layout::TensorNDHWC -> rank-2 stride ((_1), (k, s, r, t)) in dgrad
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT, IntT, IntT>>
-make_cute_packed_stride(
-    cute::Stride<cute::Int<1>, cute::Stride<IntT, IntT, IntT, IntT>> s,
-    cute::array<IntT, 5> stride_ndhwc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_ndhwc[4] == 1);
-  auto s_copy = s;
-  if (ConvOp == cutlass::conv::Operator::kWgrad) {
-    cute::for_each(cute::make_seq<4>{}, [&](auto i) {
-      cute::get<1,i>(s_copy) = stride_ndhwc[3-i];
-    });
-  }
-  else if (ConvOp == cutlass::conv::Operator::kDgrad) {
-    // stride_ndhwc in dgrad is ktrsc.
-    cute::get<1,0>(s_copy) = stride_ndhwc[0];
-    cute::for_each(cute::make_seq<3>{}, [&](auto i) {
-      cute::get<1,3-i>(s_copy) = stride_ndhwc[i+1];
-    });
-  }
-  return s_copy;
-}
-//
-// NZPQ tensor (_1, nzpq) for wgrad kernel
-//
-// cutlass::layout::TensorNWC -> rank-2 stride (_1, nzpq)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, IntT>
-make_cute_packed_stride(
-    cute::Stride<cute::Int<1>, IntT> s,
-    cute::array<IntT, 3> stride_nqk,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_nqk[2] == 1);
-  auto s_copy = s;
-  cute::get<1>(s_copy) = stride_nqk[1];
-  return s_copy;
-}
-// cutlass::layout::TensorNHWC -> rank-2 stride (_1, nzpq)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, IntT>
-make_cute_packed_stride(
-    cute::Stride<cute::Int<1>, IntT> s,
-    cute::array<IntT, 4> stride_npqk,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_npqk[3] == 1);
-  auto s_copy = s;
-  cute::get<1>(s_copy) = stride_npqk[2];
-  return s_copy;
-}
-// cutlass::layout::TensorNDHWC -> rank-2 stride (_1, nzpq)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Int<1>, IntT>
-make_cute_packed_stride(
-    cute::Stride<cute::Int<1>, IntT> s,
-    cute::array<IntT, 5> stride_nzpqk,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_nzpqk[4] == 1);
-  auto s_copy = s;
-  cute::get<1>(s_copy) = stride_nzpqk[3];
-  return s_copy;
-}
-//
-// Wgrad output tensor (k, (_1, s, r, t), _0)
-//
-// Filter cutlass::layout::TensorKCS -> rank-3 stride (k, (_1, s), _0)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT>, cute::Int<0>>
-make_cute_packed_stride(
-    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT>, cute::Int<0>> s,
-    [[maybe_unused]] cute::array<int32_t, 3> shape_output,
-    cute::array<IntT, 3> stride_ksc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_ksc[2] == 1);
-  auto s_copy = s;
-  cute::get<0,0>(s_copy) = stride_ksc[0];
-  cute::get<1,1>(s_copy) = stride_ksc[1];
-  return s_copy;
-}
-// Filter cutlass::layout::TensorKCSR -> rank-3 stride (k, (_1, s, r), _0)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT>, cute::Int<0>>
-make_cute_packed_stride(
-    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT>, cute::Int<0>> s,
-    [[maybe_unused]] cute::array<int32_t, 4> shape_output,
-    cute::array<IntT, 4> stride_krsc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_krsc[3] == 1);
-  auto s_copy = s;
-  cute::get<0,0>(s_copy) = stride_krsc[0];
-  cute::for_each(cute::make_seq<2>{}, [&](auto i) {
-    cute::get<1,2-i>(s_copy) = stride_krsc[i+1];
-  });
-  return s_copy;
-}
-// Filter cutlass::layout::TensorKCSRT -> rank-3 stride (k, (_1, s, r, t), _0)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT, IntT>, cute::Int<0>>
-make_cute_packed_stride(
-    cute::Stride<IntT, cute::Stride<cute::Int<1>, IntT, IntT, IntT>, cute::Int<0>> s,
-    [[maybe_unused]] cute::array<int32_t, 5> shape_output,
-    cute::array<IntT, 5> stride_ktrsc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_ktrsc[4] == 1);
-  auto s_copy = s;
-  cute::get<0,0>(s_copy) = stride_ktrsc[0];
-  cute::for_each(cute::make_seq<3>{}, [&](auto i) {
-    cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1];
-  });
-  return s_copy;
-}
-//
-// Wgrad output tensor ((_1, s, r, t), k, _0)
-//
-// Filter cutlass::layout::TensorCSK -> rank-3 stride ((_1, s), k, _0)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Stride<cute::Int<1>, IntT>, IntT, cute::Int<0>>
-make_cute_packed_stride(
-    cute::Stride<cute::Stride<cute::Int<1>, IntT>, IntT, cute::Int<0>> s,
-    [[maybe_unused]] cute::array<int32_t, 3> shape_output,
-    cute::array<IntT, 3> stride_ksc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_ksc[2] == 1);
-  auto s_copy = s;
-  cute::get<1,0>(s_copy) = stride_ksc[0];
-  cute::get<0,1>(s_copy) = stride_ksc[1];
-  return s_copy;
-}
-// Filter cutlass::layout::TensorCSRK -> rank-3 stride ((_1, s, r), k, _0)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Stride<cute::Int<1>, IntT, IntT>, IntT, cute::Int<0>>
-make_cute_packed_stride(
-    cute::Stride<cute::Stride<cute::Int<1>, IntT, IntT>, IntT, cute::Int<0>> s,
-    [[maybe_unused]] cute::array<int32_t, 4> shape_output,
-    cute::array<IntT, 4> stride_krsc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_krsc[3] == 1);
-  auto s_copy = s;
-  cute::get<1,0>(s_copy) = stride_krsc[0];
-  cute::for_each(cute::make_seq<2>{}, [&](auto i) {
-    cute::get<0,2-i>(s_copy) = stride_krsc[i+1];
-  });
-  return s_copy;
-}
-// Filter cutlass::layout::TensorCSRTK -> rank-3 stride ((_1, s, r, t), k, _0)
-template <class IntT>
-CUTLASS_HOST_DEVICE
-cute::Stride<cute::Stride<cute::Int<1>, IntT, IntT, IntT>, IntT, cute::Int<0>>
-make_cute_packed_stride(
-    cute::Stride<cute::Stride<cute::Int<1>, IntT, IntT, IntT>, IntT, cute::Int<0>> s,
-    [[maybe_unused]] cute::array<int32_t, 5> shape_output,
-    cute::array<IntT, 5> stride_ktrsc,
-    conv::Operator ConvOp) {
-  static_assert(std::is_integral_v<IntT>,
-    "Stride must have an integral type so it can be set dynamically. Static strides not supported.");
-  assert(stride_ktrsc[4] == 1);
-  auto s_copy = s;
-  cute::get<1,0>(s_copy) = stride_ktrsc[0];
-  cute::for_each(cute::make_seq<3>{}, [&](auto i) {
-    cute::get<0,3-i>(s_copy) = stride_ktrsc[i+1];
-  });
-  return s_copy;
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/print_error.hpp DELETED Viewed

@@ -1,341 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <array>
-#include <cassert>
-#include <cmath>
-#include <iostream>
-#include <type_traits>
-#include <cute/util/type_traits.hpp>
-#include <cute/tensor.hpp>
-#include <cute/numeric/numeric_types.hpp>
-#include <cute/numeric/complex.hpp>
-#include <cutlass/layout/layout.h>
-// The computed infinity norm does not include
-// any NaN row absolute-value sums.
-struct matrix_inf_norm_result {
-  // Accumulate errors in double, as this is generally
-  // the highest precision that the examples use.
-  double inf_norm = 0.0;
-  bool found_nan = false;  // true if at least one row sum was NaN (such rows are left out of inf_norm)
-};
-// In theory, cute::Tensor<ViewEngine<T*>, T> could be treated as a view type,
-// and thus passed by value (as std::span or std::string_view would be).
-// However, generic cute::Tensor are more like containers
-// and thus are best passed by reference or const reference.
-template <typename EngineType, typename LayoutType>
-matrix_inf_norm_result
-matrix_inf_norm(cute::Tensor<EngineType, LayoutType> const& host_matrix)
-{
-  using error_type = decltype(std::declval<matrix_inf_norm_result>().inf_norm);
-  using element_type = typename EngineType::value_type;
-  error_type inf_norm = 0.0;
-  bool found_nan = false;
-  // Computing the infinity norm requires that we be able
-  // to treat the input as a matrix, with rows and columns.
-  const int64_t num_rows = cute::size<0>(host_matrix);
-  const int64_t num_cols = cute::size<1>(host_matrix);
-  auto abs_fn = [] (element_type A_ij) {
-    if constexpr (not std::is_unsigned_v<element_type>) {
-      using std::abs;
-      return abs(A_ij);
-    }
-    else {
-      return A_ij;
-    }
-  };
-  for (int64_t i = 0; i < num_rows; ++i) {
-    error_type row_abs_sum = 0.0;
-    for(int64_t j = 0; j < num_cols; ++j) {
-      row_abs_sum += abs_fn(host_matrix(i, j));
-    }
-    if (std::isnan(row_abs_sum)) {
-      found_nan = true;
-    }
-    else {
-      inf_norm = row_abs_sum > inf_norm ? row_abs_sum : inf_norm;
-    }
-  }
-  return {inf_norm, found_nan};
-}
-// Infinity norm of (X - Y): the largest row sum of |X(i,j) - Y(i,j)|; rows whose sum is NaN are skipped and flagged via found_nan.
-template <typename EngineType, typename LayoutType>
-matrix_inf_norm_result
-matrix_diff_inf_norm(cute::Tensor<EngineType, LayoutType> const& X,
-                     cute::Tensor<EngineType, LayoutType> const& Y)
-{
-  using error_type = decltype(std::declval<matrix_inf_norm_result>().inf_norm);
-  using element_type = typename EngineType::value_type;
-  auto abs_fn = [] (element_type A_ij) {
-    if constexpr (not std::is_unsigned_v<element_type>) {
-      using std::abs;  // unqualified call below may also pick up an abs overload found by ADL
-      return abs(A_ij);
-    }
-    else {
-      return A_ij;  // unsigned values are returned unchanged
-    }
-  };
-  assert(cute::size<0>(X) == cute::size<0>(Y));
-  assert(cute::size<1>(X) == cute::size<1>(Y));
-  // Computing the infinity norm requires that we be able
-  // to treat the input as a matrix, with rows and columns.
-  const int64_t num_rows = cute::size<0>(X);
-  const int64_t num_cols = cute::size<1>(X);
-  error_type inf_norm = 0.0;
-  bool found_nan = false;
-  for (int64_t i = 0; i < num_rows; ++i) {
-    error_type row_abs_sum = 0.0;
-    for (int64_t j = 0; j < num_cols; ++j) {
-      row_abs_sum += error_type(abs_fn(element_type(X(i,j)) -  // NOTE(review): for unsigned element_type the difference wraps when X(i,j) < Y(i,j) — confirm callers never pass unsigned tensors
-                                       element_type(Y(i,j))));
-    }
-    if (std::isnan(row_abs_sum)) {
-      found_nan = true;  // a NaN row is reported but does not contribute to inf_norm
-    }
-    else {
-      inf_norm = row_abs_sum > inf_norm ? row_abs_sum : inf_norm;
-    }
-  }
-  return {inf_norm, found_nan};
-}
-template <typename EngineType_A, typename LayoutType_A,
-          typename EngineType_B, typename LayoutType_B,
-          typename EngineType_C, typename LayoutType_C,
-          typename EngineType_C_ref, typename LayoutType_C_ref>
-auto
-print_matrix_multiply_mollified_relative_error(
-  char const A_value_type_name[],
-  cute::Tensor<EngineType_A, LayoutType_A> const& A,
-  char const B_value_type_name[],
-  cute::Tensor<EngineType_B, LayoutType_B> const& B,
-  char const C_value_type_name[],
-  cute::Tensor<EngineType_C, LayoutType_C> const& C,
-  cute::Tensor<EngineType_C_ref, LayoutType_C_ref> const& C_ref)
-{
-  const auto [A_norm, A_has_nan] = matrix_inf_norm(A);
-  const auto [B_norm, B_has_nan] = matrix_inf_norm(B);
-  const auto [C_norm, C_has_nan] = matrix_inf_norm(C_ref);
-  const auto [diff_norm, diff_has_nan] = matrix_diff_inf_norm(C, C_ref);
-  const auto A_norm_times_B_norm = A_norm * B_norm;
-  const auto relative_error = A_norm_times_B_norm == 0.0 ?
-    diff_norm : (diff_norm / A_norm_times_B_norm);
-  // For expected error bounds, please refer to the LAPACK Users' Guide,
-  // in particular https://netlib.org/lapack/lug/node108.html .
-  // Printing the infinity norm of C is a way to check
-  // that both the function being tested (C)
-  // and the reference implementation (C_ref)
-  // don't just do nothing (or fill with zeros).
-  using std::cout;
-  using cute::shape;
-  cout << "Matrix A: " << shape<0>(A) << "x" << shape<1>(A) << " of " << A_value_type_name << '\n'
-      << "Matrix B: " << shape<0>(B) << "x" << shape<1>(B) << " of " << B_value_type_name << '\n'
-      << "Matrix C: " << shape<0>(C) << "x" << shape<1>(C) << " of " << C_value_type_name << '\n'
-      << std::scientific
-      << "Infinity norm of A: " << A_norm << '\n'
-      << "Infinity norm of B: " << B_norm << '\n'
-      << "Infinity norm of C: " << C_norm << '\n'
-      << "Infinity norm of (C - C_ref): " << diff_norm << '\n';
-  if(A_norm_times_B_norm == 0.0) {
-    cout << "Mollified relative error: " << relative_error << '\n';
-  } else {
-    cout << "Relative error: " << relative_error << '\n';
-  }
-  if (A_has_nan || B_has_nan || C_has_nan || diff_has_nan) {
-    cout << "Did we encounter NaN in A? " << (A_has_nan ? "yes" : "no") << '\n'
-        << "Did we encounter NaN in B? " << (B_has_nan ? "yes" : "no") << '\n'
-        << "Did we encounter NaN in C? " << (C_has_nan ? "yes" : "no") << '\n'
-        << "Did we encounter NaN in (C - C_ref)? " << (diff_has_nan ? "yes" : "no") << '\n';
-  }
-  return relative_error;
-}
-template <typename EngineType, typename LayoutType>
-auto
-print_matrix_multiply_mollified_relative_error(
-  const char value_type_name[],
-  const cute::Tensor<EngineType, LayoutType>& A,
-  const cute::Tensor<EngineType, LayoutType>& B,
-  const cute::Tensor<EngineType, LayoutType>& C_computed,
-  const cute::Tensor<EngineType, LayoutType>& C_expected)
-{
-  return print_matrix_multiply_mollified_relative_error(value_type_name, A, value_type_name, B,
-                                                 value_type_name, C_computed, C_expected);
-}
-// Take a CUTLASS HostTensor (or the like) as input,
-// and return a const CuTe Tensor.
-// This is useful for use with the above error printing functions.
-// This implicitly "transposes" if the layout is RowMajor.
-// Note that the HostTensor must be captured by nonconst reference
-// in order for X.host_ref().data() to compile.
-// (CUTLASS is a bit more container-y than CuTe.)
-template<class CutlassHostTensorType>
-auto host_matrix_to_const_cute_tensor(CutlassHostTensorType& X)
-{
-  // The tensors were created with post-transposed extents.
-  const auto extents = X.extent();
-  const auto shape = cute::Shape<int, int>{extents[0], extents[1]};
-  // Both RowMajor and ColumnMajor only store one stride.
-  const int LDX = X.stride(0);
-  const auto strides = [&]() {
-      using input_layout_type = typename std::decay_t<decltype(X)>::Layout;
-      if constexpr (std::is_same_v<input_layout_type, cutlass::layout::ColumnMajor>) {
-        return cute::Stride<int, int>{1, LDX};  // unit stride along mode 0
-      }
-      else {
-        static_assert(std::is_same_v<input_layout_type, cutlass::layout::RowMajor>);  // any other layout is a compile-time error
-        return cute::Stride<int, int>{LDX, 1};  // unit stride along mode 1
-      }
-    }();
-  const auto layout = cute::make_layout(shape, strides);
-  auto X_data = X.host_ref().data();
-  auto X_data_const = const_cast<std::add_const_t< decltype(X_data)> >(X_data);  // NOTE(review): if X_data is a raw pointer T*, add_const_t gives T* const (top-level const), not const T* — confirm the pointee really becomes const
-  return cute::make_tensor(X_data_const, layout);
-};
-// Returns EXIT_SUCCESS if the 2-norm relative error is at most error_margin, else returns EXIT_FAILURE.
-// This makes the return value suitable as the return value of main().
-template <typename T1, typename T2>
-int
-print_relative_error(
-    std::size_t n,
-    T1 const& data,
-    T2 const& reference,
-    bool print_verbose = false,
-    bool print_error = true,
-    double error_margin = 0.00001) {
-  using std::abs; using std::sqrt;
-  // Use either double or complex<double> for error computation
-  using value_type = cute::remove_cvref_t<decltype(reference[0])>;
-  using error_type = std::conditional_t<cute::is_complex<value_type>::value,
-                                        cute::complex<double>,
-                                        double>;
-  if (print_verbose) {
-    std::cout << "Idx:\t"<< "Val\t" << "RefVal\t" << "RelError" << std::endl;
-  }
-  double eps = 1e-200;  // added to denominators so a zero reference does not divide by zero
-  double tot_error_sq = 0;
-  double tot_norm_sq = 0;
-  double tot_ind_rel_err = 0;
-  double max_ind_rel_err = 0;
-  double max_diff = 0;
-  for (std::size_t i = 0; i < n; ++i) {
-    error_type val = data[i];
-    error_type ref = reference[i];
-    double aref = abs(ref);
-    double diff = abs(ref - val);
-    double rel_error = diff / (aref + eps);
-    // Running sum of per-element relative errors (averaged after the loop)
-    tot_ind_rel_err += rel_error;
-    // Maximum relative error
-    max_ind_rel_err  = std::max(max_ind_rel_err, rel_error);
-    // Maximum absolute difference between data and reference
-    max_diff = std::max(max_diff, diff);
-    // Sums of squares used for the 2-norm relative error below
-    tot_error_sq += diff * diff;
-    tot_norm_sq  += aref * aref;
-    if (print_verbose) {
-      std::cout << i << ":\t" << val << "\t" << ref << "\t" << rel_error << std::endl;
-    }
-  }
-  double ave_rel_err = tot_ind_rel_err / double(n);  // n == 0 makes this 0.0 / 0.0
-  if (print_error) {
-    printf("Average relative error: %.3e\n", ave_rel_err);
-  }
-  if (print_error) {
-    printf("Maximum relative error: %.3e\n", max_ind_rel_err);
-  }
-  if (print_error) {
-    printf("Maximum difference    : %.3e\n", max_diff);
-  }
-  double tot_rel_err = sqrt(tot_error_sq/(tot_norm_sq+eps));
-  if (print_error) {
-    printf("Vector relative error:  %.3e\n", tot_rel_err);
-  }
-  printf("Vector reference  norm: %.3e\n", sqrt(tot_norm_sq));  // printed even when print_error is false
-  return (tot_rel_err <= error_margin) ? EXIT_SUCCESS : EXIT_FAILURE;  // a NaN tot_rel_err fails this comparison and yields EXIT_FAILURE
-}
-// Overload for cute::Tensor<>
-template <class Engine, class Layout>
-int
-print_relative_error(
-    cute::Tensor<Engine, Layout> data,
-    cute::Tensor<Engine, Layout> reference,
-    bool print_verbose = false,
-    bool print_error = true,
-    double error_margin = 0.00001) {
-  assert(size(data) == size(reference));
-  return print_relative_error(static_cast<std::size_t>(size(data)),
-                              data, reference,
-                              print_verbose, print_error, error_margin);
-}

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/detail/inner_product.h DELETED Viewed

@@ -1,135 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Helper templates for reference code: inner_product() and the Cast<Src, Dst> conversion functor.
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-namespace cutlass {
-namespace reference {
-namespace detail {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Template function to compute an inner product.
-#pragma hd_warning_disable  // Suppresses warnings when attempting to instantiate with a
-                            // host-only type
-template <typename Atype, typename Btype, typename Ctype>
-CUTLASS_HOST_DEVICE
-Ctype inner_product(Atype a, Btype b, Ctype c) {
-  return Ctype(a) * Ctype(b) + c;
-}
-/// Specialization for binary (1-bit) operands: returns c plus the sum of a[bit] ^ b[bit] over the 32 element positions
-template <>
-CUTLASS_HOST_DEVICE
-int inner_product<Array<bin1_t, 32>, Array<bin1_t, 32>, int>(
-    Array<bin1_t, 32> a,
-    Array<bin1_t, 32> b,
-    int c) {
-  int accum = 0;
-  for (int bit = 0; bit < 32; bit++) {
-    accum += a[bit] ^ b[bit];  // XOR rather than a product; presumably counts positions where a and b differ — TODO confirm against the bin1_t element type
-  }
-  return accum + c;
-}
-/*
-/// Specialization for matrix multiplication with signed 4-bit integer operands
-template <>
-CUTLASS_HOST_DEVICE
-int inner_product<Array<int4b_t, 8>, Array<int4b_t, 8>, int>(
-    Array<int4b_t, 8> a,
-    Array<int4b_t, 8> b,
-    int c) {
-  int accum = 0;
-  for (int k = 0; k < 8; k++) {
-    accum += a[k] * b[k];
-  }
-  return accum + c;
-}
-/// Specialization for matrix multiplication with unsigned 4-bit integer operands
-template <>
-CUTLASS_HOST_DEVICE
-int inner_product<Array<uint4b_t, 8>, Array<uint4b_t, 8>, int>(
-    Array<uint4b_t, 8> a,
-    Array<uint4b_t, 8> b,
-    int c) {
-  int accum = 0;
-  for (int k = 0; k < 8; k++) {
-    accum += a[k] * b[k];
-  }
-  return accum + c;
-}
-*/
-////////////////////////////////////////////////////////////////////////////////////////////////////
-template <typename SrcType, typename DstType>
-struct Cast {
-  // Default behavior: convert to the destination type
-#pragma hd_warning_disable  // Suppresses warnings when attempting to instantiate complex<T> with a
-                            // host-only type
-  CUTLASS_HOST_DEVICE
-  static DstType apply(SrcType src) { return static_cast<DstType>(src); };
-};
-template <>
-struct Cast<float, int8_t> {
-  CUTLASS_HOST_DEVICE
-  static int8_t apply(float src) {
-    // Clamp to the range of signed 8-bit integers.
-    return static_cast<int8_t>(fmaxf(-128.f, fminf(127.f, src)));
-  };
-};
-template <>
-struct Cast<float, uint8_t> {
-  CUTLASS_HOST_DEVICE
-  static uint8_t apply(float src) {
-    // Clamp to the range of unsigned 8-bit integers.
-    return static_cast<uint8_t>(fmaxf(0.f, fminf(255.f, src)));
-  };
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace detail
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h DELETED Viewed

@@ -1,94 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Helpers that convert a linear index into a Coord<Rank> for a given extent.
-*/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace reference {
-namespace detail {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <int Rank, int Index>
-struct LinearToCoordinateHelper {
-  CUTLASS_HOST_DEVICE
-  void operator()(Coord<Rank> &coord, int64_t idx, Coord<Rank> const &extent) const {
-    int64_t prod = 1;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = Rank - Index; i < Rank; ++i) {
-      prod *= int64_t(extent[i]);
-    }
-    coord[Rank - Index - 1] = int(idx / prod);
-    int64_t residual = idx % prod;
-    LinearToCoordinateHelper<Rank, Index - 1>()(coord, residual, extent);
-  }
-};
-template <int Rank>
-struct LinearToCoordinateHelper<Rank, 0> {
-  CUTLASS_HOST_DEVICE
-  void operator()(Coord<Rank> &coord, int64_t idx, Coord<Rank> const &) const {
-    coord[Rank - 1] = int(idx);
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <int Rank>
-struct LinearToCoordinate {
-  CUTLASS_HOST_DEVICE
-  void operator()(Coord<Rank> &coord, int64_t idx, Coord<Rank> const &extent) const {
-    LinearToCoordinateHelper<Rank, Rank - 1>()(coord, idx, extent);
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace detail
-} // namespace reference
-} // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/convolution.h DELETED Viewed

@@ -1,1549 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for convolution in device-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/functional.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/matrix_shape.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace kernel {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-///                                   Conv2d device reference kernel
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Conv2d Fprop kernel - y = fprop(x, w)
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>,
-  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
-  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
-  int kCtaShapeM = 16,    // shape of a threadblock in units of threads
-  int kCtaShapeN = 8      // shape of a threadblock in units of threads
->
-__global__ void Conv2dFprop(
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_x,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_y_in,
-  TensorRef<ElementC, LayoutC> tensor_y_out,
-  ElementCompute alpha,
-  ElementCompute beta
-  ) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  ElementAccumulator element_A[kThreadM];
-  ElementAccumulator element_B[kThreadN];
-  ElementAccumulator accum[kThreadM][kThreadN];
-  int64_t npq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
-  int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
-  int thread_n[kThreadM];
-  int thread_p[kThreadM];
-  int thread_q[kThreadM];
-  // Compute N, P, Q coordinates for each row of a thread's tile
-  int64_t PQ = int64_t(problem_size.P) * problem_size.Q;
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    int64_t npq = npq_start + m;
-    thread_n[m] = int(npq / PQ);
-    int64_t residual = npq % PQ;
-    thread_p[m] = int(residual / problem_size.Q);
-    thread_q[m] = int(residual % problem_size.Q);
-  }
-  // Clear accumulators
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kThreadN; ++n) {
-      accum[m][n] = ElementAccumulator();
-    }
-  }
-  int c_per_group = problem_size.C / problem_size.groups;
-  int k_per_group = problem_size.K / problem_size.groups;
-  // Compute convolution
-  for (int R = 0; R < problem_size.R; ++R) {
-    for (int S = 0; S < problem_size.S; ++S) {
-      for (int C = 0; C < problem_size.C; ++C) {
-        // Get group id of current channel
-        int c_group_idx = C / c_per_group;
-        // Load from activations tensor
-        int filter_r = R;
-        int filter_s = S;
-        if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
-          filter_r = problem_size.R - 1 - R;
-          filter_s = problem_size.S - 1 - S;
-        }
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < kThreadM; ++m) {
-          int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
-          int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
-          if (thread_n[m] < problem_size.N && h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) {
-            element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], h, w, C}));
-          }
-          else {
-            element_A[m] = ElementAccumulator();
-          }
-        }
-        // Load from filters tensor
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < kThreadN; ++n) {
-          int thread_k = k_start + n;
-          int k_group_idx = thread_k / k_per_group;
-          if (thread_k < problem_size.K && k_group_idx == c_group_idx) {
-            element_B[n] = ElementAccumulator(tensor_w.at({thread_k, R, S, C % c_per_group}));
-          }
-          else {
-            element_B[n] = ElementAccumulator();
-          }
-        }
-        // Accumulate matrix product
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < kThreadM; ++m) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int n = 0; n < kThreadN; ++n) {
-            accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
-          }
-        }
-      }
-    }
-  }
-  // Write out the results
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    if (thread_n[m] < problem_size.N && thread_p[m] < problem_size.P && thread_q[m] < problem_size.Q) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < kThreadN; ++n) {
-        int thread_k = k_start + n;
-        if (thread_k < problem_size.K) {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k}));
-          }
-          tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
-      }
-    }
-  }
-}
-// Conv3d Fprop kernel - y = fprop(x, w)
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator =  ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>,
-  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
-  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
-  int kCtaShapeM = 16,    // shape of a threadblock in units of threads
-  int kCtaShapeN = 8      // shape of a threadblock in units of threads
->
-__global__ void Conv3dFprop(
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_x,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_y_in,
-  TensorRef<ElementC, LayoutC> tensor_y_out,
-  ElementCompute alpha,
-  ElementCompute beta
-  ) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  ElementAccumulator element_A[kThreadM];
-  ElementAccumulator element_B[kThreadN];
-  ElementAccumulator accum[kThreadM][kThreadN];
-  int64_t nzpq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
-  int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
-  int thread_n[kThreadM];
-  int thread_z[kThreadM];
-  int thread_p[kThreadM];
-  int thread_q[kThreadM];
-  // Compute N, Z, P, Q coordinates for each row of a thread's tile
-  int64_t PQ = int64_t(problem_size.P) * problem_size.Q;
-  int64_t ZPQ = PQ * problem_size.Z;
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    int64_t nzpq = nzpq_start + m;
-    thread_n[m] = int(nzpq / ZPQ);
-    int64_t residual = nzpq % ZPQ;
-    thread_z[m] = int(residual / PQ);
-    residual = residual % PQ;
-    thread_p[m] = int(residual / problem_size.Q);
-    thread_q[m] = int(residual % problem_size.Q);
-  }
-  // Clear accumulators
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kThreadN; ++n) {
-      accum[m][n] = ElementAccumulator();
-    }
-  }
-  // Compute convolution
-  for (int T = 0; T < problem_size.T; ++T) {
-    for (int R = 0; R < problem_size.R; ++R) {
-      for (int S = 0; S < problem_size.S; ++S) {
-        for (int C = 0; C < problem_size.C; ++C) {
-          // Load from activations tensor
-          int filter_t = T;
-          int filter_r = R;
-          int filter_s = S;
-          if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
-            filter_t = problem_size.T - 1 - T;
-            filter_r = problem_size.R - 1 - R;
-            filter_s = problem_size.S - 1 - S;
-          }
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < kThreadM; ++m) {
-            int d = thread_z[m] * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d;
-            int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
-            int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
-            if (thread_n[m] < problem_size.N &&
-              d >= 0 && d < problem_size.D &&
-              h >= 0 && h < problem_size.H &&
-              w >= 0 && w < problem_size.W) {
-              element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], d, h, w, C}));
-            }
-            else {
-              element_A[m] = ElementAccumulator();
-            }
-          }
-          // Load from filters tensor
-          CUTLASS_PRAGMA_UNROLL
-          for (int n = 0; n < kThreadN; ++n) {
-            int thread_k = k_start + n;
-            if (thread_k < problem_size.K) {
-              element_B[n] = ElementAccumulator(tensor_w.at({thread_k, T, R, S, C}));
-            }
-            else {
-              element_B[n] = ElementAccumulator();
-            }
-          }
-          // Accumulate matrix product
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < kThreadM; ++m) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int n = 0; n < kThreadN; ++n) {
-              accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
-            }
-          }
-        } // for (C)
-      } // for (S)
-    }  // for (R)
-  } // for (T)
-  // Write out the results
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    if (thread_n[m] < problem_size.N &&
-      thread_z[m] < problem_size.Z &&
-      thread_p[m] < problem_size.P &&
-      thread_q[m] < problem_size.Q) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < kThreadN; ++n) {
-        int thread_k = k_start + n;
-        if (thread_k < problem_size.K) {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k}));
-          }
-          tensor_y_out.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
-      } // for (n)
-    }
-  } // for (m)
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Conv2d dgrad kernel - dx = dgrad(dy, w)
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>,
-  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
-  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
-  int kCtaShapeM = 16,    // shape of a threadblock in units of threads
-  int kCtaShapeN = 8      // shape of a threadblock in units of threads
->
-__global__ void Conv2dDgrad(
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_dx_in,
-  TensorRef<ElementC, LayoutC> tensor_dx_out,
-  ElementCompute alpha,
-  ElementCompute beta
-  ) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  ElementAccumulator element_A[kThreadM];
-  ElementAccumulator element_B[kThreadN];
-  ElementAccumulator accum[kThreadM][kThreadN];
-  int64_t nhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
-  int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
-  int thread_n[kThreadM];
-  int thread_h[kThreadM];
-  int thread_w[kThreadM];
-  // Compute N, H, W coordinates for each row of a thread's tile
-  int64_t HW = int64_t(problem_size.H) * problem_size.W;
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    int64_t nhw = nhw_start + m;
-    thread_n[m] = int(nhw / HW);
-    int64_t residual = nhw % HW;
-    thread_h[m] = int(residual / problem_size.W);
-    thread_w[m] = int(residual % problem_size.W);
-  }
-  // Clear accumulators
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kThreadN; ++n) {
-      accum[m][n] = ElementAccumulator();
-    }
-  }
-  // Compute convolution
-  for (int R = 0; R < problem_size.R; ++R) {
-    for (int S = 0; S < problem_size.S; ++S) {
-      for (int K = 0; K < problem_size.K; ++K) {
-        // Load from output gradient tensor (dy)
-        int filter_r = R;
-        int filter_s = S;
-        if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
-          filter_r = problem_size.R - 1 - R;
-          filter_s = problem_size.S - 1 - S;
-        }
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < kThreadM; ++m) {
-          int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h;
-          int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w;
-          element_A[m] = ElementAccumulator();
-          if (p >= 0 && !(p % problem_size.stride_h) && q >= 0 && !(q % problem_size.stride_w)) {
-            p = p / problem_size.stride_h;
-            q = q / problem_size.stride_w;
-            if (thread_n[m] < problem_size.N && p < problem_size.P && q < problem_size.Q) {
-              element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], p, q, K}));
-            }
-          }
-        }
-        // Load from filters tensor
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < kThreadN; ++n) {
-          int thread_c = c_start + n;
-          if (thread_c < problem_size.C) {
-            element_B[n] = ElementAccumulator(tensor_w.at({K, R, S, thread_c}));
-          }
-          else {
-            element_B[n] = ElementAccumulator();
-          }
-        }
-        // Accumulate matrix product
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < kThreadM; ++m) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int n = 0; n < kThreadN; ++n) {
-            accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
-          }
-        }
-      }
-    }
-  }
-  // Write out the results
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    if (thread_n[m] < problem_size.N && thread_h[m] < problem_size.H && thread_w[m] < problem_size.W) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < kThreadN; ++n) {
-        int thread_c = c_start + n;
-        if (thread_c < problem_size.C) {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_h[m], thread_w[m], thread_c}));
-          }
-          tensor_dx_out.at({thread_n[m], thread_h[m], thread_w[m], thread_c}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
-      }
-    }
-  }
-}
-// Conv3d dgrad kernel - dx = dgrad(dy, w)
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>,
-  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
-  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
-  int kCtaShapeM = 16,    // shape of a threadblock in units of threads
-  int kCtaShapeN = 8      // shape of a threadblock in units of threads
->
-__global__ void Conv3dDgrad(
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_dx_in,
-  TensorRef<ElementC, LayoutC> tensor_dx_out,
-  ElementCompute alpha,
-  ElementCompute beta
-  ) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  ElementAccumulator element_A[kThreadM];
-  ElementAccumulator element_B[kThreadN];
-  ElementAccumulator accum[kThreadM][kThreadN];
-  int64_t ndhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
-  int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
-  int thread_n[kThreadM];
-  int thread_d[kThreadM];
-  int thread_h[kThreadM];
-  int thread_w[kThreadM];
-  // Compute N, D, H, W coordinates for each row of a thread's tile
-  int64_t HW = int64_t(problem_size.H) * problem_size.W;
-  int64_t DHW = HW * problem_size.D;
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    int64_t ndhw = ndhw_start + m;
-    thread_n[m] = int(ndhw / DHW);
-    int64_t residual = ndhw % DHW;
-    thread_d[m] = int(residual / HW);
-    residual = residual % HW;
-    thread_h[m] = int(residual / problem_size.W);
-    thread_w[m] = int(residual % problem_size.W);
-  }
-  // Clear accumulators
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kThreadN; ++n) {
-      accum[m][n] = ElementAccumulator();
-    }
-  }
-  // Compute convolution
-  for (int T = 0; T < problem_size.T; ++T) {
-    for (int R = 0; R < problem_size.R; ++R) {
-      for (int S = 0; S < problem_size.S; ++S) {
-        for (int K = 0; K < problem_size.K; ++K) {
-          // Load from output gradient tensor (dy)
-          int filter_t = T;
-          int filter_r = R;
-          int filter_s = S;
-          if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
-            filter_t = problem_size.T - 1 - T;
-            filter_r = problem_size.R - 1 - R;
-            filter_s = problem_size.S - 1 - S;
-          }
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < kThreadM; ++m) {
-            int z = thread_d[m] + problem_size.pad_d - filter_t * problem_size.dilation_d;
-            int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h;
-            int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w;
-            element_A[m] = ElementAccumulator();
-            if (z >= 0 && !(z % problem_size.stride_d) &&
-              p >= 0 && !(p % problem_size.stride_h) &&
-              q >= 0 && !(q % problem_size.stride_w)) {
-              z = z / problem_size.stride_d;
-              p = p / problem_size.stride_h;
-              q = q / problem_size.stride_w;
-              if (thread_n[m] < problem_size.N && z < problem_size.Z && p < problem_size.P && q < problem_size.Q) {
-                element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], z, p, q, K}));
-              }
-            }
-          }
-          // Load from filters tensor
-          CUTLASS_PRAGMA_UNROLL
-          for (int n = 0; n < kThreadN; ++n) {
-            int thread_c = c_start + n;
-            if (thread_c < problem_size.C) {
-              element_B[n] = ElementAccumulator(tensor_w.at({K, T, R, S, thread_c}));
-            }
-            else {
-              element_B[n] = ElementAccumulator();
-            }
-          }
-          // Accumulate matrix product
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < kThreadM; ++m) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int n = 0; n < kThreadN; ++n) {
-              accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
-            }
-          }
-        } // for (K)
-      } // for (S)
-    } // for (R)
-  } // for (T)
-  // Write out the results
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    if (thread_n[m] < problem_size.N &&
-      thread_d[m] < problem_size.D &&
-      thread_h[m] < problem_size.H &&
-      thread_w[m] < problem_size.W) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < kThreadN; ++n) {
-        int thread_c = c_start + n;
-        if (thread_c < problem_size.C) {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c}));
-          }
-          tensor_dx_out.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
-      }
-    }
-  }
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Conv2d wgrad kernel - dw = wgrad(dy, x)
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>,
-  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
-  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
-  int kCtaShapeM = 8,     // shape of a threadblock in units of threads
-  int kCtaShapeN = 16     // shape of a threadblock in units of threads
->
-__global__ void Conv2dWgrad(
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_x,
-  TensorRef<ElementC, LayoutC> tensor_dw_in,
-  TensorRef<ElementC, LayoutC> tensor_dw_out,
-  ElementCompute alpha,
-  ElementCompute beta
-  ) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  ElementAccumulator element_A[kThreadM];
-  ElementAccumulator element_B[kThreadN];
-  ElementAccumulator accum[kThreadM][kThreadN];
-  int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
-  int64_t rsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
-  int thread_r[kThreadN];
-  int thread_s[kThreadN];
-  int thread_c[kThreadN];
-  // Compute R, S, C coordinates for each column of a thread's tile
-  int64_t SC = int64_t(problem_size.S) * problem_size.C;
-  CUTLASS_PRAGMA_UNROLL
-  for (int n = 0; n < kThreadN; ++n) {
-    int64_t rsc = rsc_start + n;
-    int64_t residual = rsc % SC;
-    thread_r[n] = int(rsc / SC);
-    thread_s[n] = int(residual / problem_size.C);
-    thread_c[n] = int(residual % problem_size.C);
-  }
-  // Clear accumulators
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kThreadN; ++n) {
-      accum[m][n] = ElementAccumulator();
-    }
-  }
-  // Compute convolution
-  for (int N = 0; N < problem_size.N; ++N) {
-    for (int P = 0; P < problem_size.P; ++P) {
-      for (int Q = 0; Q < problem_size.Q; ++Q) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < kThreadM; ++m) {
-          int thread_k = k_start + m;
-          element_A[m] = ElementAccumulator();
-          if (thread_k < problem_size.K) {
-            element_A[m] = ElementAccumulator(tensor_dy.at({N, P, Q, thread_k}));
-          }
-        }
-        // Load from activations tensor (x)
-        CUTLASS_PRAGMA_UNROLL
-        for (int n = 0; n < kThreadN; ++n) {
-          // Load from activations tensor
-          int filter_r = thread_r[n];
-          int filter_s = thread_s[n];
-          if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
-            filter_r = problem_size.R - 1 - filter_r;
-            filter_s = problem_size.S - 1 - filter_s;
-          }
-          int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
-          int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
-          element_B[n] = ElementAccumulator();
-          if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W && thread_c[n] < problem_size.C) {
-            element_B[n] = ElementAccumulator(tensor_x.at({N, h, w, thread_c[n]}));
-          }
-        }
-        // Accumulate matrix product
-        CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < kThreadM; ++m) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int n = 0; n < kThreadN; ++n) {
-            accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
-          }
-        }
-      }
-    }
-  }
-  // Write out the results
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    int thread_k = k_start + m;
-    if (thread_k < problem_size.K) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < kThreadN; ++n) {
-        if (thread_r[n] < problem_size.R && thread_s[n] < problem_size.S && thread_c[n] < problem_size.C) {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_r[n], thread_s[n], thread_c[n]}));
-          }
-          tensor_dw_out.at({thread_k, thread_r[n], thread_s[n], thread_c[n]}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
-      }
-    }
-  }
-}
-// Conv3d wgrad kernel - dw = wgrad(dy, x)
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>,
-  int kThreadM = 2,       // shape of a thread's tile in the GEMM M dimension
-  int kThreadN = 4,       // shape of a thread's tile in the GEMM N dimension
-  int kCtaShapeM = 8,     // shape of a threadblock in units of threads
-  int kCtaShapeN = 16     // shape of a threadblock in units of threads
->
-__global__ void Conv3dWgrad(
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_x,
-  TensorRef<ElementC, LayoutC> tensor_dw_in,
-  TensorRef<ElementC, LayoutC> tensor_dw_out,
-  ElementCompute alpha,
-  ElementCompute beta
-  ) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  ElementAccumulator element_A[kThreadM];
-  ElementAccumulator element_B[kThreadN];
-  ElementAccumulator accum[kThreadM][kThreadN];
-  int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM;
-  int64_t trsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN;
-  int thread_t[kThreadN];
-  int thread_r[kThreadN];
-  int thread_s[kThreadN];
-  int thread_c[kThreadN];
-  // Compute T, R, S, C coordinates for each column of a thread's tile
-  int64_t SC = int64_t(problem_size.S) * problem_size.C;
-  int64_t RSC = SC * problem_size.R;
-  CUTLASS_PRAGMA_UNROLL
-  for (int n = 0; n < kThreadN; ++n) {
-    int64_t trsc = trsc_start + n;
-    thread_t[n] = int(trsc / RSC);
-    int64_t residual = trsc % RSC;
-    thread_r[n] = int(residual / SC);
-    residual = residual % SC;
-    thread_s[n] = int(residual / problem_size.C);
-    thread_c[n] = int(residual % problem_size.C);
-  }
-  // Clear accumulators
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int n = 0; n < kThreadN; ++n) {
-      accum[m][n] = ElementAccumulator();
-    }
-  }
-  // Compute convolution
-  for (int N = 0; N < problem_size.N; ++N) {
-    for (int Z = 0; Z < problem_size.Z; ++Z) {
-      for (int P = 0; P < problem_size.P; ++P) {
-        for (int Q = 0; Q < problem_size.Q; ++Q) {
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < kThreadM; ++m) {
-            int thread_k = k_start + m;
-            element_A[m] = ElementAccumulator();
-            if (thread_k < problem_size.K) {
-              element_A[m] = ElementAccumulator(tensor_dy.at({N, Z, P, Q, thread_k}));
-            }
-          }
-          // Load from activations tensor (x)
-          CUTLASS_PRAGMA_UNROLL
-          for (int n = 0; n < kThreadN; ++n) {
-            // Load from activations tensor
-            int filter_t = thread_t[n];
-            int filter_r = thread_r[n];
-            int filter_s = thread_s[n];
-            if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
-              filter_t = problem_size.T - 1 - filter_t;
-              filter_r = problem_size.R - 1 - filter_r;
-              filter_s = problem_size.S - 1 - filter_s;
-            }
-            int d = Z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d;
-            int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h;
-            int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w;
-            element_B[n] = ElementAccumulator();
-            if (d >= 0 && d < problem_size.D &&
-              h >= 0 && h < problem_size.H &&
-              w >= 0 && w < problem_size.W &&
-              thread_c[n] < problem_size.C) {
-              element_B[n] = ElementAccumulator(tensor_x.at({N, d, h, w, thread_c[n]}));
-            }
-          }
-          // Accumulate matrix product
-          CUTLASS_PRAGMA_UNROLL
-          for (int m = 0; m < kThreadM; ++m) {
-            CUTLASS_PRAGMA_UNROLL
-            for (int n = 0; n < kThreadN; ++n) {
-              accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]);
-            }
-          }
-        } // for (Q)
-      } // for (P)
-    } // for (Z)
-  } // for (N)
-  // Write out the results
-  CUTLASS_PRAGMA_UNROLL
-  for (int m = 0; m < kThreadM; ++m) {
-    int thread_k = k_start + m;
-    if (thread_k < problem_size.K) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < kThreadN; ++n) {
-        if (thread_t[n] < problem_size.T &&
-          thread_r[n] < problem_size.R &&
-          thread_s[n] < problem_size.S &&
-          thread_c[n] < problem_size.C) {
-          ElementCompute c_ref = ElementCompute();
-          if (beta != ElementCompute()) {
-            c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]}));
-          }
-          tensor_dw_out.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]}) = convert_op(
-            alpha * ElementCompute(accum[m][n]) + beta * c_ref);
-        }
-      }
-    }
-  }
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace kernel
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host-side launcher for the 2-D forward-propagation reference kernel: y = fprop(x, w).
-///
-/// Sizes the launch grid from problem_size, enqueues kernel::Conv2dFprop on `stream`, and
-/// returns Status::kErrorInternal if cudaPeekAtLastError() reports anything other than
-/// cudaSuccess after the launch; otherwise Status::kSuccess.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-Status Conv2dFprop(
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_x,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_y_in,
-  TensorRef<ElementC, LayoutC> tensor_y_out,
-  ElementCompute alpha,
-  ElementCompute beta,
-  cudaStream_t stream = nullptr) {
-  // Blocking factors: per-thread tile (kThread*) and threadblock shape in threads (kCtaShape*).
-  int const kThreadM = 4;
-  int const kThreadN = 4;
-  int const kCtaShapeM = 16;
-  int const kCtaShapeN = 8;
-  // Extent covered by one threadblock along the GEMM M and N dimensions.
-  int const cta_extent_m = kCtaShapeM * kThreadM;
-  int const cta_extent_n = kCtaShapeN * kThreadN;
-  // GEMM M extent is N * P * Q, evaluated in 64-bit arithmetic before the ceiling division.
-  int64_t const gemm_m = int64_t(problem_size.N) * problem_size.P * problem_size.Q;
-  int64_t const grid_m = (gemm_m + cta_extent_m - 1) / cta_extent_m;
-  dim3 threads(kCtaShapeM, kCtaShapeN);
-  dim3 blocks(uint32_t(grid_m), (problem_size.K + cta_extent_n - 1) / cta_extent_n);
-  kernel::Conv2dFprop<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementCompute, ElementAccumulator,
-    ConvertOp, InnerProductOp,
-    kThreadM, kThreadN,
-    kCtaShapeM, kCtaShapeN
-  ><<< blocks, threads, 0, stream >>>(
-    problem_size, tensor_x, tensor_w, tensor_y_in, tensor_y_out, alpha, beta);
-  // Only the launch status is inspected here; the stream is not synchronized.
-  return (cudaPeekAtLastError() == cudaSuccess) ? Status::kSuccess : Status::kErrorInternal;
-}
-/// Host-side launcher for the 3-D forward-propagation reference kernel: y = fprop(x, w).
-///
-/// Sizes the launch grid from problem_size, enqueues kernel::Conv3dFprop on `stream`, and
-/// returns Status::kErrorInternal if cudaPeekAtLastError() reports anything other than
-/// cudaSuccess after the launch; otherwise Status::kSuccess.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-Status Conv3dFprop(
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_x,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_y_in,
-  TensorRef<ElementC, LayoutC> tensor_y_out,
-  ElementCompute alpha,
-  ElementCompute beta,
-  cudaStream_t stream = nullptr) {
-  // Blocking factors: per-thread tile (kThread*) and threadblock shape in threads (kCtaShape*).
-  int const kThreadM = 4;
-  int const kThreadN = 4;
-  int const kCtaShapeM = 16;
-  int const kCtaShapeN = 8;
-  // Extent covered by one threadblock along the GEMM M and N dimensions.
-  int const cta_extent_m = kCtaShapeM * kThreadM;
-  int const cta_extent_n = kCtaShapeN * kThreadN;
-  // GEMM M extent is N * Z * P * Q, evaluated in 64-bit arithmetic before the ceiling division.
-  int64_t const gemm_m =
-    int64_t(problem_size.N) * problem_size.Z * problem_size.P * problem_size.Q;
-  int64_t const grid_m = (gemm_m + cta_extent_m - 1) / cta_extent_m;
-  dim3 threads(kCtaShapeM, kCtaShapeN);
-  dim3 blocks(uint32_t(grid_m), (problem_size.K + cta_extent_n - 1) / cta_extent_n);
-  kernel::Conv3dFprop<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementCompute, ElementAccumulator,
-    ConvertOp, InnerProductOp,
-    kThreadM, kThreadN,
-    kCtaShapeM, kCtaShapeN
-  ><<< blocks, threads, 0, stream >>>(
-    problem_size, tensor_x, tensor_w, tensor_y_in, tensor_y_out, alpha, beta);
-  // Only the launch status is inspected here; the stream is not synchronized.
-  return (cudaPeekAtLastError() == cudaSuccess) ? Status::kSuccess : Status::kErrorInternal;
-}
-/// Host-side launcher for the 2-D data-gradient reference kernel: dx = dgrad(dy, w).
-///
-/// Sizes the launch grid from problem_size, enqueues kernel::Conv2dDgrad on `stream`, and
-/// returns Status::kErrorInternal if cudaPeekAtLastError() reports anything other than
-/// cudaSuccess after the launch; otherwise Status::kSuccess.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-Status Conv2dDgrad(
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_dx_in,
-  TensorRef<ElementC, LayoutC> tensor_dx_out,
-  ElementCompute alpha,
-  ElementCompute beta,
-  cudaStream_t stream = nullptr) {
-  // Blocking factors: per-thread tile (kThread*) and threadblock shape in threads (kCtaShape*).
-  int const kThreadM = 2;
-  int const kThreadN = 4;
-  int const kCtaShapeM = 16;
-  int const kCtaShapeN = 8;
-  // Extent covered by one threadblock along the GEMM M and N dimensions.
-  int const cta_extent_m = kCtaShapeM * kThreadM;
-  int const cta_extent_n = kCtaShapeN * kThreadN;
-  // GEMM M extent is N * H * W, evaluated in 64-bit arithmetic before the ceiling division.
-  int64_t const gemm_m = int64_t(problem_size.N) * problem_size.H * problem_size.W;
-  int64_t const grid_m = (gemm_m + cta_extent_m - 1) / cta_extent_m;
-  dim3 threads(kCtaShapeM, kCtaShapeN);
-  dim3 blocks(uint32_t(grid_m), (problem_size.C + cta_extent_n - 1) / cta_extent_n);
-  kernel::Conv2dDgrad<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementCompute, ElementAccumulator,
-    ConvertOp, InnerProductOp,
-    kThreadM, kThreadN,
-    kCtaShapeM, kCtaShapeN
-  ><<< blocks, threads, 0, stream >>>(
-    problem_size, tensor_dy, tensor_w, tensor_dx_in, tensor_dx_out, alpha, beta);
-  // Only the launch status is inspected here; the stream is not synchronized.
-  return (cudaPeekAtLastError() == cudaSuccess) ? Status::kSuccess : Status::kErrorInternal;
-}
-/// Host-side launcher for the 3-D data-gradient reference kernel: dx = dgrad(dy, w).
-///
-/// Sizes the launch grid from problem_size, enqueues kernel::Conv3dDgrad on `stream`, and
-/// returns Status::kErrorInternal if cudaPeekAtLastError() reports anything other than
-/// cudaSuccess after the launch; otherwise Status::kSuccess.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-Status Conv3dDgrad(
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_dx_in,
-  TensorRef<ElementC, LayoutC> tensor_dx_out,
-  ElementCompute alpha,
-  ElementCompute beta,
-  cudaStream_t stream = nullptr) {
-  // Blocking factors: per-thread tile (kThread*) and threadblock shape in threads (kCtaShape*).
-  int const kThreadM = 2;
-  int const kThreadN = 4;
-  int const kCtaShapeM = 16;
-  int const kCtaShapeN = 8;
-  // Extent covered by one threadblock along the GEMM M and N dimensions.
-  int const cta_extent_m = kCtaShapeM * kThreadM;
-  int const cta_extent_n = kCtaShapeN * kThreadN;
-  // GEMM M extent is N * D * H * W, evaluated in 64-bit arithmetic before the ceiling division.
-  int64_t const gemm_m =
-    int64_t(problem_size.N) * problem_size.D * problem_size.H * problem_size.W;
-  int64_t const grid_m = (gemm_m + cta_extent_m - 1) / cta_extent_m;
-  dim3 threads(kCtaShapeM, kCtaShapeN);
-  dim3 blocks(uint32_t(grid_m), (problem_size.C + cta_extent_n - 1) / cta_extent_n);
-  kernel::Conv3dDgrad<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementCompute, ElementAccumulator,
-    ConvertOp, InnerProductOp,
-    kThreadM, kThreadN,
-    kCtaShapeM, kCtaShapeN
-  ><<< blocks, threads, 0, stream >>>(
-    problem_size, tensor_dy, tensor_w, tensor_dx_in, tensor_dx_out, alpha, beta);
-  // Only the launch status is inspected here; the stream is not synchronized.
-  return (cudaPeekAtLastError() == cudaSuccess) ? Status::kSuccess : Status::kErrorInternal;
-}
-/// Host-side launcher for the 2-D weight-gradient reference kernel: dw = wgrad(dy, x).
-///
-/// Sizes the launch grid from problem_size, enqueues kernel::Conv2dWgrad on `stream`, and
-/// returns Status::kErrorInternal if cudaPeekAtLastError() reports anything other than
-/// cudaSuccess after the launch; otherwise Status::kSuccess.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-Status Conv2dWgrad(
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_x,
-  TensorRef<ElementC, LayoutC> tensor_dw_in,
-  TensorRef<ElementC, LayoutC> tensor_dw_out,
-  ElementCompute alpha,
-  ElementCompute beta,
-  cudaStream_t stream = nullptr) {
-  // Blocking factors: per-thread tile (kThread*) and threadblock shape in threads (kCtaShape*).
-  int const kThreadM = 2;
-  int const kThreadN = 4;
-  int const kCtaShapeM = 8;
-  int const kCtaShapeN = 16;
-  // Extent covered by one threadblock along the GEMM M and N dimensions.
-  int const cta_extent_m = kCtaShapeM * kThreadM;
-  int const cta_extent_n = kCtaShapeN * kThreadN;
-  // GEMM N extent is R * S * C, evaluated in 64-bit arithmetic before the ceiling division.
-  int64_t const gemm_n = int64_t(problem_size.R) * problem_size.S * problem_size.C;
-  int64_t const grid_n = (gemm_n + cta_extent_n - 1) / cta_extent_n;
-  dim3 threads(kCtaShapeM, kCtaShapeN);
-  dim3 blocks((problem_size.K + cta_extent_m - 1) / cta_extent_m, uint32_t(grid_n));
-  kernel::Conv2dWgrad<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementCompute, ElementAccumulator,
-    ConvertOp, InnerProductOp,
-    kThreadM, kThreadN,
-    kCtaShapeM, kCtaShapeN
-  ><<< blocks, threads, 0, stream >>>(
-    problem_size, tensor_dy, tensor_x, tensor_dw_in, tensor_dw_out, alpha, beta);
-  // Only the launch status is inspected here; the stream is not synchronized.
-  return (cudaPeekAtLastError() == cudaSuccess) ? Status::kSuccess : Status::kErrorInternal;
-}
-/// Host-side launcher for the 3-D weight-gradient reference kernel: dw = wgrad(dy, x).
-///
-/// Sizes the launch grid from problem_size, enqueues kernel::Conv3dWgrad on `stream`, and
-/// returns Status::kErrorInternal if cudaPeekAtLastError() reports anything other than
-/// cudaSuccess after the launch; otherwise Status::kSuccess.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-Status Conv3dWgrad(
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_x,
-  TensorRef<ElementC, LayoutC> tensor_dw_in,
-  TensorRef<ElementC, LayoutC> tensor_dw_out,
-  ElementCompute alpha,
-  ElementCompute beta,
-  cudaStream_t stream = nullptr) {
-  // Blocking factors: per-thread tile (kThread*) and threadblock shape in threads (kCtaShape*).
-  int const kThreadM = 2;
-  int const kThreadN = 4;
-  int const kCtaShapeM = 8;
-  int const kCtaShapeN = 16;
-  // Extent covered by one threadblock along the GEMM M and N dimensions.
-  int const cta_extent_m = kCtaShapeM * kThreadM;
-  int const cta_extent_n = kCtaShapeN * kThreadN;
-  // GEMM N extent is T * R * S * C, evaluated in 64-bit arithmetic before the ceiling division.
-  int64_t const gemm_n =
-    int64_t(problem_size.T) * problem_size.R * problem_size.S * problem_size.C;
-  int64_t const grid_n = (gemm_n + cta_extent_n - 1) / cta_extent_n;
-  dim3 threads(kCtaShapeM, kCtaShapeN);
-  dim3 blocks((problem_size.K + cta_extent_m - 1) / cta_extent_m, uint32_t(grid_n));
-  kernel::Conv3dWgrad<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ElementCompute, ElementAccumulator,
-    ConvertOp, InnerProductOp,
-    kThreadM, kThreadN,
-    kCtaShapeM, kCtaShapeN
-  ><<< blocks, threads, 0, stream >>>(
-    problem_size, tensor_dy, tensor_x, tensor_dw_in, tensor_dw_out, alpha, beta);
-  // Only the launch status is inspected here; the stream is not synchronized.
-  return (cudaPeekAtLastError() == cudaSuccess) ? Status::kSuccess : Status::kErrorInternal;
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Runtime dispatcher for 2-D convolution: forwards every argument unchanged to Conv2dFprop,
-/// Conv2dDgrad, or Conv2dWgrad according to `convolutional_operator`.
-///
-/// Returns the selected launcher's Status, or Status::kErrorNotSupported when the operator is
-/// none of kFprop, kDgrad, kWgrad.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-Status Conv2d(
-  conv::Operator convolutional_operator,
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_A,
-  TensorRef<ElementB, LayoutB> tensor_B,
-  TensorRef<ElementC, LayoutC> tensor_C,
-  TensorRef<ElementC, LayoutC> tensor_D,
-  ElementCompute alpha,
-  ElementCompute beta,
-  cudaStream_t stream = nullptr) {
-  if (convolutional_operator == conv::Operator::kFprop) {
-    return Conv2dFprop<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ElementCompute, ElementAccumulator, ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
-  }
-  if (convolutional_operator == conv::Operator::kDgrad) {
-    return Conv2dDgrad<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ElementCompute, ElementAccumulator, ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
-  }
-  if (convolutional_operator == conv::Operator::kWgrad) {
-    return Conv2dWgrad<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ElementCompute, ElementAccumulator, ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
-  }
-  // Any other operator value has no reference launcher here.
-  return Status::kErrorNotSupported;
-}
-/// Runtime dispatcher for 3-D convolution: forwards every argument unchanged to Conv3dFprop,
-/// Conv3dDgrad, or Conv3dWgrad according to `convolutional_operator`.
-///
-/// Returns the selected launcher's Status, or Status::kErrorNotSupported when the operator is
-/// none of kFprop, kDgrad, kWgrad.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-Status Conv3d(
-  conv::Operator convolutional_operator,
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_A,
-  TensorRef<ElementB, LayoutB> tensor_B,
-  TensorRef<ElementC, LayoutC> tensor_C,
-  TensorRef<ElementC, LayoutC> tensor_D,
-  ElementCompute alpha,
-  ElementCompute beta,
-  cudaStream_t stream = nullptr) {
-  // Stays at "not supported" unless one of the three known operators is matched below.
-  Status status = Status::kErrorNotSupported;
-  switch (convolutional_operator) {
-  case conv::Operator::kFprop:
-    status = Conv3dFprop<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ElementCompute, ElementAccumulator, ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
-    break;
-  case conv::Operator::kDgrad:
-    status = Conv3dDgrad<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ElementCompute, ElementAccumulator, ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
-    break;
-  case conv::Operator::kWgrad:
-    status = Conv3dWgrad<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ElementCompute, ElementAccumulator, ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream);
-    break;
-  default:
-    break;
-  }
-  return status;
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace device
-}  // namespace reference
-}  // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm.h DELETED Viewed

@@ -1,385 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for GEMM in device-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/util/reference/device/kernel/gemm.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Launches the device reference GEMM kernel on rank-2 operands referenced through TensorRef.
-///
-/// alpha, beta, tensor_c, tensor_d and initial_accum are passed straight through to
-/// kernel::Gemm. Spelling out every template argument (the accumulator type in particular) is
-/// tedious, so the accumulator type can instead be deduced from 'initial_accum', e.g. by
-/// passing AccumulatorType(0) as the last argument.
-///
-/// The launch does not name a stream, and no status is reported back to the caller.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename AccumulatorType,
-  typename InnerProductOp = multiply_add<AccumulatorType>,
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>
->
-void compute_gemm(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementC, LayoutC> tensor_d,
-  AccumulatorType initial_accum) {
-  static_assert(
-    LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-    "Tensors must be of rank 2");
-  // Simple blocking: the grid is sized as if every thread of a 16x8 block covers one
-  // OutputTile. This reference path is not expected to approach peak performance.
-  using OutputTile = MatrixShape<4, 4>;
-  dim3 block(16, 8);
-  auto const rows_per_block = block.x * OutputTile::kRow;
-  auto const columns_per_block = block.y * OutputTile::kColumn;
-  // Ceiling division so partially covered edges of the m x n problem still get a block.
-  dim3 grid(
-    (problem_size.m() + rows_per_block - 1) / rows_per_block,
-    (problem_size.n() + columns_per_block - 1) / columns_per_block
-  );
-  using RefA = TensorRef<ElementA, LayoutA>;
-  using RefB = TensorRef<ElementB, LayoutB>;
-  using RefC = TensorRef<ElementC, LayoutC>;
-  kernel::Gemm<
-    RefA, RefB, RefC,
-    ScalarType, AccumulatorType,
-    OutputTile,
-    InnerProductOp, ConvertOp
-  ><<< grid, block >>>(
-    problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// In-place convenience overload of compute_gemm for rank-2 operands referenced through
-/// TensorRef.
-///
-/// Forwards to the overload taking separate source and destination tensors, passing tensor_c
-/// for both, so the result overwrites tensor_c.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename AccumulatorType,
-  typename InnerProductOp = multiply_add<AccumulatorType>,
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>
->
-void compute_gemm(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  AccumulatorType initial_accum) {
-  // tensor_c appears twice on purpose: once as the source operand, once as the destination.
-  compute_gemm<
-    ElementA, LayoutA,
-    ElementB, LayoutB,
-    ElementC, LayoutC,
-    ScalarType, AccumulatorType,
-    InnerProductOp, ConvertOp
-  >(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, initial_accum);
-}
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename AccumulatorType,
-  typename InnerProductOp = cutlass::arch::OpMultiplyAdd  // operator tag; the partial specializations in this file key on this parameter
->
-struct Gemm;  // primary template is declared only; behavior comes from the partial specializations
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Gemm functor selected by arch::OpMultiplyAdd: runs compute_gemm with
-/// multiply_add<AccumulatorType> as the inner-product operation.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename AccumulatorType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-            ScalarType, AccumulatorType, arch::OpMultiplyAdd> {
-  /// In-place form: tensor_c is used both as the source operand and as the destination.
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  AccumulatorType initial_accum = AccumulatorType(0)) {
-    static_assert(
-      LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-      "Tensors must be of rank 2");
-    // Reuse the out-of-place overload with tensor_c standing in for the destination.
-    (*this)(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, initial_accum);
-  }
-  /// Out-of-place form: reads tensor_c and writes the result to tensor_d.
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  AccumulatorType initial_accum = AccumulatorType(0)) {
-    static_assert(
-      LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-      "Tensors must be of rank 2");
-    using InnerProduct = multiply_add<AccumulatorType>;
-    compute_gemm<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ScalarType, AccumulatorType, InnerProduct
-    >(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Gemm functor selected by arch::OpMultiplyAddSaturate: runs compute_gemm with
-/// multiply_add<AccumulatorType> as the inner-product operation and
-/// NumericConverterClamp<ElementC, ScalarType> as the output conversion.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename AccumulatorType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
-            AccumulatorType, arch::OpMultiplyAddSaturate> {
-  /// In-place form: tensor_c is used both as the source operand and as the destination.
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  AccumulatorType initial_accum = AccumulatorType(0)) {
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    // Reuse the out-of-place overload with tensor_c standing in for the destination.
-    (*this)(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, initial_accum);
-  }
-  /// Out-of-place form: reads tensor_c and writes the result to tensor_d.
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  AccumulatorType initial_accum = AccumulatorType(0)) {
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    using InnerProduct = multiply_add<AccumulatorType>;
-    using ClampingConvert = NumericConverterClamp<ElementC, ScalarType>;
-    compute_gemm<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ScalarType, AccumulatorType, InnerProduct, ClampingConvert
-    >(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Gemm functor selected by arch::OpXorPopc: runs compute_gemm with
-/// xor_add<AccumulatorType> as the inner-product operation.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename AccumulatorType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
-            AccumulatorType, arch::OpXorPopc> {
-  /// In-place form: tensor_c is used both as the source operand and as the destination.
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  AccumulatorType initial_accum = AccumulatorType(0)) {
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    // Reuse the out-of-place overload with tensor_c standing in for the destination.
-    (*this)(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, initial_accum);
-  }
-  /// Out-of-place form: reads tensor_c and writes the result to tensor_d.
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  AccumulatorType initial_accum = AccumulatorType(0)) {
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    using InnerProduct = xor_add<AccumulatorType>;
-    compute_gemm<
-      ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-      ScalarType, AccumulatorType, InnerProduct
-    >(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Batched GEMM
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Launches the device reference kernel for a batch of GEMMs whose matrices all share the
-/// dimensions given by problem_size.
-//
-// TensorRefCollection* is a type satisfying the TensorRefCollection concept.
-//
-// batch_count becomes the z extent of the launch grid. The launch does not name a stream,
-// and no status is reported back to the caller.
-//
-template <
-  typename TensorRefCollectionA,
-  typename TensorRefCollectionB,
-  typename TensorRefCollectionC,
-  typename ScalarType,
-  typename AccumulatorType,
-  typename InnerProductOp,
-  typename ConvertOp
->
-void BatchedGemm(
-  gemm::GemmCoord problem_size,
-  int batch_count,
-  ScalarType alpha,
-  TensorRefCollectionA const& tensor_a,
-  TensorRefCollectionB const& tensor_b,
-  ScalarType beta,
-  TensorRefCollectionC &tensor_c,
-  AccumulatorType initial_accum) {
-  static_assert(
-    TensorRefCollectionA::kRank == 2 && TensorRefCollectionB::kRank == 2 &&
-    TensorRefCollectionC::kRank == 2, "Tensors must be of rank 2");
-  // Simple blocking: the grid is sized as if every thread of a 16x8 block covers one
-  // OutputTile. This reference path is not expected to approach peak performance.
-  using OutputTile = MatrixShape<4, 4>;
-  dim3 block(16, 8);
-  auto const rows_per_block = block.x * OutputTile::kRow;
-  auto const columns_per_block = block.y * OutputTile::kColumn;
-  // Ceiling division in x/y covers the m x n problem; z enumerates the batch.
-  dim3 grid(
-    (problem_size.m() + rows_per_block - 1) / rows_per_block,
-    (problem_size.n() + columns_per_block - 1) / columns_per_block,
-    batch_count
-  );
-  kernel::BatchedGemm<
-    TensorRefCollectionA, TensorRefCollectionB, TensorRefCollectionC,
-    ScalarType, AccumulatorType,
-    OutputTile,
-    InnerProductOp, ConvertOp
-  ><<< grid, block >>>(
-    problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
-}
-/// Computes a batch of GEMMs over a set of matrices of common dimension, using
-/// multiply_add<AccumulatorType> as the inner product and a zero initial accumulator.
-//
-// TensorRefCollection* is a type satisfying the TensorRefCollection concept.
-//
-// AccumulatorType does not appear in the parameter list, so callers must name it explicitly.
-//
-template <
-  typename TensorRefCollectionA,
-  typename TensorRefCollectionB,
-  typename TensorRefCollectionC,
-  typename ScalarType,
-  typename AccumulatorType
->
-void BatchedGemm(
-  gemm::GemmCoord problem_size,
-  int batch_count,
-  ScalarType alpha,
-  TensorRefCollectionA const& tensor_a,
-  TensorRefCollectionB const& tensor_b,
-  ScalarType beta,
-  TensorRefCollectionC &tensor_c) {
-  // NOTE(review): assumes the collection exposes a nested TensorRef type with an Element
-  // member, as cutlass::TensorRef does - confirm against the TensorRefCollection concept.
-  using ElementC = typename TensorRefCollectionC::TensorRef::Element;
-  // InnerProductOp and ConvertOp of the full overload cannot be deduced from the call
-  // arguments, so all template arguments are named here. batch_count must be forwarded too:
-  // the previous call dropped it, shifting every following argument by one position.
-  BatchedGemm<
-    TensorRefCollectionA, TensorRefCollectionB, TensorRefCollectionC,
-    ScalarType, AccumulatorType,
-    multiply_add<AccumulatorType>,
-    NumericConverter<ElementC, ScalarType>
-  >(problem_size, batch_count, alpha, tensor_a, tensor_b, beta, tensor_c, AccumulatorType(0));
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm_complex.h DELETED Viewed

@@ -1,350 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for complex-valued GEMM in device-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-namespace kernel {
-/// Reference GEMM kernel with optional complex conjugation of the A and B operands.
-///
-/// Each thread owns a kMblock-by-kNblock tile of the output. For every batch it handles, the
-/// thread seeds its accumulators with 'initial_accum', accumulates the products of A(row, k)
-/// and B(k, col) for k in [0, K), and stores convert_op(alpha * accum + beta * C) into D.
-/// Output elements with row >= M or col >= N are skipped.
-///
-/// Batches are distributed over blockIdx.z: a thread starts at batch blockIdx.z and advances by
-/// gridDim.z until 'batch_count' is reached, moving every tensor by its batch stride each time.
-///
-/// Exposing 'initial_accum' as a function argument lets ComputeType be picked up from the
-/// argument instead of being named explicitly together with all other template arguments.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename ElementD = ElementC,
-  typename ConvertOp = NumericConverter<ElementD, ScalarType>,
-  typename InnerProductOp = multiply_add<ComputeType>,
-  int kMblock = 4,
-  int kNblock = 4
->
-__global__ void GemmComplex(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementD, LayoutC> tensor_d,
-  ComputeType initial_accum,
-  int batch_count = 1,
-  int64_t batch_stride_A = 0,
-  int64_t batch_stride_B = 0,
-  int64_t batch_stride_C = 0,
-  int64_t batch_stride_D = 0) {
-  static_assert(
-    LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-    "Tensors must be of rank 2");
-  int const extent_m = problem_size.m();
-  int const extent_n = problem_size.n();
-  int const extent_k = problem_size.k();
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  // Top-left coordinate of the output tile owned by this thread.
-  int tile_row = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock;
-  int tile_col = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock;
-  // Position every tensor at the first batch handled by this block.
-  int batch = blockIdx.z;
-  tensor_a.add_pointer_offset(batch * batch_stride_A);
-  tensor_b.add_pointer_offset(batch * batch_stride_B);
-  tensor_c.add_pointer_offset(batch * batch_stride_C);
-  tensor_d.add_pointer_offset(batch * batch_stride_D);
-  while (batch < batch_count) {
-    // Per-thread accumulator tile, re-seeded for every batch.
-    ComputeType tile_accum[kMblock][kNblock];
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kMblock; ++i) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < kNblock; ++j) {
-        tile_accum[i][j] = initial_accum;
-      }
-    }
-    // Mainloop over the reduction dimension.
-    for (int k = 0; k < extent_k; ++k) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < kNblock; ++j) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < kMblock; ++i) {
-          int const row = tile_row + i;
-          int const col = tile_col + j;
-          if (row >= extent_m || col >= extent_n) {
-            continue;  // outside the problem extent
-          }
-          ElementA const a_elem = tensor_a.at(MatrixCoord(row, k));
-          ElementB const b_elem = tensor_b.at(MatrixCoord(k, col));
-          ComputeType a_val = ComputeType(a_elem);
-          ComputeType b_val = ComputeType(b_elem);
-          if (transform_a == ComplexTransform::kConjugate) {
-            a_val = conj(a_val);
-          }
-          if (transform_b == ComplexTransform::kConjugate) {
-            b_val = conj(b_val);
-          }
-          tile_accum[i][j] = inner_product_op(a_val, b_val, tile_accum[i][j]);
-        }
-      }
-    }
-    // Epilogue: scale, add the source tensor and convert to the output element type.
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < kNblock; ++j) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kMblock; ++i) {
-        int const row = tile_row + i;
-        int const col = tile_col + j;
-        if (row >= extent_m || col >= extent_n) {
-          continue;
-        }
-        MatrixCoord const coord = MatrixCoord(row, col);
-        tensor_d.at(coord) = convert_op(
-          alpha * ScalarType(tile_accum[i][j]) +
-          beta * ScalarType(tensor_c.at(coord)));
-      }
-    }
-    // Advance to the next batch assigned to this block.
-    tensor_a.add_pointer_offset(batch_stride_A * gridDim.z);
-    tensor_b.add_pointer_offset(batch_stride_B * gridDim.z);
-    tensor_c.add_pointer_offset(batch_stride_C * gridDim.z);
-    tensor_d.add_pointer_offset(batch_stride_D * gridDim.z);
-    batch += gridDim.z;
-  } // while (batch < batch_count)
-}
-} // namespace kernel
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Launches the reference complex-valued GEMM kernel on rank-2 tensors referenced by TensorRef
-/// objects. Per batch, the kernel stores convert_op(alpha * accum + beta * C) into D, where
-/// accum starts at 'initial_accum' and sums the products of A and B over the K dimension, with
-/// A and/or B conjugated when the matching transform is ComplexTransform::kConjugate.
-///
-/// Explicitly naming types needed by this template can be cumbersome, particularly for the
-/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
-/// AccumulatorType(0) as the last function argument can be easier than naming all template
-/// arguments explicitly.
-///
-/// \param problem_size    GEMM extents (m, n, k).
-/// \param batch_count     Number of batches; consecutive batches are 'batch_stride_*' elements
-///                        apart in the respective tensor.
-///
-/// The launch uses 16x8 thread blocks. Each thread computes a 4x4 output tile; if that would
-/// require more than 65535 blocks along grid.y, a 4x16 tile is used instead.
-///
-/// The z-extent of the grid is min(batch_count, 65535). The kernel walks the batches with a
-/// stride of gridDim.z, so a clamped grid still visits every batch. (Reducing batch_count
-/// modulo 65535 instead would request a z-extent of 0 for every multiple of 65535.)
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename ElementD = ElementC,
-  typename ConvertOp = NumericConverter<ElementD, ScalarType>,
-  typename InnerProductOp = multiply_add<ComputeType>
->
-void GemmComplex(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementD, LayoutC> tensor_d,
-  ComputeType initial_accum,
-  int batch_count = 1,
-  int64_t batch_stride_A = 0,
-  int64_t batch_stride_B = 0,
-  int64_t batch_stride_C = 0,
-  int64_t batch_stride_D = 0) {
-  static_assert(
-    LayoutA::kRank == 2 &&
-    LayoutB::kRank == 2 &&
-    LayoutC::kRank == 2, "Tensors must be of rank 2");
-  // Largest extent requested along the y and z dimensions of the launch grid.
-  unsigned int const kMaxGridExtentYZ = std::numeric_limits<uint16_t>::max();
-  // Clamp (rather than wrap) the number of batches mapped onto grid.z.
-  unsigned int const grid_z =
-    (batch_count < static_cast<int>(kMaxGridExtentYZ))
-      ? static_cast<unsigned int>(batch_count)
-      : kMaxGridExtentYZ;
-  int const kMblock = 4;
-  int const kNblock = 4;
-  dim3 block(16, 8);
-  dim3 grid(
-    (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock),
-    (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock),
-    grid_z
-  );
-  if (grid.y <= kMaxGridExtentYZ) {
-    kernel::GemmComplex<
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      ScalarType,
-      ComputeType,
-      ElementD,
-      ConvertOp,
-      InnerProductOp,
-      kMblock,
-      kNblock
-    ><<< grid, block >>>(
-      problem_size,
-      alpha,
-      tensor_a,
-      transform_a,
-      tensor_b,
-      transform_b,
-      beta,
-      tensor_c,
-      tensor_d,
-      initial_accum,
-      batch_count,
-      batch_stride_A,
-      batch_stride_B,
-      batch_stride_C,
-      batch_stride_D
-    );
-  } else {
-    // Using bigger thread tile size so that fewer blocks are needed along grid.y
-    int const kBigMblock = 4;
-    int const kBigNblock = 16;
-    dim3 Bigblock(16, 8);
-    dim3 Biggrid(
-      (problem_size.m() + Bigblock.x * kBigMblock - 1) / (Bigblock.x * kBigMblock),
-      (problem_size.n() + Bigblock.y * kBigNblock - 1) / (Bigblock.y * kBigNblock),
-      grid_z
-    );
-    kernel::GemmComplex<
-      ElementA,
-      LayoutA,
-      ElementB,
-      LayoutB,
-      ElementC,
-      LayoutC,
-      ScalarType,
-      ComputeType,
-      ElementD,
-      ConvertOp,
-      InnerProductOp,
-      kBigMblock,
-      kBigNblock
-    ><<< Biggrid, Bigblock >>>(
-      problem_size,
-      alpha,
-      tensor_a,
-      transform_a,
-      tensor_b,
-      transform_b,
-      beta,
-      tensor_c,
-      tensor_d,
-      initial_accum,
-      batch_count,
-      batch_stride_A,
-      batch_stride_B,
-      batch_stride_C,
-      batch_stride_D
-    );
-  }
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Convenience overload of the complex-valued reference GEMM on rank-2 tensors referenced by
-/// TensorRef objects.
-///
-/// The accumulator type is taken to be ScalarType: the call is forwarded with an initial
-/// accumulator of ScalarType(0), and the batch arguments are left at their defaults.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ElementD = ElementC
->
-void GemmComplex(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementD, LayoutC> tensor_d) {
-  GemmComplex(
-    problem_size,
-    alpha,
-    tensor_a, transform_a,
-    tensor_b, transform_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    ScalarType(0));
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h DELETED Viewed

@@ -1,311 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for planar complex-valued GEMM in device-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_ref_planar_complex.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-namespace kernel {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Edge length of the square output tile computed by each thread of GemmPlanarComplex.
-static int const kGemmPlanarComplexBlockSize = 4;
-/// Reference GEMM kernel for planar complex tensors (accessed through TensorRefPlanarComplex).
-///
-/// Each thread owns a kGemmPlanarComplexBlockSize-square tile of the output. It seeds the
-/// accumulators with 'initial_accum', accumulates the products of A(row, k) and B(k, col) for
-/// k in [0, K) - conjugating an operand when its transform is ComplexTransform::kConjugate -
-/// and stores alpha * accum + beta * C into D, converting the real and imaginary parts
-/// separately with ConvertOp. C is only read when beta is non-zero. Output elements with
-/// row >= M or col >= N are skipped.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
-  typename InnerProductOp = multiply_add<complex<ComputeType>>
->
-__global__ void GemmPlanarComplex(
-  gemm::GemmCoord problem_size,
-  complex<ScalarType> alpha,
-  TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  complex<ScalarType> beta,
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_d,
-  complex<ComputeType> initial_accum) {
-  int const kMblock = kGemmPlanarComplexBlockSize;
-  int const kNblock = kGemmPlanarComplexBlockSize;
-  using ComplexA = typename TensorRefPlanarComplex<ElementA, LayoutA>::ComplexElement;
-  using ComplexB = typename TensorRefPlanarComplex<ElementB, LayoutB>::ComplexElement;
-  using ComplexC = typename TensorRefPlanarComplex<ElementC, LayoutC>::ComplexElement;
-  // Note: batch is ignored.
-  int const extent_m = problem_size.m();
-  int const extent_n = problem_size.n();
-  int const extent_k = problem_size.k();
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  complex<ComputeType> tile_accum[kMblock][kNblock];
-  // Top-left coordinate of the output tile owned by this thread.
-  int tile_row = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock;
-  int tile_col = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock;
-  CUTLASS_PRAGMA_UNROLL
-  for (int i = 0; i < kMblock; ++i) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < kNblock; ++j) {
-      tile_accum[i][j] = initial_accum;
-    }
-  }
-  // Mainloop over the reduction dimension.
-  CUTLASS_PRAGMA_NO_UNROLL
-  for (int k = 0; k < extent_k; ++k) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < kNblock; ++j) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kMblock; ++i) {
-        int const row = tile_row + i;
-        int const col = tile_col + j;
-        if (row >= extent_m || col >= extent_n) {
-          continue;  // outside the problem extent
-        }
-        ComplexA const a_elem = tensor_a.at(MatrixCoord(row, k));
-        ComplexB const b_elem = tensor_b.at(MatrixCoord(k, col));
-        complex<ComputeType> a_val{
-          ComputeType(a_elem.real()),
-          ComputeType(a_elem.imag())
-        };
-        complex<ComputeType> b_val{
-          ComputeType(b_elem.real()),
-          ComputeType(b_elem.imag())
-        };
-        if (transform_a == ComplexTransform::kConjugate) {
-          a_val = conj(a_val);
-        }
-        if (transform_b == ComplexTransform::kConjugate) {
-          b_val = conj(b_val);
-        }
-        tile_accum[i][j] = inner_product_op(a_val, b_val, tile_accum[i][j]);
-      }
-    }
-  }
-  // The source tensor is only read when beta has a non-zero component.
-  bool const load_source = beta.real() != ScalarType() || beta.imag() != ScalarType();
-  CUTLASS_PRAGMA_UNROLL
-  for (int j = 0; j < kNblock; ++j) {
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < kMblock; ++i) {
-      int const row = tile_row + i;
-      int const col = tile_col + j;
-      if (row >= extent_m || col >= extent_n) {
-        continue;
-      }
-      MatrixCoord const coord = MatrixCoord(row, col);
-      complex<ScalarType> acc{
-        ScalarType(tile_accum[i][j].real()),
-        ScalarType(tile_accum[i][j].imag())
-      };
-      ComplexC c_elem = ComplexC();
-      if (load_source) {
-        c_elem = tensor_c.at(coord);
-      }
-      complex<ScalarType> src{
-        ScalarType(c_elem.real()),
-        ScalarType(c_elem.imag())
-      };
-      complex<ScalarType> result = alpha * acc + beta * src;
-      // Real and imaginary parts are converted to the output element type separately.
-      ComplexC d_elem;
-      d_elem.real() = convert_op(result.real());
-      d_elem.imag() = convert_op(result.imag());
-      tensor_d.at(coord) = d_elem;
-    }
-  }
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace kernel
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Launches the planar complex reference GEMM kernel on rank-2 tensors referenced by
-/// TensorRefPlanarComplex objects.
-///
-/// The grid is made of 16x8 thread blocks with a z-extent of 1; each thread covers a square
-/// tile of kernel::kGemmPlanarComplexBlockSize output elements per side.
-///
-/// 'initial_accum' is exposed as a function argument so that the accumulator type can be
-/// supplied through the argument (e.g. a zero-valued complex<ComputeType>) rather than by
-/// naming all template arguments explicitly.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
-  typename InnerProductOp = multiply_add<complex<ComputeType>>
->
-void GemmPlanarComplex(
-  gemm::GemmCoord problem_size,
-  complex<ScalarType> alpha,
-  TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  complex<ScalarType> beta,
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_d,
-  complex<ComputeType> initial_accum) {
-  static_assert(
-    LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-    "Tensors must be of rank 2");
-  int const tile_extent = kernel::kGemmPlanarComplexBlockSize;
-  dim3 block(16, 8);
-  // Output rows / columns covered by one thread block.
-  unsigned int const rows_per_block = block.x * tile_extent;
-  unsigned int const cols_per_block = block.y * tile_extent;
-  // Round up so that partial tiles at the edges are covered as well.
-  dim3 grid(
-    (problem_size.m() + rows_per_block - 1) / rows_per_block,
-    (problem_size.n() + cols_per_block - 1) / cols_per_block,
-    1);
-  kernel::GemmPlanarComplex<
-    ElementA,
-    LayoutA,
-    ElementB,
-    LayoutB,
-    ElementC,
-    LayoutC,
-    ScalarType,
-    ComputeType,
-    ConvertOp,
-    InnerProductOp
-  ><<< grid, block >>>(
-    problem_size,
-    alpha,
-    tensor_a, transform_a,
-    tensor_b, transform_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    initial_accum
-  );
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Convenience overload of the planar complex reference GEMM on rank-2 tensors referenced by
-/// TensorRefPlanarComplex objects.
-///
-/// The accumulator type is taken to be complex<ScalarType>: the call is forwarded with a
-/// value-initialized complex<ScalarType> as the initial accumulator.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType
->
-void GemmPlanarComplex(
-  gemm::GemmCoord problem_size,
-  complex<ScalarType> alpha,
-  TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  complex<ScalarType> beta,
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_d) {
-  GemmPlanarComplex(
-    problem_size,
-    alpha,
-    tensor_a,
-    transform_a,
-    tensor_b,
-    transform_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    complex<ScalarType>());
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace device
-} // namespace reference
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/gett.hpp DELETED Viewed

@@ -1,146 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief GETT device reference code
-*/
-#pragma once
-#include <cute/tensor.hpp>
-namespace cutlass::reference::device {
-/// Reference GETT kernel over rank-3 CuTe tensors: D is (M,N,L), A is (M,K,L), B is (N,K,L)
-/// and C is (M,N,L).
-///
-/// For every element (m,n,l) of D the kernel accumulates A(m,k,l) * B(n,k,l) over k in
-/// ElementAccumulator precision, starting from 'acc_init', and stores
-/// alpha * accum + beta * C(m,n,l) converted to D's value type.
-///
-/// Threads walk the elements of D in a grid-stride loop over the x dimension, so any 1-D launch
-/// configuration covers the whole tensor.
-///
-/// \param acc_init  Initial value of every accumulator (the host-side gett() passes 0).
-template <
-  class ATensor,
-  class BTensor,
-  class CTensor,
-  class DTensor,
-  class ElementAccumulator,
-  class ElementEpilogue>
-__global__ static
-void
-gett_kernel(
-  DTensor       D,
-  ATensor const A,
-  BTensor const B,
-  CTensor const C,
-  ElementEpilogue alpha, ElementEpilogue beta,
-  ElementAccumulator acc_init)
-{
-  using namespace cute;
-  static_assert(DTensor::rank == 3, "(M,N,L)");
-  static_assert(ATensor::rank == 3, "(M,K,L)");
-  static_assert(BTensor::rank == 3, "(N,K,L)");
-  static_assert(CTensor::rank == 3, "(M,N,L)");
-  // The extents of all operands have to agree with those of D.
-  assert(size<0>(A) == size<0>(D));  // M
-  assert(size<0>(C) == size<0>(D));  // M
-  assert(size<0>(B) == size<1>(D));  // N
-  assert(size<1>(C) == size<1>(D));  // N
-  assert(size<1>(A) == size<1>(B));  // K
-  assert(size<2>(A) == size<2>(D));  // L
-  assert(size<2>(B) == size<2>(D));  // L
-  assert(size<2>(C) == size<2>(D));  // L
-  NumericConverter<ElementAccumulator, typename ATensor::value_type> a_converter;
-  NumericConverter<ElementAccumulator, typename BTensor::value_type> b_converter;
-  NumericConverter<ElementEpilogue, ElementAccumulator> acc_converter;
-  NumericConverter<ElementEpilogue, typename CTensor::value_type> source_converter;
-  NumericConverter<typename DTensor::value_type, ElementEpilogue> output_converter;
-  // Thread id to each element of D
-  for (int tid = threadIdx.x + blockDim.x * blockIdx.x;
-       tid < size(D);
-       tid += blockDim.x * gridDim.x) {
-    // (m,n,l) coordinate
-    auto mnl_coord = idx2crd(tid, product_each(shape(D)));
-    auto m = get<0>(mnl_coord);
-    auto n = get<1>(mnl_coord);
-    auto l = get<2>(mnl_coord);
-    // K-mode slices of A and B that contribute to D(m,n,l)
-    auto A_ml = A(m,_,l);
-    auto B_nl = B(n,_,l);
-    // Seed the accumulator with the caller-provided initial value instead of ignoring it.
-    ElementAccumulator accum = acc_init;
-    for (int k = 0; k < size<1>(A); ++k) {
-      ElementAccumulator a = a_converter(A_ml(k));
-      ElementAccumulator b = b_converter(B_nl(k));
-      accum += a * b;
-    }
-    ElementEpilogue scaled_output = (alpha * acc_converter(accum)) + (beta * source_converter(C(m,n,l)));
-    D(m,n,l) = output_converter(scaled_output);
-  }
-}
-// Most general version
-//
-// Wraps the raw pointers and strides into rank-3 CuTe tensors - A as (M,K,L), B as (N,K,L),
-// C and D as (M,N,L) - and launches gett_kernel on 'stream' with 240 blocks of 256 threads.
-// The unnamed ElementAccumulator argument only selects the accumulator type; its value is not
-// used and the kernel is always given ElementAccumulator(0).
-template <
-  class ProblemShapeMNKL,
-  class ElementA,
-  class StrideA,
-  class ElementB,
-  class StrideB,
-  class ElementAccumulator,
-  class ElementC,
-  class StrideC,
-  class ElementD,
-  class StrideD,
-  class ElementEpilogue>
-void
-gett(
-    ProblemShapeMNKL problem_shape_mnkl,
-    ElementA const* ptr_A, StrideA stride_a_mkl,
-    ElementB const* ptr_B, StrideB stride_b_nkl,
-    ElementAccumulator _,
-    ElementC const* ptr_C, StrideC stride_c_mnl,
-    ElementD      * ptr_D, StrideD stride_d_mnl,
-    ElementEpilogue alpha, ElementEpilogue beta,
-    cudaStream_t stream = 0) {
-  using namespace cute;
-  static_assert(cute::rank(ProblemShapeMNKL{}) == 4);
-  // Unpack the (M,N,K,L) problem extents.
-  auto extent_m = get<0>(problem_shape_mnkl);
-  auto extent_n = get<1>(problem_shape_mnkl);
-  auto extent_k = get<2>(problem_shape_mnkl);
-  auto extent_l = get<3>(problem_shape_mnkl);
-  // Represent the full tensors
-  auto tensor_A = make_tensor(make_gmem_ptr(ptr_A), make_shape(extent_m, extent_k, extent_l), stride_a_mkl); // (M,K,L)
-  auto tensor_B = make_tensor(make_gmem_ptr(ptr_B), make_shape(extent_n, extent_k, extent_l), stride_b_nkl); // (N,K,L)
-  auto tensor_C = make_tensor(make_gmem_ptr(ptr_C), make_shape(extent_m, extent_n, extent_l), stride_c_mnl); // (M,N,L)
-  auto tensor_D = make_tensor(make_gmem_ptr(ptr_D), make_shape(extent_m, extent_n, extent_l), stride_d_mnl); // (M,N,L)
-  // Fixed launch configuration.
-  dim3 threads_per_block(256);
-  dim3 blocks_per_grid(240);
-  gett_kernel<<< blocks_per_grid, threads_per_block, 0, stream >>>(
-    tensor_D, tensor_A, tensor_B, tensor_C,
-    alpha, beta,
-    ElementAccumulator(0));
-}
-} // namespace cutlass::reference::device

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/gemm.h DELETED Viewed

@@ -1,162 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for GEMM in device-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/util/reference/device/thread/gemm.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-namespace kernel {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Reference GEMM kernel on rank-2 tensors referenced by TensorRef objects.
-///
-/// Every thread is assigned one OutputTile-sized tile of the output: the x thread index selects
-/// the tile's first row and the y thread index its first column. The tile is computed by a
-/// thread::Gemm object constructed from 'initial_accum' (multiply_add followed by epilogue with
-/// alpha, beta, tensor_c as source and tensor_d as destination).
-template <
-  typename TensorRefA,
-  typename TensorRefB,
-  typename TensorRefC,
-  typename ScalarType,
-  typename AccumulatorType,
-  typename OutputTile,
-  typename InnerProductOp,
-  typename ConvertOp
->
-__global__ void Gemm(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRefA tensor_a,
-  TensorRefB tensor_b,
-  ScalarType beta,
-  TensorRefC tensor_c,
-  TensorRefC tensor_d,
-  AccumulatorType initial_accum) {
-  using ThreadGemm = thread::Gemm<
-    TensorRefA,
-    TensorRefB,
-    TensorRefC,
-    ScalarType,
-    AccumulatorType,
-    OutputTile,
-    InnerProductOp,
-    ConvertOp
-  >;
-  // Map each thread to a unique tile of the output matrix
-  MatrixCoord::Index const first_row =
-    MatrixCoord::Index((threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow);
-  MatrixCoord::Index const first_column =
-    MatrixCoord::Index((threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn);
-  MatrixCoord output_coord(first_row, first_column);
-  // Compute the general matrix product
-  ThreadGemm gemm(initial_accum);
-  gemm.multiply_add(problem_size, tensor_a, tensor_b, output_coord);
-  gemm.epilogue(problem_size, alpha, beta, tensor_c, tensor_d, output_coord);
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Reference batched GEMM kernel on rank-2 tensors drawn from TensorRefCollection objects.
-///
-/// blockIdx.z selects the batch: the A, B and C tensors of that batch are fetched from their
-/// collections with at(batch_id). Within a batch every thread is assigned one OutputTile-sized
-/// tile of the output, which is computed by a thread::Gemm object constructed from
-/// 'initial_accum' (multiply_add followed by epilogue with alpha, beta and tensor_c).
-///
-/// The thread-to-tile mapping follows kernel::Gemm in this header: the x thread index is scaled
-/// by OutputTile::kRow to give the tile's first row and the y thread index by
-/// OutputTile::kColumn to give its first column. Scaling them the other way round only works
-/// for square output tiles.
-template <
-  typename TensorRefCollectionA,
-  typename TensorRefCollectionB,
-  typename TensorRefCollectionC,
-  typename ScalarType,
-  typename AccumulatorType,
-  typename OutputTile,
-  typename InnerProductOp,
-  typename ConvertOp
->
-__global__ void BatchedGemm(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRefCollectionA tensor_collection_a,
-  TensorRefCollectionB tensor_collection_b,
-  ScalarType beta,
-  TensorRefCollectionC tensor_collection_c,
-  AccumulatorType initial_accum) {
-  // Obtain batch ID
-  int batch_id = blockIdx.z;
-  // Dereference based on batch_id
-  typename TensorRefCollectionA::TensorRef tensor_a = tensor_collection_a.at(batch_id);
-  typename TensorRefCollectionB::TensorRef tensor_b = tensor_collection_b.at(batch_id);
-  typename TensorRefCollectionC::TensorRef tensor_c = tensor_collection_c.at(batch_id);
-  // Map each thread to a unique tile of the output matrix (x -> rows, y -> columns)
-  MatrixCoord output_coord(
-    MatrixCoord::Index((threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow),
-    MatrixCoord::Index((threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn)
-  );
-  // Compute the general matrix product
-  thread::Gemm<
-    typename TensorRefCollectionA::TensorRef,
-    typename TensorRefCollectionB::TensorRef,
-    typename TensorRefCollectionC::TensorRef,
-    ScalarType,
-    AccumulatorType,
-    OutputTile,
-    InnerProductOp,
-    ConvertOp
-  > gemm(initial_accum);
-  gemm.multiply_add(
-    problem_size,
-    tensor_a,
-    tensor_b,
-    output_coord);
-  gemm.epilogue(problem_size, alpha, beta, tensor_c, output_coord);
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace kernel
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h DELETED Viewed

@@ -1,168 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <curand_kernel.h>
-#include "cutlass/cutlass.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-namespace kernel {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Kernel that fills a tensor with samples from a uniform random distribution.
-///
-/// Each thread owns one index along the contiguous dimension and walks blockDim.x consecutive
-/// indices along the strided dimension, moving the output pointer by `ldm` after every store.
-template <typename T>
-__global__ void TensorInitializeUniform(
-    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
-  __shared__ curandState_t rng_state[1024];
-  // Sequence number that is distinct for every thread of the (x, y) grid
-  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
-  curandState_t *rng = &rng_state[threadIdx.x];
-  curand_init(seed, gtid, 0, rng);
-  int const contiguous = blockIdx.x * blockDim.x + threadIdx.x;
-  int strided = blockIdx.y * blockDim.x;
-  T *out = tensor + (strided * ldm + contiguous);
-  for (int step = 0; step < blockDim.x; ++step, ++strided) {
-    // Skip coordinates that fall outside the tensor extent
-    if (!(strided < dim_strided && contiguous < dim_contiguous)) {
-      continue;
-    }
-    double span = dist.uniform.max - dist.uniform.min;
-    double sample = curand_uniform(rng);
-    sample = dist.uniform.min + span * sample;
-    // Random values are cast to integer after scaling by a power of two to facilitate error
-    // testing
-    if (dist.int_scale >= 0) {
-      sample = double(int(sample * double(1 << dist.int_scale)));
-      *out = T(sample / double(1 << dist.int_scale));
-    } else {
-      *out = T(sample);
-    }
-    out += ldm;
-  }
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Kernel to initialize tensor to a Gaussian (normal) random distribution
-///
-/// Each thread owns one index along the contiguous dimension and walks blockDim.x consecutive
-/// indices along the strided dimension. Samples from curand_normal() are scaled by
-/// dist.gaussian.stddev and shifted by dist.gaussian.mean before being stored.
-template <typename T>
-__global__ void TensorInitializeGaussian(
-    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
-  __shared__ curandState_t rng_state[1024];
-  // Sequence number that is distinct for every thread of the (x, y) grid
-  uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x;
-  curand_init(seed, gtid, 0, &rng_state[threadIdx.x]);
-  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int s_idx = blockIdx.y * blockDim.x;
-  tensor += s_idx * ldm + c_idx;
-  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
-    if (s_idx < dim_strided && c_idx < dim_contiguous) {
-      double rnd = curand_normal(&rng_state[threadIdx.x]);
-      rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd;
-      // Random values are cast to integer after scaling by a power of two to facilitate error
-      // testing
-      if (dist.int_scale >= 0) {
-        rnd = double(int(rnd * double(1 << dist.int_scale)));
-        *tensor = T(rnd / double(1 << dist.int_scale));
-      } else {
-        *tensor = T(rnd);
-      }
-      // Advance to the next strided element (matches TensorInitializeUniform). Without this
-      // step every iteration overwrote the same location and the remaining rows stayed unset.
-      tensor += ldm;
-    }
-  }
-}
-/// Kernel to initialize tensor to a linear function of its coordinates:
-///
-///   offset + delta_row * c_idx + delta_column * s_idx
-///
-/// where c_idx is the index along the contiguous dimension and s_idx the index along the strided
-/// dimension. `seed` is accepted so the signature matches the other initializers; it is unused
-/// because no random numbers are drawn.
-template <typename T>
-__global__ void TensorInitializeLinear(
-    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
-  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int s_idx = blockIdx.y * blockDim.x;
-  tensor += s_idx * ldm + c_idx;
-  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
-    if (s_idx < dim_strided && c_idx < dim_contiguous) {
-      *tensor =
-          dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx;
-      // Advance to the next strided element; previously the pointer never moved, so every
-      // iteration overwrote the first element of the strip.
-      tensor += ldm;
-    }
-  }
-}
-/// Kernel to initialize tensor to an identity matrix
-///
-/// Writes T(1) where the contiguous and strided indices are equal and T(0) elsewhere. `seed` is
-/// accepted so the signature matches the other initializers; it is unused because no random
-/// numbers are drawn.
-template <typename T>
-__global__ void TensorInitializeIdentity(
-    Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) {
-  int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  int s_idx = blockIdx.y * blockDim.x;
-  tensor += s_idx * ldm + c_idx;
-  for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) {
-    if (s_idx < dim_strided && c_idx < dim_contiguous) {
-      *tensor = (c_idx == s_idx ? T(1) : T(0));
-      // Advance to the next strided element; previously the pointer never moved, so every
-      // iteration overwrote the first element of the strip.
-      tensor += ldm;
-    }
-  }
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace kernel
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h DELETED Viewed

@@ -1,159 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include "cutlass/cutlass.h"
-#include "cutlass/coord.h"
-#include "cutlass/subbyte_reference.h"
-#include "cutlass/fast_math.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-namespace kernel {
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines several helpers
-namespace detail {
-/// Recursive helper that peels one coordinate off a linear index per instantiation.
-///
-/// This level divides `index` by the product of the trailing `RankRemaining` extents, stores the
-/// quotient in coord[Rank - 1 - RankRemaining], and hands the remainder to the next level.
-template <typename Func, int Rank, int RankRemaining>
-struct TensorForEachHelper {
-  /// Constructor for general rank
-  __inline__ __device__
-  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
-    // Number of linear positions covered by one step of the coordinate resolved at this level
-    int64_t stride = 1;
-    CUTLASS_PRAGMA_UNROLL
-    for (int dim = Rank - RankRemaining; dim < Rank; ++dim) {
-      stride *= size[dim];
-    }
-    int const axis = Rank - 1 - RankRemaining;
-    coord[axis] = index / stride;
-    // Constructing the next-level helper performs the rest of the decomposition
-    TensorForEachHelper<Func, Rank, RankRemaining - 1>(func, size, coord, index % stride);
-  }
-};
-/// Terminal specialization of the for-each helper (no ranks remaining).
-///
-/// Stores the leftover index in the last coordinate and calls the functor when the finished
-/// coordinate compares less than `size`.
-template <typename Func, int Rank>
-struct TensorForEachHelper<Func, Rank, 0> {
-  /// Constructor for fastest changing rank
-  __inline__ __device__
-  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
-    coord[Rank - 1] = index;
-    // NOTE(review): relies on Coord's operator< as the bounds test - confirm its semantics
-    if (!(coord < size)) {
-      return;
-    }
-    func(coord);
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Kernel that invokes a functor once per element of a tensor's index space.
-///
-/// The functor is constructed from `params` in every thread. Linear indices are distributed
-/// across threads with a step of blockDim.x * gridDim.x and converted to coordinates by
-/// detail::TensorForEachHelper.
-template <typename Func, int Rank, typename Params>
-__global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
-  Func func(params);
-  // Total number of elements: product of all extents
-  int64_t total = 1;
-  CUTLASS_PRAGMA_UNROLL
-  for (int dim = 0; dim < Rank; ++dim) {
-    total *= size[dim];
-  }
-  int64_t const step = blockDim.x * gridDim.x;
-  CUTLASS_PRAGMA_NO_UNROLL
-  for (int64_t linear = threadIdx.x + blockIdx.x * blockDim.x; linear < total; linear += step) {
-    Coord<Rank> coord;
-    detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord, linear);
-  }
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Kernel that invokes a functor for diagonal coordinates (i, i, ..., i) with i in [start, end).
-///
-/// Each thread handles at most one diagonal position. `size` is not consulted by this kernel.
-template <typename Func, int Rank, typename Params>
-__global__ void TensorDiagonalForEach(Coord<Rank> size, Params params, int start, int end) {
-  Func func(params);
-  int64_t diag = threadIdx.x + blockIdx.x * blockDim.x + start;
-  // Threads past the end of the requested range have nothing to do
-  if (!(diag < end)) {
-    return;
-  }
-  Coord<Rank> coord;
-  CUTLASS_PRAGMA_UNROLL
-  for (int dim = 0; dim < Rank; ++dim) {
-    coord[dim] = diag;
-  }
-  func(coord);
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Kernel that assigns the result of a nullary functor to each of `capacity` elements at `ptr`.
-///
-/// The functor is constructed from `params` in every thread. Elements are accessed through
-/// ReferenceFactory<Element>::get, and threads step by blockDim.x * gridDim.x.
-template <typename Element, typename Func>
-__global__ void BlockForEach(
-  Element *ptr,
-  size_t capacity,
-  typename Func::Params params) {
-  Func func(params);
-  size_t pos = threadIdx.x + blockIdx.x * blockDim.x;
-  while (pos < capacity) {
-    ReferenceFactory<Element>::get(ptr, pos) = func();
-    pos += blockDim.x * gridDim.x;
-  }
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace kernel
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h DELETED Viewed

@@ -1,355 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for complex-valued Rank 2K update in device-side code.
-*/
-#pragma once
-#include "cutlass/blas3.h"
-#include "cutlass/complex.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-namespace kernel {
-/// Computes a rank-2k update among matrices (tensors of rank=2) pointed to by TensorRef objects.
-///
-/// For every (row, col) inside the triangle selected by `fill_mode_c`, the kernel accumulates
-///
-///   accum = sum_k  A(row, k) * B'(col, k)  +  B(row, k) * A'(col, k)
-///
-/// where the primed operands are conjugated when `blas_mode` is BlasMode::kHermitian, and then
-/// stores convert_op(alpha * accum + beta * C(row, col)) into D(row, col). Each thread handles a
-/// kMblock x kNblock tile; batches are walked with a step of gridDim.z.
-///
-/// Explicitly naming types needed by this template can be cumbersome, particularly for the
-/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
-/// AccumulatorType(0) as the last function argument can be easier than naming all template
-/// arguments explicitly.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
-  typename InnerProductOp = multiply_add<ComputeType>,
-  int kMblock = 4,
-  int kNblock = 4
->
-__global__ void Rank2KComplex(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementC, LayoutC> tensor_d,
-  ComputeType initial_accum,
-  FillMode fill_mode_c,
-  BlasMode blas_mode,
-  int batch_count = 1,
-  int64_t batch_stride_A = 0,
-  int64_t batch_stride_B = 0,
-  int64_t batch_stride_C = 0,
-  int64_t batch_stride_D = 0) {
-  static_assert(
-    LayoutA::kRank == 2 &&
-    LayoutB::kRank == 2 &&
-    LayoutC::kRank == 2, "Tensors must be of rank 2");
-  int const M = problem_size.m();
-  int const N = problem_size.n();
-  int const K = problem_size.k();
-  // The output of a rank-2k update is square. This must be a comparison: the previous
-  // 'M=N' was an assignment to a const variable.
-  assert(M == N);
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  // Top-left corner of the output tile owned by this thread
-  int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock;
-  int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock;
-  int batch_idx = blockIdx.z;
-  // Move all operands to the first batch handled by this block
-  tensor_a.add_pointer_offset(batch_idx * batch_stride_A);
-  tensor_b.add_pointer_offset(batch_idx * batch_stride_B);
-  tensor_c.add_pointer_offset(batch_idx * batch_stride_C);
-  tensor_d.add_pointer_offset(batch_idx * batch_stride_D);
-  for (; batch_idx < batch_count; batch_idx += gridDim.z) {
-    // Compute matrix product using blocks
-    ComputeType accum[kMblock][kNblock];
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < kNblock; j++) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kMblock; i++) {
-        accum[i][j] = initial_accum;
-      }
-    }
-    for (int k_block = 0; k_block < K; ++k_block) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < kNblock; j++) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < kMblock; i++) {
-          int row = row_block + i;
-          int col = col_block + j;
-          // Only elements inside the matrix and inside the requested triangle are updated
-          if (row < M && col < N &&
-             ( (fill_mode_c == FillMode::kLower && row >= col) ||
-              (fill_mode_c == FillMode::kUpper && row <= col) )
-            ) {
-            // A x B^T (Symmetric) or A x B^H (Hermitian)
-            // complex conjugation on operandB (b_t) is function of blas3 computation
-            ElementA a = tensor_a.at(MatrixCoord(row, k_block));
-            ElementB b_t = (blas_mode == BlasMode::kHermitian) ?
-                          conj(tensor_b.at(MatrixCoord(col, k_block))) :
-                          tensor_b.at(MatrixCoord(col, k_block));
-            ComputeType a_ik = ComputeType(a);
-            ComputeType b_jk = ComputeType(b_t);
-            // complex conjugation is a function of operand layouts
-            if (transform_a == ComplexTransform::kConjugate) {
-              a_ik = conj(a_ik);
-            }
-            // complex conjugation is a function of operand layouts
-            if (transform_b == ComplexTransform::kConjugate) {
-              b_jk = conj(b_jk);
-            }
-            accum[i][j] = inner_product_op(a_ik, b_jk,  accum[i][j]);
-            // B x A^T (Symmetric) or B x A^H (Hermitian)
-            // complex conjugation on operandB (a_t) is function of blas3 computation
-            ElementB b = tensor_b.at(MatrixCoord(row, k_block));
-            ElementA a_t = (blas_mode == BlasMode::kHermitian) ?
-                            conj(tensor_a.at(MatrixCoord(col, k_block))):
-                            tensor_a.at(MatrixCoord(col, k_block));
-            ComputeType b_ik = ComputeType(b);
-            ComputeType a_jk = ComputeType(a_t);
-            // complex conjugation here is a function of operand layouts
-            if (transform_b == ComplexTransform::kConjugate) {
-              b_ik = conj(b_ik);
-            }
-            // complex conjugation here is a function of operand layouts
-            if (transform_a == ComplexTransform::kConjugate) {
-              a_jk = conj(a_jk);
-            }
-            // Second term of the update uses the operands loaded just above (b_ik, a_jk);
-            // the previous code referenced an undeclared 'b_kj' and reused a_ik.
-            accum[i][j] = inner_product_op(b_ik, a_jk,  accum[i][j]);
-          }
-        }
-      }
-    }
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < kNblock; j++) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < kMblock; i++) {
-        int row = row_block + i;
-        int col = col_block + j;
-        MatrixCoord coord = MatrixCoord(row, col);
-        if (row < M && col < N &&
-            ((fill_mode_c == FillMode::kLower && row >= col) ||
-             (fill_mode_c == FillMode::kUpper && row <= col))
-          ) {
-          ScalarType c = tensor_c.at(coord);
-          // The imaginary parts of the diagonal elements of
-          // a complex data type are assumed and set to zero
-          if (blas_mode == BlasMode::kHermitian) {
-            c = (row == col) ? real(c) : c;
-          }
-          tensor_d.at(coord) = convert_op(
-            alpha * ScalarType(accum[i][j]) +
-            beta * c);
-        }
-      }
-    }
-    // Step every operand to the next batch assigned to this block
-    tensor_a.add_pointer_offset(batch_stride_A * gridDim.z);
-    tensor_b.add_pointer_offset(batch_stride_B * gridDim.z);
-    tensor_c.add_pointer_offset(batch_stride_C * gridDim.z);
-    tensor_d.add_pointer_offset(batch_stride_D * gridDim.z);
-  } // for (batch_idx)
-}
-} // namespace kernel
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Launches the device reference rank-2k update kernel (kernel::Rank2KComplex) on matrices
-/// (tensors of rank=2) pointed to by TensorRef objects.
-///
-/// The launch uses a 16x8 thread block in which every thread computes a 4x4 output tile. The
-/// grid's z extent is batch_count clamped to 65535; the kernel itself steps over batches by
-/// gridDim.z, so larger batch counts are still fully processed. The launch status is not checked
-/// here.
-///
-/// Explicitly naming types needed by this template can be cumbersome, particularly for the
-/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
-/// AccumulatorType(0) as the last function argument can be easier than naming all template
-/// arguments explicitly.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
-  typename InnerProductOp = multiply_add<ComputeType>
->
-void Rank2KComplex(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementC, LayoutC> tensor_d,
-  ComputeType initial_accum,
-  FillMode fill_mode_c,
-  BlasMode blas_mode,
-  int batch_count = 1,
-  int64_t batch_stride_A = 0,
-  int64_t batch_stride_B = 0,
-  int64_t batch_stride_C = 0,
-  int64_t batch_stride_D = 0) {
-  static_assert(
-    LayoutA::kRank == 2 &&
-    LayoutB::kRank == 2 &&
-    LayoutC::kRank == 2, "Tensors must be of rank 2");
-  int const kMblock = 4;
-  int const kNblock = 4;
-  // Clamp (rather than take the remainder of) the batch count: 'batch_count % 65535' evaluates
-  // to 0 whenever batch_count is a multiple of 65535, which is an invalid grid dimension.
-  int const kMaxGridZ = std::numeric_limits<uint16_t>::max();
-  int const grid_z = (batch_count < kMaxGridZ) ? batch_count : kMaxGridZ;
-  dim3 block(16, 8);
-  dim3 grid(
-    (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock),
-    (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock),
-    grid_z
-  );
-  kernel::Rank2KComplex<
-    ElementA,
-    LayoutA,
-    ElementB,
-    LayoutB,
-    ElementC,
-    LayoutC,
-    ScalarType,
-    ComputeType,
-    ConvertOp,
-    InnerProductOp,
-    kMblock,
-    kNblock
-  ><<< grid, block >>>(
-    problem_size,
-    alpha,
-    tensor_a,
-    transform_a,
-    tensor_b,
-    transform_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    initial_accum,
-    fill_mode_c,
-    blas_mode,
-    batch_count,
-    batch_stride_A,
-    batch_stride_B,
-    batch_stride_C,
-    batch_stride_D
-  );
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Convenience overload of the device reference rank-2k update.
-///
-/// Forwards to the general Rank2KComplex overload with a zero-valued ScalarType as the initial
-/// accumulator, so the accumulator type is the same type as the scalars. The batch arguments
-/// are left at their defaults.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType
->
-void Rank2KComplex(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementC, LayoutC> tensor_d,
-  FillMode fill_mode_c,
-  BlasMode blas_mode) {
-  // Passing a ScalarType accumulator makes the general overload deduce ComputeType = ScalarType
-  ScalarType const zero_accum = ScalarType(0);
-  Rank2KComplex(
-    problem_size,
-    alpha,
-    tensor_a,
-    transform_a,
-    tensor_b,
-    transform_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    zero_accum,
-    fill_mode_c,
-    blas_mode);
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_compare.h DELETED Viewed

@@ -1,250 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/* \file
-  \brief Defines device-side comparison operations on contiguous blocks of elements.
-*/
-#pragma once
-// Standard Library includes
-#include <utility>
-// Cutlass includes
-#include "cutlass/cutlass.h"
-#include "cutlass/relatively_equal.h"
-#include "cutlass/util/distribution.h"
-#include "tensor_foreach.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace kernel {
-/// Kernel that clears `*equal` when any pair of corresponding elements differs.
-///
-/// Threads step through the blocks by gridDim.x * blockDim.x and stop at their first mismatch.
-/// `*equal` is only ever written with 0, so the caller must set it to a nonzero value first.
-template <typename Element>
-__global__ void BlockCompareEqual(
-  int *equal,
-  Element const *ptr_A,
-  Element const *ptr_B,
-  size_t capacity) {
-  size_t pos = threadIdx.x + blockDim.x * blockIdx.x;
-  while (pos < capacity) {
-    Element lhs = cutlass::ReferenceFactory<Element>::get(ptr_A, pos);
-    Element rhs = cutlass::ReferenceFactory<Element>::get(ptr_B, pos);
-    if (lhs != rhs) {
-      *equal = 0;
-      return;
-    }
-    pos += gridDim.x * blockDim.x;
-  }
-}
-/// Kernel that clears `*equal` when any pair of corresponding elements fails
-/// relatively_equal(a, b, epsilon, nonzero_floor).
-///
-/// Threads step through the blocks by gridDim.x * blockDim.x and stop at their first mismatch.
-/// `*equal` is only ever written with 0, so the caller must set it to a nonzero value first.
-template <typename Element>
-__global__ void BlockCompareRelativelyEqual(
-  int *equal,
-  Element const *ptr_A,
-  Element const *ptr_B,
-  size_t capacity,
-  Element epsilon,
-  Element nonzero_floor) {
-  size_t pos = threadIdx.x + blockDim.x * blockIdx.x;
-  while (pos < capacity) {
-    Element lhs = cutlass::ReferenceFactory<Element>::get(ptr_A, pos);
-    Element rhs = cutlass::ReferenceFactory<Element>::get(ptr_B, pos);
-    if (!relatively_equal(lhs, rhs, epsilon, nonzero_floor)) {
-      *equal = 0;
-      return;
-    }
-    pos += gridDim.x * blockDim.x;
-  }
-}
-} // namespace kernel
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Performs an element-wise equality check (operator!=) between two device blocks.
-///
-/// \param ptr_A       first block of `capacity` elements
-/// \param ptr_B       second block of `capacity` elements
-/// \param capacity    number of elements to compare
-/// \param grid_size   grid size to launch; if this or block_size is zero, both are chosen with
-///                    the CUDA occupancy API and the block size is capped at 128
-/// \param block_size  block size to launch (see grid_size)
-/// \param stream      stream on which the comparison kernel is launched
-/// \return true when no mismatching pair was found
-/// \throws std::runtime_error if allocating, copying, querying occupancy, or synchronizing fails
-template <typename Element>
-bool BlockCompareEqual(
-  Element const *ptr_A,
-  Element const *ptr_B,
-  size_t capacity,
-  int grid_size = 0,
-  int block_size = 0,
-  cudaStream_t stream = nullptr) {
-  int equal_flag = 1;
-  int *device_equal_flag = nullptr;
-  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
-    throw std::runtime_error("Failed to allocate device flag.");
-  }
-  // From here on, every error path releases the device flag before throwing
-  if (cudaMemcpy(
-    device_equal_flag,
-    &equal_flag,
-    sizeof(int),
-    cudaMemcpyHostToDevice) != cudaSuccess) {
-    cudaFree(device_equal_flag);
-    throw std::runtime_error("Failed to copy equality flag to device.");
-  }
-  if (!grid_size || !block_size) {
-    // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
-    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
-      &grid_size,
-      &block_size,
-      reinterpret_cast<void const *>(kernel::BlockCompareEqual<Element>));
-    if (result != cudaSuccess) {
-      cudaFree(device_equal_flag);
-      throw std::runtime_error("Failed to query occupancy.");
-    }
-    // Limit block size. This has the effect of increasing the number of items processed by a
-    // single thread and reduces the impact of initialization overhead.
-    block_size = (block_size < 128 ? block_size : 128);
-  }
-  dim3 grid(grid_size, 1, 1);
-  dim3 block(block_size, 1, 1);
-  kernel::BlockCompareEqual<Element><<< grid, block, 0, stream >>>(device_equal_flag, ptr_A, ptr_B, capacity);
-  // Wait for the kernel so the flag read back below reflects its result
-  if (cudaStreamSynchronize(stream) != cudaSuccess) {
-    cudaFree(device_equal_flag);
-    throw std::runtime_error("Failed to synchronize stream.");
-  }
-  if (cudaMemcpy(
-    &equal_flag,
-    device_equal_flag,
-    sizeof(int),
-    cudaMemcpyDeviceToHost) != cudaSuccess) {
-    cudaFree(device_equal_flag);
-    throw std::runtime_error("Failed to copy equality flag from device.");
-  }
-  cudaFree(device_equal_flag);
-  return equal_flag;
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Performs an element-wise relative-equality check between two device blocks, using
-/// relatively_equal(a, b, epsilon, nonzero_floor) on each pair of elements.
-///
-/// \param ptr_A          first block of `capacity` elements
-/// \param ptr_B          second block of `capacity` elements
-/// \param capacity       number of elements to compare
-/// \param epsilon        tolerance forwarded to relatively_equal
-/// \param nonzero_floor  floor value forwarded to relatively_equal
-/// \param grid_size      grid size to launch; if this or block_size is zero, both are chosen
-///                       with the CUDA occupancy API and the block size is capped at 128
-/// \param block_size     block size to launch (see grid_size)
-/// \param stream         stream on which the comparison kernel is launched
-/// \return true when no mismatching pair was found
-/// \throws std::runtime_error if allocating, copying, querying occupancy, or synchronizing fails
-template <typename Element>
-bool BlockCompareRelativelyEqual(
-  Element const *ptr_A,
-  Element const *ptr_B,
-  size_t capacity,
-  Element epsilon,
-  Element nonzero_floor,
-  int grid_size = 0,
-  int block_size = 0,
-  cudaStream_t stream = nullptr) {
-  int equal_flag = 1;
-  int *device_equal_flag = nullptr;
-  if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) {
-    throw std::runtime_error("Failed to allocate device flag.");
-  }
-  // From here on, every error path releases the device flag before throwing
-  if (cudaMemcpy(
-    device_equal_flag,
-    &equal_flag,
-    sizeof(int),
-    cudaMemcpyHostToDevice) != cudaSuccess) {
-    cudaFree(device_equal_flag);
-    throw std::runtime_error("Failed to copy equality flag to device.");
-  }
-  if (!grid_size || !block_size) {
-    // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
-    cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
-      &grid_size,
-      &block_size,
-      reinterpret_cast<void const *>(kernel::BlockCompareRelativelyEqual<Element>));
-    if (result != cudaSuccess) {
-      cudaFree(device_equal_flag);
-      throw std::runtime_error("Failed to query occupancy.");
-    }
-    // Limit block size. This has the effect of increasing the number of items processed by a
-    // single thread and reduces the impact of initialization overhead.
-    block_size = (block_size < 128 ? block_size : 128);
-  }
-  dim3 grid(grid_size, 1, 1);
-  dim3 block(block_size, 1, 1);
-  kernel::BlockCompareRelativelyEqual<Element><<< grid, block, 0, stream >>>(
-    device_equal_flag,
-    ptr_A,
-    ptr_B,
-    capacity,
-    epsilon,
-    nonzero_floor
-  );
-  // Wait for the kernel so the flag read back below reflects its result
-  if (cudaStreamSynchronize(stream) != cudaSuccess) {
-    cudaFree(device_equal_flag);
-    throw std::runtime_error("Failed to synchronize stream.");
-  }
-  if (cudaMemcpy(
-    &equal_flag,
-    device_equal_flag,
-    sizeof(int),
-    cudaMemcpyDeviceToHost) != cudaSuccess) {
-    cudaFree(device_equal_flag);
-    throw std::runtime_error("Failed to copy equality flag from device.");
-  }
-  cudaFree(device_equal_flag);
-  return equal_flag;
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-} // device
-} // reference
-} // cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_fill.h DELETED Viewed

@@ -1,2075 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/* \file
-  \brief Defines device-side elementwise operations on TensorView. Note, the operations defined
-    in this header are not specialized for any particular data layout and are therefore not
-    intended to offer the best possible performance. Rather, they are intended to be generic
-    reference implementations to support the CUTLASS unit tests.
-*/
-#pragma once
-#if !defined(__CUDACC_RTC__)
-// Standard Library includes
-#include <utility>
-#include <cstdlib>
-#include <cmath>
-#include <type_traits>
-#include <cstdint>
-#endif
-// CUDA includes
-#include <curand_kernel.h>
-// Cutlass includes
-#include "cutlass/cutlass.h"
-#include "cutlass/array.h"
-#include "cutlass/complex.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/blas3.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/layout/vector.h"
-#include "cutlass/util/reference/device/tensor_foreach.h"
-#include "cutlass/util/distribution.h"
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace reference {
-namespace device {
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Returns the next curand_normal() draw from `state`, converted to FloatType.
-template <typename FloatType>
-CUTLASS_DEVICE
-FloatType random_normal_float(curandState_t *state) {
-  FloatType const sample = curand_normal(state);
-  return sample;
-}
-/// Specialization for double: returns the next curand_normal_double() draw from `state`.
-template <>
-CUTLASS_DEVICE
-double random_normal_float<double>(curandState_t *state) {
-  double const sample = curand_normal_double(state);
-  return sample;
-}
-/// Returns the next curand_uniform() draw from `state`, converted to FloatType.
-template <typename FloatType>
-CUTLASS_DEVICE
-FloatType random_uniform_float(curandState_t *state) {
-  FloatType const sample = curand_uniform(state);
-  return sample;
-}
-/// Specialization for double: returns the next curand_uniform_double() draw from `state`.
-template <>
-CUTLASS_DEVICE
-double random_uniform_float<double>(curandState_t *state) {
-  double const sample = curand_uniform_double(state);
-  return sample;
-}
-/// Device functor that draws Gaussian-distributed values of type Element from a per-thread
-/// cuRAND state.
-template <typename Element>
-struct RandomGaussianFunc {
-  /// Type used for intermediate arithmetic: double when Element is wider than 4 bytes, else float
-  using FloatType = typename std::conditional<(sizeof(Element) > 4), double, float>::type;
-  /// Integer counterpart of FloatType (int64_t for wide elements, otherwise int)
-  using IntType = typename std::conditional<(sizeof(Element) > 4), int64_t, int>::type;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    uint64_t seed;              ///< Seed handed to curand_init()
-    FloatType mean;             ///< Mean added to each scaled draw
-    FloatType stddev;           ///< Factor applied to each standard-normal draw
-    int int_scale;              ///< If non-negative, values are rounded to multiples of 2^-int_scale
-    FloatType float_scale_up;   ///< 2^int_scale (1 when int_scale is negative)
-    FloatType float_scale_down; ///< 2^-int_scale (1 when int_scale is negative)
-    int exclude_zero;           ///< If non-negative, excludes zeros
-    //
-    // Methods
-    //
-    /// Construction of Gaussian RNG parameters.
-    Params(
-      uint64_t seed_ = 0,
-      Element mean_ = 0,
-      Element stddev_ = 1,
-      int int_scale_ = -1,
-      int exclude_zero_ = -1
-    ):
-      seed(seed_),
-      mean(static_cast<FloatType>(mean_)),
-      stddev(static_cast<FloatType>(stddev_)),
-      int_scale(int_scale_),
-      exclude_zero(exclude_zero_) {
-      // The scale factors are only read when int_scale >= 0. Computing them with a shift would
-      // shift by a negative count for the default int_scale of -1, which is undefined behavior,
-      // so they are derived with ldexp and fall back to 1 when scaling is disabled.
-      if (int_scale >= 0) {
-        float_scale_up = std::ldexp(FloatType(1), int_scale);    // scale up to clamp low order bits
-        float_scale_down = std::ldexp(FloatType(1), -int_scale);
-      }
-      else {
-        float_scale_up = FloatType(1);
-        float_scale_down = FloatType(1);
-      }
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  /// RNG state object
-  curandState_t rng_state;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG; the global thread index selects the cuRAND subsequence
-  CUTLASS_DEVICE
-  RandomGaussianFunc(Params const &params): params(params) {
-    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
-    curand_init(params.seed, gtid, 0, &rng_state);
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  Element operator()() {
-    FloatType rnd = random_normal_float<FloatType>(&rng_state);
-    rnd = params.mean + params.stddev * rnd;
-    Element result;
-    if (params.int_scale >= 0) {
-      // Round in the scaled-up domain, then scale back down when converting to Element
-      rnd = FloatType(std::llround(rnd * params.float_scale_up));
-      result = Element(rnd * params.float_scale_down);
-    }
-    else {
-      result = Element(rnd);
-    }
-    if (params.exclude_zero >=0 && result == Element(0.0)) {
-      // Push the value one unit away from zero; rnd is converted as-is (no rescaling here)
-      if (rnd > FloatType(0)) {
-        rnd += FloatType(1);
-      } else {
-        rnd -= FloatType(1);
-      }
-      result = Element(rnd);
-    }
-    return result;
-  }
-};
-/// Specialization for complex elements: real and imaginary parts are drawn independently from
-/// the same Gaussian parameters.
-template <typename Real>
-struct RandomGaussianFunc<complex<Real>> {
-  using Element = complex<Real>;
-  /// Type used for intermediate arithmetic: double when Real is wider than 4 bytes, else float
-  using FloatType = typename std::conditional<(sizeof(Real) > 4), double, float>::type;
-  /// Integer counterpart of FloatType (int64_t for wide reals, otherwise int)
-  using IntType = typename std::conditional<(sizeof(Real) > 4), int64_t, int>::type;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    uint64_t seed;              ///< Seed handed to curand_init()
-    FloatType mean;             ///< Mean added to each scaled draw
-    FloatType stddev;           ///< Factor applied to each standard-normal draw
-    int int_scale;              ///< If non-negative, values are rounded to multiples of 2^-int_scale
-    FloatType float_scale_up;   ///< 2^int_scale (1 when int_scale is negative)
-    FloatType float_scale_down; ///< 2^-int_scale (1 when int_scale is negative)
-    int exclude_zero;           ///< If non-negative, excludes zeros
-    //
-    // Methods
-    //
-    /// Construction of Gaussian RNG parameters.
-    Params(
-      uint64_t seed_ = 0,
-      Real mean_ = 0,
-      Real stddev_ = 1,
-      int int_scale_ = -1,
-      int exclude_zero_ = -1
-    ):
-      seed(seed_),
-      mean(static_cast<FloatType>(mean_)),
-      stddev(static_cast<FloatType>(stddev_)),
-      int_scale(int_scale_),
-      exclude_zero(exclude_zero_) {
-      // The scale factors are only read when int_scale >= 0. Computing them with a shift would
-      // shift by a negative count for the default int_scale of -1, which is undefined behavior,
-      // so they are derived with ldexp and fall back to 1 when scaling is disabled.
-      if (int_scale >= 0) {
-        float_scale_up = std::ldexp(FloatType(1), int_scale);
-        float_scale_down = std::ldexp(FloatType(1), -int_scale);
-      }
-      else {
-        float_scale_up = FloatType(1);
-        float_scale_down = FloatType(1);
-      }
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  /// RNG state object
-  curandState_t rng_state;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG; the global thread index selects the cuRAND subsequence
-  CUTLASS_DEVICE
-  RandomGaussianFunc(Params const &params): params(params) {
-    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
-    curand_init(params.seed, gtid, 0, &rng_state);
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  Element operator()() {
-    FloatType rnd_r = random_normal_float<FloatType>(&rng_state);
-    FloatType rnd_i = random_normal_float<FloatType>(&rng_state);
-    rnd_r = params.mean + params.stddev * rnd_r;
-    rnd_i = params.mean + params.stddev * rnd_i;
-    Element result;
-    if (params.int_scale >= 0) {
-      // Round in the scaled-up domain, then scale back down when converting to Real
-      rnd_r = FloatType(std::llround(rnd_r * params.float_scale_up));
-      rnd_i = FloatType(std::llround(rnd_i * params.float_scale_up));
-      result = {
-        Real(rnd_r * params.float_scale_down),
-        Real(rnd_i * params.float_scale_down)
-      };
-    }
-    else {
-      result = Element(Real(rnd_r), Real(rnd_i));
-    }
-    // Only an exactly-zero complex value (both parts zero) is treated as a zero to exclude;
-    // it is fixed up by moving the real part one unit away from zero.
-    if (params.exclude_zero >= 0 &&
-        result.real() == Real(0.0) &&
-        result.imag() == Real(0.0)) {
-      if (rnd_r > FloatType(0)) {
-        rnd_r += FloatType(1);
-      } else {
-        rnd_r -= FloatType(1);
-      }
-      result = Element(Real(rnd_r), Real(rnd_i));
-    }
-    return result;
-  }
-};
-/// Per-coordinate functor that stores a freshly drawn Gaussian value into a tensor view
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorFillRandomGaussianFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  using T = typename TensorView::Element;
-  /// Coordinate in tensor's index space
-  using TensorCoord = typename TensorView::TensorCoord;
-  /// Generator used to produce each element
-  using RandomFunc = RandomGaussianFunc<Element>;
-  /// Parameters structure
-  struct Params {
-    /// Destination tensor
-    TensorView view;
-    /// Parameters forwarded to the random generator
-    typename RandomFunc::Params random;
-    /// Bundles a destination view with generator parameters.
-    Params(
-      TensorView view_ = TensorView(),
-      typename RandomFunc::Params random_ = typename RandomFunc::Params()
-    ):
-      view(view_),
-      random(random_) {
-    }
-  };
-  /// Parameters object
-  Params params;
-  /// Generator instance owned by this functor
-  RandomFunc random;
-  /// Device-side construction; also initializes the generator from params.random
-  CUTLASS_DEVICE
-  TensorFillRandomGaussianFunc(Params const &params):
-    params(params),
-    random(params.random) {
-  }
-  /// Draws one value (advancing the RNG state) and writes it at `coord`
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    Element const drawn = random();
-    params.view.at(coord) = drawn;
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a tensor with Gaussian-distributed random values by applying
-/// detail::TensorFillRandomGaussianFunc over the view's extent through TensorForEach.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFillRandomGaussian(
-  TensorView<Element, Layout> view,       ///< destination tensor
-  uint64_t seed,                          ///< seed for RNG
-  typename RealType<Element>::Type mean = Element(0),   ///< Gaussian distribution's mean
-  typename RealType<Element>::Type stddev = Element(1), ///< Gaussian distribution's standard deviation
-  int bits = -1,                          ///< If non-negative, specifies number of fractional bits that
-                                          ///  are not truncated to zero. Permits reducing precision of
-                                          ///  data.
-  int exclude_zero = -1,                  ///< If non-negative, excludes zeros from tensor init
-  cudaStream_t stream = nullptr) {
-  using Func = detail::TensorFillRandomGaussianFunc<Element, Layout>;
-  using FuncParams = typename Func::Params;
-  using RngParams = typename detail::RandomGaussianFunc<Element>::Params;
-  // Generator settings first, then pair them with the destination view
-  RngParams rng_params(seed, mean, stddev, bits, exclude_zero);
-  FuncParams func_params(view, rng_params);
-  // Zero grid/block sizes are passed through unchanged to TensorForEach
-  TensorForEach<Func, Layout::kRank, FuncParams>(
-    view.extent(),
-    func_params,
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a linear block of `capacity` elements starting at `ptr` with Gaussian-distributed
-/// random values, using detail::RandomGaussianFunc through BlockForEach.
-template <typename Element>               ///< Element type
-void BlockFillRandomGaussian(
-  Element *ptr,
-  size_t capacity,
-  uint64_t seed,                              ///< seed for RNG
-  typename RealType<Element>::Type mean,      ///< Gaussian distribution's mean
-  typename RealType<Element>::Type stddev,    ///< Gaussian distribution's standard deviation
-  int bits = -1,                              ///< If non-negative, specifies number of fractional bits that
-                                              ///  are not truncated to zero. Permits reducing precision of
-                                              ///  data.
-  cudaStream_t stream = nullptr) {
-  using Generator = detail::RandomGaussianFunc<Element>;
-  using GeneratorParams = typename Generator::Params;
-  // exclude_zero is left at the Params default here
-  GeneratorParams rng_params(seed, mean, stddev, bits);
-  BlockForEach<Element, Generator>(
-    ptr,
-    capacity,
-    rng_params,
-    /*grid_size*/0, /*block_size*/0,
-    stream);
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Device functor that draws uniformly distributed values of type Element from a per-thread
-/// cuRAND state, optionally injecting NaNs and excluding zeros.
-template <typename Element>                ///< Element type
-struct RandomUniformFunc {
-  /// Type used for intermediate arithmetic: double when Element is wider than 4 bytes, else float
-  using FloatType = typename std::conditional<
-    (sizeof(Element) > 4),
-    double,
-    float>::type;
-  /// Integer counterpart of FloatType (int64_t for wide elements, otherwise int)
-  using IntType = typename std::conditional<
-    (sizeof(Element) > 4),
-    int64_t,
-    int>::type;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    uint64_t seed;              ///< Seed handed to curand_init()
-    FloatType range;            ///< Width of the sampled interval (max - min, before zero exclusion)
-    FloatType max;              ///< Upper end of the sampled interval
-    int int_scale;              ///< If non-negative, values are rounded to multiples of 2^-int_scale
-    double pnan;                ///< Threshold compared against a uniform draw to emit NaN
-    FloatType float_scale_up;   ///< 2^int_scale (1 when int_scale is negative)
-    FloatType float_scale_down; ///< 2^-int_scale (1 when int_scale is negative)
-    int exclude_zero;           ///< If non-negative, excludes zeros
-    /// Default ctor (leaves all members uninitialized)
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Construction of uniform RNG parameters.
-    Params(
-      uint64_t seed_ = 0,
-      Element max_ = 1,
-      Element min = 0,
-      int int_scale_ = -1,
-      double pnan_ = 0,
-      int exclude_zero_ = -1
-    ):
-      seed(seed_),
-      range(static_cast<FloatType>(max_) - static_cast<FloatType>(min)),
-      max(static_cast<FloatType>(max_)),
-      int_scale(int_scale_),
-      pnan(pnan_),
-      exclude_zero(exclude_zero_) {
-      // The scale factors are only read when int_scale >= 0. Computing them with a shift would
-      // shift by a negative count for the default int_scale of -1, which is undefined behavior,
-      // so they are derived with ldexp and fall back to 1 when scaling is disabled.
-      if (int_scale >= 0) {
-        float_scale_up = std::ldexp(FloatType(1), int_scale);    // scale up to clamp low order bits
-        float_scale_down = std::ldexp(FloatType(1), -int_scale);
-      }
-      else {
-        float_scale_up = FloatType(1);
-        float_scale_down = FloatType(1);
-      }
-      // Handle cases where min = 0 or max = 0 for excluding zeros
-      if (exclude_zero >= 0) {
-        range = (min == Element(0)) ? range - FloatType(1): range;
-        max = (max_ == Element(0)) ? max - FloatType(1): max;
-      }
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  /// RNG state object
-  curandState_t rng_state;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG; the global thread index selects the cuRAND subsequence
-  CUTLASS_DEVICE
-  RandomUniformFunc(Params const &params): params(params) {
-    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
-    curand_init(params.seed, gtid, 0, &rng_state);
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  Element operator()() {
-    // Draw a random float (cuRAND documents the interval as (0.0, 1.0]) to determine if the
-    // element should be NaN. The draw is skipped entirely when pnan is not positive.
-    if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
-      if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
-        return Element(NAN);
-      }
-    }
-    FloatType rnd = random_uniform_float<FloatType>(&rng_state);
-    // Map the unit draw onto the interval ending at params.max
-    rnd = params.max - params.range * rnd;
-    // Random values are cast to integer after scaling by a power of two to facilitate error
-    // testing
-    Element result;
-    if (params.int_scale >= 0) {
-      rnd = FloatType(std::llround(rnd * params.float_scale_up));
-      result = Element(rnd * params.float_scale_down);
-    }
-    else {
-      result = Element(rnd);
-    }
-    if (params.exclude_zero >=0 && result == Element(0.0)) {
-      // Move one unit away from zero, clamped to the sampled interval
-      if (rnd > FloatType(0)) {
-        rnd = std::min(params.max, rnd + FloatType(1));
-      } else {
-        rnd = std::max((params.max - params.range), rnd - FloatType(1));
-      }
-      result = Element(rnd);
-    }
-    return result;
-  }
-};
-/// Computes a random uniform distribution for complex elements: real and imaginary parts are
-/// drawn independently from the same interval.
-template <typename Real>
-struct RandomUniformFunc<complex<Real>> {
-  using Element = complex<Real>;
-  /// Type used for intermediate arithmetic: double when Real is wider than 4 bytes, else float
-  using FloatType = typename std::conditional<
-    (sizeof(Real) > 4),
-    double,
-    float>::type;
-  /// Integer counterpart of FloatType (int64_t for wide reals, otherwise int)
-  using IntType = typename std::conditional<
-    (sizeof(Real) > 4),
-    int64_t,
-    int>::type;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    uint64_t seed;              ///< Seed handed to curand_init()
-    FloatType range;            ///< Width of the sampled interval (max - min, before zero exclusion)
-    FloatType min;              ///< Lower end of the sampled interval
-    int int_scale;              ///< If non-negative, values are rounded to multiples of 2^-int_scale
-    double pnan;                ///< Threshold compared against a uniform draw to emit NaN
-    FloatType float_scale_up;   ///< 2^int_scale (1 when int_scale is negative)
-    FloatType float_scale_down; ///< 2^-int_scale (1 when int_scale is negative)
-    int exclude_zero;           ///< If non-negative, excludes zeros
-    /// Default ctor (leaves all members uninitialized)
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Construction of uniform RNG parameters.
-    Params(
-      uint64_t seed_ = 0,
-      FloatType max = 1,
-      FloatType min_ = 0,
-      int int_scale_ = -1,
-      double pnan_ = 0,
-      int exclude_zero_ = -1
-    ):
-      seed(seed_),
-      range(static_cast<FloatType>(max - min_)),
-      min(static_cast<FloatType>(min_)),
-      int_scale(int_scale_),
-      pnan(pnan_),
-      exclude_zero(exclude_zero_) {
-      // The scale factors are only read when int_scale >= 0. Computing them with a shift would
-      // shift by a negative count for the default int_scale of -1, which is undefined behavior,
-      // so they are derived with ldexp and fall back to 1 when scaling is disabled.
-      if (int_scale >= 0) {
-        float_scale_up = std::ldexp(FloatType(1), int_scale);
-        float_scale_down = std::ldexp(FloatType(1), -int_scale);
-      }
-      else {
-        float_scale_up = FloatType(1);
-        float_scale_down = FloatType(1);
-      }
-      // Handle cases where min = 0 or max = 0 for excluding zeros
-      if (exclude_zero >= 0) {
-        min = (min == FloatType(0)) ? min + FloatType(1): min;
-        range = (max == FloatType(0)) ? range - FloatType(1): range;
-      }
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  /// RNG state object
-  curandState_t rng_state;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG; the global thread index selects the cuRAND subsequence
-  CUTLASS_DEVICE
-  RandomUniformFunc(Params const &params): params(params) {
-    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
-    curand_init(params.seed, gtid, 0, &rng_state);
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  Element operator()() {
-    // Draw a random float (cuRAND documents the interval as (0.0, 1.0]) to determine if the
-    // element should be NaN. The capability check is made on Real, the type the NaN is actually
-    // built from; std::numeric_limits reports has_quiet_NaN == false for any type it is not
-    // specialized for, which would silently disable pnan if the complex wrapper were queried.
-    if constexpr (std::numeric_limits<Real>::has_quiet_NaN) {
-      if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
-        return Element(Real(NAN), Real(NAN));
-      }
-    }
-    FloatType rnd_r = random_uniform_float<FloatType>(&rng_state);
-    FloatType rnd_i = random_uniform_float<FloatType>(&rng_state);
-    // Map the unit draws onto the interval starting at params.min
-    rnd_r = params.min + params.range * rnd_r;
-    rnd_i = params.min + params.range * rnd_i;
-    // Random values are cast to integer after scaling by a power of two to facilitate error
-    // testing
-    Element result;
-    if (params.int_scale >= 0) {
-      rnd_r = FloatType(std::llround(rnd_r * params.float_scale_up));
-      rnd_i = FloatType(std::llround(rnd_i * params.float_scale_up));
-      result = {
-        Real(rnd_r * params.float_scale_down),
-        Real(rnd_i * params.float_scale_down)
-      };
-    }
-    else {
-      result = Element(Real(rnd_r), Real(rnd_i));
-    }
-    // Only an exactly-zero complex value (both parts zero) is treated as a zero to exclude;
-    // it is fixed up by moving the real part one unit away from zero, clamped to the interval.
-    if (params.exclude_zero >= 0 &&
-        result.real() == Real(0.0) &&
-        result.imag() == Real(0.0)) {
-      if (rnd_r > FloatType(0)) {
-        rnd_r = std::min(params.min + params.range, rnd_r + FloatType(1));
-      } else {
-        rnd_r = std::max((params.min), rnd_r - FloatType(1));
-      }
-      result = Element(Real(rnd_r), Real(rnd_i));
-    }
-    return result;
-  }
-};
-/// Per-coordinate functor that stores a freshly drawn uniform random value into a tensor view
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorFillRandomUniformFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  typedef typename TensorView::Element T;
-  /// Coordinate in tensor's index space
-  typedef typename TensorView::TensorCoord TensorCoord;
-  /// Generator used to produce each element
-  using RandomFunc = RandomUniformFunc<Element>;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    TensorView view;                     ///< Destination tensor
-    typename RandomFunc::Params random;  ///< Parameters forwarded to the random generator
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Bundles a destination view with uniform RNG parameters.
-    /// The default argument needs `typename`: RandomFunc::Params is a dependent type, and
-    /// without the keyword the expression is parsed as a call to a non-type member.
-    Params(
-      TensorView view_ = TensorView(),
-      typename RandomFunc::Params random_ = typename RandomFunc::Params()
-    ):
-      view(view_), random(random_) {
-    }
-  };
-  //
-  // Data members
-  //
-  Params params;
-  RandomFunc random;
-  //
-  // Methods
-  //
-  /// Device-side construction; also initializes the generator from params.random
-  CUTLASS_DEVICE
-  TensorFillRandomUniformFunc(Params const &params): params(params), random(params.random) {
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    params.view.at(coord) = random();
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a tensor with uniformly distributed random values by applying
-/// detail::TensorFillRandomUniformFunc over the view's extent through TensorForEach.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFillRandomUniform(
-  TensorView<Element, Layout> view,       ///< destination tensor
-  uint64_t seed,                          ///< seed for RNG
-  typename RealType<Element>::Type max = Element(1), ///< upper bound of distribution
-  typename RealType<Element>::Type min = Element(0), ///< lower bound for distribution
-  int bits = -1,                          ///< If non-negative, specifies number of fractional bits that
-                                          ///  are not truncated to zero. Permits reducing precision of
-                                          ///  data.
-  double pnan = 0,                        ///< Percentage of NaN elements.
-  int exclude_zero = -1,               ///< If non-negative, excludes zeros from tensor init
-  cudaStream_t stream = nullptr) {
-  using Func = detail::TensorFillRandomUniformFunc<Element, Layout>;
-  using FuncParams = typename Func::Params;
-  using RngParams = typename detail::RandomUniformFunc<Element>::Params;
-  // Generator settings first, then pair them with the destination view
-  RngParams rng_params(seed, max, min, bits, pnan, exclude_zero);
-  FuncParams func_params(view, rng_params);
-  // Zero grid/block sizes are passed through unchanged to TensorForEach
-  TensorForEach<Func, Layout::kRank, FuncParams>(
-    view.extent(),
-    func_params,
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a linear block of `capacity` elements starting at `ptr` with uniformly distributed
-/// random values, using detail::RandomUniformFunc through BlockForEach.
-template <typename Element>
-void BlockFillRandomUniform(
-  Element *ptr,
-  size_t capacity,
-  uint64_t seed,                          ///< seed for RNG
-  typename RealType<Element>::Type max,   ///< upper bound of distribution
-  typename RealType<Element>::Type min,   ///< lower bound for distribution
-  int bits = -1,                          ///< If non-negative, specifies number of fractional bits that
-                                          ///  are not truncated to zero. Permits reducing precision of
-                                          ///  data.
-  double pnan = 0,                        ///< Percentage of NaN elements.
-  cudaStream_t stream = nullptr) {
-  using Generator = detail::RandomUniformFunc<Element>;
-  using GeneratorParams = typename Generator::Params;
-  // exclude_zero is left at the Params default here
-  GeneratorParams rng_params(seed, max, min, bits, pnan);
-  BlockForEach<Element, Generator>(
-    ptr,
-    capacity,
-    rng_params,
-    /*grid_size*/0, /*block_size*/0,
-    stream);
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Computes random sparse metadata: each 4-bit group of the returned Element is picked at
-/// random from a fixed table of metadata patterns.
-template <typename Element>               ///< Element type
-struct RandomSparseMetaFunc {
-  using FloatType = float;
-  using IntType = int32_t;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    uint64_t seed;        ///< Seed handed to curand_init()
-    FloatType range;      ///< Number of entries in the selected pattern table (6 or 2)
-    int MetaSizeInBits;   ///< Metadata size selector; only 2 and 4 are accepted
-    /// Default ctor (leaves all members uninitialized)
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Construction of sparse-meta RNG parameters.
-    /// Throws std::invalid_argument when MetaSizeInBits_ is neither 2 nor 4.
-    Params(
-      uint64_t seed_ = 0,
-      int MetaSizeInBits_ = 2
-    ):
-      seed(seed_),
-      MetaSizeInBits(MetaSizeInBits_) {
-      if (MetaSizeInBits_ == 2) {
-        range = 6;
-      }
-      else if (MetaSizeInBits_ == 4) {
-        range = 2;
-      }
-      else {
-        throw std::invalid_argument("Invalid MetaSizeInBits");
-      }
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  /// RNG state object
-  curandState_t rng_state;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG; the global thread index selects the cuRAND subsequence
-  CUTLASS_DEVICE
-  RandomSparseMetaFunc(Params const &params): params(params) {
-    uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x;
-    curand_init(params.seed, gtid, 0, &rng_state);
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  Element operator()() {
-    Element FourToTwoMeta[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe};
-    Element TwoToOneMeta[2] = {0x4, 0xe};
-    Element *MetaArray =
-        (params.MetaSizeInBits == 2) ? FourToTwoMeta : TwoToOneMeta;
-    // Entries available in the selected table
-    int const entry_count = static_cast<int>(params.range);
-    Element result = 0x0;
-    // One table entry per 4-bit group of Element
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < cutlass::sizeof_bits<Element>::value / 4; ++i) {
-      FloatType rnd = random_uniform_float<FloatType>(&rng_state);
-      rnd = params.range * rnd;
-      // cuRAND documents curand_uniform() as including 1.0, so rnd can equal range; clamp the
-      // index so that case does not read one element past the end of the table.
-      int index = static_cast<int>(rnd);
-      if (index >= entry_count) {
-        index = entry_count - 1;
-      }
-      Element meta = MetaArray[index];
-      result = (Element)(result | ((Element)(meta << (i * 4))));
-    }
-    return result;
-  }
-};
-/// Per-coordinate functor that stores freshly drawn random sparse metadata into a tensor view
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorFillRandomSparseMetaFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  typedef typename TensorView::Element T;
-  /// Coordinate in tensor's index space
-  typedef typename TensorView::TensorCoord TensorCoord;
-  /// Generator used to produce each element
-  using RandomFunc = RandomSparseMetaFunc<Element>;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    TensorView view;                     ///< Destination tensor
-    typename RandomFunc::Params random;  ///< Parameters forwarded to the random generator
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Bundles a destination view with sparse-meta RNG parameters.
-    /// The default argument needs `typename`: RandomFunc::Params is a dependent type, and
-    /// without the keyword the expression is parsed as a call to a non-type member.
-    Params(
-      TensorView view_ = TensorView(),
-      typename RandomFunc::Params random_ = typename RandomFunc::Params()
-    ):
-      view(view_), random(random_) {
-    }
-  };
-  //
-  // Data members
-  //
-  Params params;
-  RandomFunc random;
-  //
-  // Methods
-  //
-  /// Device-side construction; also initializes the generator from params.random
-  CUTLASS_DEVICE
-  TensorFillRandomSparseMetaFunc(Params const &params): params(params), random(params.random) {
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    params.view.at(coord) = random();
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a tensor with random sparse metadata by applying
-/// detail::TensorFillRandomSparseMetaFunc over the view's extent through TensorForEach.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFillRandomSparseMeta(
-  TensorView<Element, Layout> view,       ///< destination tensor
-  uint64_t seed,                          ///< seed for RNG
-  int MetaSizeInBits = 2,                 ///< meta data size
-  cudaStream_t stream = nullptr) {
-  using RandomFunc = detail::RandomSparseMetaFunc<Element>;
-  // The functor must be the sparse-meta one: its Params holds a RandomSparseMetaFunc::Params,
-  // which is the type of `random` built below. The uniform functor's Params expects
-  // RandomUniformFunc::Params and does not match.
-  using Func = detail::TensorFillRandomSparseMetaFunc<Element, Layout>;
-  using Params = typename Func::Params;
-  typename RandomFunc::Params random(seed, MetaSizeInBits);
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params(view, random),
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a linear block of `capacity` elements starting at `ptr` with random sparse metadata,
-/// using detail::RandomSparseMetaFunc through BlockForEach.
-template <typename Element>
-void BlockFillRandomSparseMeta(
-  Element *ptr,
-  size_t capacity,
-  uint64_t seed,                          ///< seed for RNG
-  int MetaSizeInBits = 2,                 ///< meta data size
-  cudaStream_t stream = nullptr) {
-  using Generator = detail::RandomSparseMetaFunc<Element>;
-  using GeneratorParams = typename Generator::Params;
-  GeneratorParams rng_params(seed, MetaSizeInBits);
-  BlockForEach<Element, Generator>(
-    ptr,
-    capacity,
-    rng_params,
-    /*grid_size*/0, /*block_size*/0,
-    stream);
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Functor writing one value to elements whose coordinates are all equal (the diagonal) and
-/// another value to every other element.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorFillDiagonalFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  using T = typename TensorView::Element;
-  /// Coordinate in tensor's index space
-  using TensorCoord = typename TensorView::TensorCoord;
-  /// Parameters structure
-  struct Params {
-    /// Destination tensor
-    TensorView view;
-    /// Value stored on the diagonal
-    Element diag;
-    /// Value stored everywhere else
-    Element other;
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    /// Bundles the destination view with the diagonal and off-diagonal values.
-    Params(
-      TensorView view_ = TensorView(),
-      Element diag_ = Element(1),
-      Element other_ = Element(0)
-    ):
-      view(view_),
-      diag(diag_),
-      other(other_) {
-    }
-  };
-  /// Parameters object
-  Params params;
-  /// Device-side construction from a parameters object
-  CUTLASS_DEVICE
-  TensorFillDiagonalFunc(Params const &params): params(params) {
-  }
-  /// Updates the tensor element at `coord`
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    // On the diagonal iff every adjacent pair of coordinate components matches
-    bool on_diagonal = true;
-    CUTLASS_PRAGMA_UNROLL
-    for (int dim = 1; dim < Layout::kRank; ++dim) {
-      on_diagonal = on_diagonal && (coord[dim] == coord[dim - 1]);
-    }
-    if (on_diagonal) {
-      params.view.at(coord) = params.diag;
-    }
-    else {
-      params.view.at(coord) = params.other;
-    }
-  }
-};
-/// Functor that overwrites the elements of a tensor selected by a fill mode with one value;
-/// elements outside the selected region are left untouched.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorFillPartialFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  using T = typename TensorView::Element;
-  /// Coordinate in tensor's index space
-  using TensorCoord = typename TensorView::TensorCoord;
-  /// Parameters structure
-  struct Params {
-    /// Destination tensor
-    TensorView view;
-    /// Value stored into covered elements
-    Element element;
-    /// Selects which region of the tensor is covered
-    FillMode fill_mode;
-    /// Default ctor: covers nothing (FillMode::kNone)
-    CUTLASS_HOST_DEVICE
-    Params(): fill_mode(FillMode::kNone) { }
-    /// Bundles the destination view with the fill value and the fill mode.
-    Params(
-      TensorView view_,
-      Element element_,
-      FillMode fill_mode_
-    ):
-      view(view_),
-      element(element_),
-      fill_mode(fill_mode_) {
-    }
-  };
-  /// Parameters object
-  Params params;
-  /// Device-side construction from a parameters object
-  CUTLASS_DEVICE
-  TensorFillPartialFunc(Params const &params): params(params) {
-  }
-  /// Overwrites the element if it is within the covered region.
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    // Summarize how adjacent coordinate components compare across all ranks
-    bool has_ascent = false;    // some coord[i - 1] < coord[i]
-    bool has_descent = false;   // some coord[i - 1] > coord[i]
-    CUTLASS_PRAGMA_UNROLL
-    for (int dim = 1; dim < Layout::kRank; ++dim) {
-      has_ascent = has_ascent || (coord[dim - 1] < coord[dim]);
-      has_descent = has_descent || (coord[dim - 1] > coord[dim]);
-    }
-    bool covered = false;
-    switch (params.fill_mode) {
-    case FillMode::kFull:
-      covered = true;
-      break;
-    case FillMode::kLower:
-      // Covered unless a component is smaller than the one after it
-      covered = !has_ascent;
-      break;
-    case FillMode::kUpper:
-      // Covered unless a component is larger than the one after it
-      covered = !has_descent;
-      break;
-    case FillMode::kDiagonal:
-      // Covered only when all components are equal
-      covered = !has_ascent && !has_descent;
-      break;
-    case FillMode::kNone: // fall-through
-    default:
-      covered = false;
-      break;
-    }
-    if (!covered) {
-      return;
-    }
-    params.view.at(coord) = params.element;
-  }
-};
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorClearPartialFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  typedef typename TensorView::Element T;
-  /// Coordinate in tensor's index space
-  typedef typename TensorView::TensorCoord TensorCoord;
-  ///
-  static_assert((Layout::kRank == 2), "TensorClearPartial is only supported for matrices");
-  /// Parameters structure
-  struct Params {
-    TensorView view{};
-    Element element{};
-    FillMode fill_mode{FillMode::kNone};
-    int alignment{0};
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  //
-  // Methods
-  //
-  CUTLASS_DEVICE
-  TensorClearPartialFunc(Params const &params): params(params) {
-  }
-  /// Overwrites the element if it is within the covered region.
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    bool predicate = true;
-    switch (params.fill_mode) {
-    case FillMode::kLower:
-      if ((coord[0] >= coord[1]) ||
-          ((coord[1] - coord[0]) >= params.alignment))  {
-          predicate = false;
-        break;
-      }
-      break;
-    case FillMode::kUpper:
-      if ((coord[0] <= coord[1]) ||
-          ((coord[0] - coord[1]) >= params.alignment))  {
-          predicate = false;
-        break;
-      }
-      break;
-    case FillMode::kNone: // fall-through
-    default:
-      predicate = false;
-      break;
-    }
-    if (predicate) {
-      params.view.at(coord) = params.element;
-    }
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a tensor everywhere with a unique value for its diagonal.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFillDiagonal(
-  TensorView<Element, Layout> view,       ///< destination tensor
-  Element diag = Element(1),              ///< value to write in the diagonal
-  Element other = Element(0),             ///< value to write off the diagonal
-  cudaStream_t stream = nullptr) {
-  typedef detail::TensorFillDiagonalFunc<Element, Layout> Func;
-  typedef typename Func::Params Params;
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params(view, diag, other),
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-/// Fills a tensor partially depending on fill mode. Elements not covered by the fillmode are
-/// not written.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFillPartial(
-  TensorView<Element, Layout> view,       ///< destination tensor
-  Element element,
-  FillMode fill_mode,
-  cudaStream_t stream = nullptr) {
-  typedef detail::TensorFillPartialFunc<Element, Layout> Func;
-  typedef typename Func::Params Params;
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params(view, element, fill_mode),
-    stream
-  );
-}
-/// Clears a tensor partially depending on fill mode and alignment. Elements on the wrong-side
-/// of fillmode (upto the alignment) are overwritten with the user supplied element (typically zeros)
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorClearPartial(
-  TensorView<Element, Layout> view,       ///< destination tensor
-  Element element,
-  FillMode fill_mode,
-  int alignment,
-  cudaStream_t stream = nullptr) {
-  typedef detail::TensorClearPartialFunc<Element, Layout> Func;
-  typedef typename Func::Params Params;
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params{view, element, fill_mode, alignment},
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a tensor with a uniform value
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFill(
-  TensorView<Element, Layout> view,         ///< destination tensor
-  Element val = Element(0),                 ///< value to uniformly fill it with
-  cudaStream_t stream = nullptr) {
-  TensorFillDiagonal(view, val, val, stream);
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a tensor's diagonal with 1 and 0 everywhere else.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFillIdentity(
-  TensorView<Element, Layout> view,                 ///< destination tensor
-  cudaStream_t stream = nullptr) {
-  TensorFillDiagonal(view, Element(1), Element(0), stream);
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Computes a random Gaussian distribution
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorUpdateDiagonalFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  typedef typename TensorView::Element T;
-  /// Coordinate in tensor's index space
-  typedef typename TensorView::TensorCoord TensorCoord;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    TensorView view;
-    Element diag;
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Construction of Gaussian RNG functor.
-    Params(
-      TensorView view_ = TensorView(),
-      Element diag_ = Element(1)
-    ):
-      view(view_), diag(diag_) {
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG
-  CUTLASS_DEVICE
-  TensorUpdateDiagonalFunc(Params const &params): params(params) {
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    bool is_diag = true;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < Layout::kRank; ++i) {
-      if (coord[i] != coord[i - 1]) {
-        is_diag = false;
-        break;
-      }
-    }
-    if (is_diag) {
-      params.view.at(coord) = params.diag;
-    }
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Writes a uniform value to the diagonal of a tensor without modifying off-diagonal elements.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorUpdateDiagonal(
-  TensorView<Element, Layout> view,                 ///< destination tensor
-  Element diag = Element(1),
-  cudaStream_t stream = nullptr) {
-  typedef detail::TensorUpdateDiagonalFunc<Element, Layout> Func;
-  typedef typename Func::Params Params;
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params(view, diag),
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Computes a random Gaussian distribution
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorUpdateOffDiagonalFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  typedef typename TensorView::Element T;
-  /// Coordinate in tensor's index space
-  typedef typename TensorView::TensorCoord TensorCoord;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    TensorView view;
-    Element other;
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Construction of Gaussian RNG functor.
-    Params(
-      TensorView view_ = TensorView(),
-      Element other_ = Element(0)
-    ):
-      view(view_), other(other_) {
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG
-  CUTLASS_DEVICE
-  TensorUpdateOffDiagonalFunc(Params const &params): params(params) {
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    bool is_diag = true;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < Layout::kRank; ++i) {
-      if (coord[i] != coord[i - 1]) {
-        is_diag = false;
-        break;
-      }
-    }
-    if (!is_diag) {
-      params.view.at(coord) = params.other;
-    }
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Writes a uniform value to all elements in the tensor without modifying diagonal elements.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorUpdateOffDiagonal(
-  TensorView<Element, Layout> view,      ///< destination tensor
-  Element other = Element(1),
-  cudaStream_t stream = nullptr) {
-  typedef detail::TensorUpdateOffDiagonalFunc<Element, Layout> Func;
-  typedef typename Func::Params Params;
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params(view, other),
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Computes a random Gaussian distribution
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorFillLinearFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  typedef typename TensorView::Element T;
-  /// Coordinate in tensor's index space
-  typedef typename TensorView::TensorCoord TensorCoord;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    TensorView view;
-    Array<Element, Layout::kRank> v;
-    Element s;
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Construction of Gaussian RNG functor.
-    Params(
-      TensorView view_,      ///< destination tensor
-      Array<Element, Layout::kRank> const & v_,
-      Element s_ = Element(0)
-    ):
-      view(view_), v(v_), s(s_) {
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG
-  CUTLASS_DEVICE
-  TensorFillLinearFunc(Params const &params): params(params) {
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    Element sum = params.s;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 0; i < Layout::kRank; ++i) {
-      if constexpr (is_complex<Element>::value) {
-        if constexpr (sizeof_bits<Element>::value <= 32) {
-          sum = Element(static_cast<complex<float>>(sum) +
-                  static_cast<complex<float>>(params.v[i]) * static_cast<complex<float>>(coord[i]));
-        }
-      }
-      else if constexpr (sizeof_bits<Element>::value <= 32) {
-        if constexpr (std::numeric_limits<Element>::is_integer) {
-          sum = Element(static_cast<int32_t>(sum) +
-                  static_cast<int32_t>(params.v[i]) * static_cast<int32_t>(coord[i]));
-        }
-        else {
-          sum = Element(static_cast<float>(sum) +
-                  static_cast<float>(params.v[i]) * static_cast<float>(coord[i]));
-        }
-      }
-      else {
-        sum += params.v[i] * coord[i];
-      }
-    }
-    params.view.at(coord) = sum;
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills tensor with a linear combination of its coordinate and another vector
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFillLinear(
-  TensorView<Element, Layout> view,      ///< destination tensor
-  Array<Element, Layout::kRank> const & v,
-  Element s = Element(0),
-  cudaStream_t stream = nullptr) {
-  using Func = detail::TensorFillLinearFunc<Element, Layout>;
-  using Params = typename Func::Params;
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params(view, v, s),
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a tensor with random values from a distribution.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorFillRandom(
-  TensorView<Element, Layout> view,       ///< destination tensor
-  uint64_t seed,
-  Distribution dist,
-  cudaStream_t stream = nullptr,
-  int exclude_zero = -1                   ///< If non-negative, excludes 0.
-                                          ///  Note that setting this flag will result in more 1's,
-                                          ///  as we use a simple mechanism to replace 0's by adding/subtracting 1's.
-  ) {
-  using Real = typename RealType<Element>::Type;
-  if (dist.kind == Distribution::Gaussian) {
-    TensorFillRandomGaussian<Element, Layout>(
-      view,
-      seed,
-      static_cast<Real>(dist.gaussian.mean),
-      static_cast<Real>(dist.gaussian.stddev),
-      dist.int_scale,
-      exclude_zero,
-      stream);
-  } else if (dist.kind == Distribution::Uniform) {
-    TensorFillRandomUniform<Element, Layout>(
-      view,
-      seed,
-      static_cast<Real>(dist.uniform.max),
-      static_cast<Real>(dist.uniform.min),
-      dist.int_scale,
-      dist.uniform.pnan,
-      exclude_zero,
-      stream);
-  }
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a block of data with sequential elements
-template <
-  typename Element
->
-void BlockFillSequential(
-  Element *ptr,
-  int64_t capacity,
-  Element v = Element(1),
-  Element s = Element(0)) {
-  using Layout = layout::PackedVectorLayout;
-  Layout::TensorCoord size(static_cast<Layout::Index>(capacity)); // -Wconversion
-  Layout layout = Layout::packed(size);
-  TensorView<Element, Layout> view(ptr, layout, size);
-  Array<Element, Layout::kRank> c{};
-  c[0] = v;
-  TensorFillLinear(view, c, s);
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Fills a block of data with sequential elements
-template <
-  typename Element
->
-void BlockFillRandom(
-  Element *ptr,
-  size_t capacity,
-  uint64_t seed,
-  Distribution dist,
-  cudaStream_t stream = nullptr) {
-  using Real = typename RealType<Element>::Type;
-  if (dist.kind == Distribution::Gaussian) {
-    BlockFillRandomGaussian<Element>(
-      ptr,
-      capacity,
-      seed,
-      static_cast<Real>(dist.gaussian.mean),
-      static_cast<Real>(dist.gaussian.stddev),
-      dist.int_scale,
-      stream);
-  }
-  else if (dist.kind == Distribution::Uniform) {
-    BlockFillRandomUniform<Element>(
-      ptr,
-      capacity,
-      seed,
-      static_cast<Real>(dist.uniform.max),
-      static_cast<Real>(dist.uniform.min),
-      dist.int_scale,
-      dist.uniform.pnan,
-      stream);
-  }
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Computes a random Gaussian distribution
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorCopyDiagonalInFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  typedef typename TensorView::Element T;
-  /// Coordinate in tensor's index space
-  typedef typename TensorView::TensorCoord TensorCoord;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    TensorView view;
-    Element const *ptr;
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Construction of Gaussian RNG functor.
-    Params(
-      TensorView view_,      ///< destination tensor
-      Element const *ptr_
-    ):
-      view(view_), ptr(ptr_) {
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG
-  CUTLASS_DEVICE
-  TensorCopyDiagonalInFunc(Params const &params): params(params) {
-  }
-  /// Only update the diagonal element
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    bool is_diagonal = true;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < Layout::kRank; ++i) {
-      if (coord[i] != coord[0]) {
-        is_diagonal = false;
-      }
-    }
-    if (is_diagonal) {
-      params.view.at(coord) = params.ptr[coord[0]];
-    }
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Copies a diagonal in from host memory without modifying off-diagonal elements.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorCopyDiagonalIn(
-  TensorView<Element, Layout> view,   ///< destination tensor
-  Element const *ptr,                        ///< dense buffer of elements
-  cudaStream_t stream = nullptr) {
-  using Func = detail::TensorCopyDiagonalInFunc<Element, Layout>;
-  using Params = typename Func::Params;
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params(view, ptr),
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Computes a random Gaussian distribution
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorCopyDiagonalOutFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Scalar type
-  typedef typename TensorView::Element T;
-  /// Coordinate in tensor's index space
-  typedef typename TensorView::TensorCoord TensorCoord;
-  /// Parameters structure
-  struct Params {
-    //
-    // Data members
-    //
-    TensorView view;
-    Element *ptr;
-    /// Default ctor
-    CUTLASS_HOST_DEVICE
-    Params() { }
-    //
-    // Methods
-    //
-    /// Construction of Gaussian RNG functor.
-    Params(
-      TensorView view_,      ///< destination tensor
-      Element *ptr_
-    ):
-      view(view_), ptr(ptr_) {
-    }
-  };
-  //
-  // Data members
-  //
-  /// Parameters object
-  Params params;
-  //
-  // Methods
-  //
-  /// Device-side initialization of RNG
-  CUTLASS_DEVICE
-  TensorCopyDiagonalOutFunc(Params const &params): params(params) {
-  }
-  /// Compute random value and update RNG state
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    bool is_diagonal = true;
-    CUTLASS_PRAGMA_UNROLL
-    for (int i = 1; i < Layout::kRank; ++i) {
-      if (coord[i] != coord[0]) {
-        is_diagonal = false;
-      }
-    }
-    if (is_diagonal) {
-      params.ptr[coord[0]] = params.view.at(coord);
-    }
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Copies the diagonal of a tensor into a dense buffer in host memory.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorCopyDiagonalOut(
-  Element *ptr,                               ///< dense buffer of elements
-  TensorView<Element, Layout> view,      ///< source tensor
-  cudaStream_t stream = nullptr) {
-  using Func = detail::TensorCopyDiagonalOutFunc<Element, Layout>;
-  using Params = typename Func::Params;
-  TensorForEach<Func, Layout::kRank, Params>(
-    view.extent(),
-    Params(view, ptr),
-    /*grid_size*/0, /*block_size*/0,
-    stream
-  );
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_foreach.h DELETED Viewed

@@ -1,142 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <stdexcept>
-#include "cutlass/cutlass.h"
-#include "cutlass/util/reference/device/kernel/tensor_foreach.h"
-namespace cutlass  {
-namespace reference {
-namespace device {
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Launches a kernel calling a functor for each element in a tensor's index space.
-template <typename Func, int Rank, typename Params>
-struct TensorForEach {
-  /// Constructor performs the operation.
-  TensorForEach(
-    Coord<Rank> size, Params params = Params(),
-    int grid_size = 0, int block_size = 0,
-    cudaStream_t stream = nullptr) {
-    if (!grid_size || !block_size) {
-      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
-      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
-        &grid_size,
-        &block_size,
-        reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));
-      if (result != cudaSuccess) {
-        throw std::runtime_error("Failed to query occupancy.");
-      }
-      // Limit block size. This has the effect of increasing the number of items processed by a
-      // single thread and reduces the impact of initialization overhead.
-      block_size = (block_size < 128 ? block_size : 128);
-    }
-    dim3 grid(grid_size, 1, 1);
-    dim3 block(block_size, 1, 1);
-    kernel::TensorForEach<Func, Rank, Params><<< grid, block, 0, stream >>>(size, params);
-  }
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Launches a kernel calling a functor for each element along a tensor's diagonal
-template <typename Func, int Rank, typename Params>
-struct TensorDiagonalForEach {
-  /// Constructor performs the operation
-  TensorDiagonalForEach(
-    Coord<Rank> size, Params params = Params(),
-    int start = 0, int end = -1,
-    int block_size = 128, cudaStream_t stream = nullptr) {
-    if (end < 0) {
-      end = size.min();
-    }
-    dim3 block(block_size, 1, 1);
-    dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
-    kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block, 0, stream >>>(
-      size, params, start, end);
-  }
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////
-template <typename Element, typename Func>
-struct BlockForEach {
-  /// Constructor performs the operation.
-  BlockForEach(
-    Element *ptr,
-    size_t capacity,
-    typename Func::Params params = typename Func::Params(),
-    int grid_size = 0,
-    int block_size = 0,
-    cudaStream_t stream = nullptr) {
-    if (!grid_size || !block_size) {
-      // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
-      cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
-        &grid_size,
-        &block_size,
-        reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));
-      if (result != cudaSuccess) {
-        throw std::runtime_error("Failed to query occupancy.");
-      }
-      // Limit block size. This has the effect of increasing the number of items processed by a
-      // single thread and reduces the impact of initialization overhead.
-      block_size = (block_size < 128 ? block_size : 128);
-    }
-    dim3 grid(grid_size, 1, 1);
-    dim3 block(block_size, 1, 1);
-    kernel::BlockForEach<Element, Func><<< grid, block, 0, stream >>>(ptr, capacity, params);
-  }
-};
-///////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_reduce.h DELETED Viewed

@@ -1,514 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <cmath>
-#include "cutlass/cutlass.h"
-#include "cutlass/complex.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/util/device_memory.h"
-#include "cutlass/util/reference/detail/linear_to_coordinate.h"
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace reference {
-namespace device {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace kernel {
-/// Computes one partial reduction per thread block.
-///
-/// Each thread folds the transformed elements it visits into its private copy of `identity`.
-/// The per-thread results are staged in shared memory, and thread 0 of the block combines them
-/// and writes the block's result to workspace[blockIdx.x].
-///
-/// The shared scratchpad has kBlockSize entries and thread 0 combines exactly kBlockSize of
-/// them, so the kernel is expected to be launched with blockDim.x == kBlockSize.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType,
-  typename ReduceOp,
-  typename TransformOp,
-  int kBlockSize = 128
->
-__global__ void TensorTransformReducePartial(
-  TensorView<Element, Layout> view,     /// View of the tensor to reduce over
-  ComputeType identity,                 /// Identity element of the reduction operation
-  ReduceOp reduce,                      /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType
-  TransformOp transform,                /// Transforms the tensor element to ComputeType: g(Element) => ComputeType
-  ComputeType *workspace) {             /// Device-side workspace receiving one partial result per thread block, stored in workspace[blockIdx.x]
-  // Widen before multiplying so the linear index and the stride are computed in 64-bit
-  // arithmetic instead of wrapping in the 32-bit unsigned type of the built-in index variables.
-  int64_t idx = static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
-                static_cast<int64_t>(threadIdx.x);
-  int64_t const stride = static_cast<int64_t>(blockDim.x) * static_cast<int64_t>(gridDim.x);
-  int64_t size = view.size();
-  __shared__ ComputeType scratchpad[kBlockSize];
-  for (; idx < size; idx += stride) {
-    // Map linear thread ID onto tensor coordinate
-    typename Layout::TensorCoord coord;
-    cutlass::reference::detail::LinearToCoordinate<Layout::kRank>()(coord, idx, view.extent());
-    if (view.contains(coord)) {
-      // Fetch element
-      Element x = view.at(coord);
-      // Transform
-      identity = reduce(identity, transform(x));
-    }
-  }
-  scratchpad[threadIdx.x] = identity;
-  __syncthreads();
-  // One thread performs the final reduction and stores out. This could be enhanced via
-  // a tree reduction and pipelining.
-  if (threadIdx.x == 0) {
-    // Thread 0's own contribution is already in `identity`, so start at entry 1.
-    for (int i = 1; i < kBlockSize; ++i) {
-      identity = reduce(identity, scratchpad[i]);
-    }
-    workspace[blockIdx.x] = identity;
-  }
-}
-/// Computes one partial reduction per thread block over two tensors visited in lockstep.
-///
-/// Each thread folds transform(a, b) for the coordinates it visits into its private copy of
-/// `identity`. The per-thread results are staged in shared memory, and thread 0 of the block
-/// combines them and writes the block's result to workspace[blockIdx.x].
-///
-/// Coordinates are generated from and bounds-checked against view_A only. The shared scratchpad
-/// has kBlockSize entries and thread 0 combines exactly kBlockSize of them, so the kernel is
-/// expected to be launched with blockDim.x == kBlockSize.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType,
-  typename ReduceOp,
-  typename TransformOp,
-  int kBlockSize = 128
->
-__global__ void TensorTransformReducePartial(
-  TensorView<Element, Layout> view_A,   /// View of the first tensor to reduce over
-  TensorView<Element, Layout> view_B,   /// View of the second tensor to reduce over
-  ComputeType identity,                 /// Identity element of the reduction operation
-  ReduceOp reduce,                      /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType
-  TransformOp transform,                /// Transforms a pair of tensor elements to ComputeType: g(Element, Element) => ComputeType
-  ComputeType *workspace) {             /// Device-side workspace receiving one partial result per thread block, stored in workspace[blockIdx.x]
-  // Widen before multiplying so the linear index and the stride are computed in 64-bit
-  // arithmetic instead of wrapping in the 32-bit unsigned type of the built-in index variables.
-  int64_t idx = static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x) +
-                static_cast<int64_t>(threadIdx.x);
-  int64_t const stride = static_cast<int64_t>(blockDim.x) * static_cast<int64_t>(gridDim.x);
-  auto size = static_cast<int64_t>(view_A.size());
-  __shared__ ComputeType scratchpad[kBlockSize];
-  for (; idx < size; idx += stride) {
-    // Map linear thread ID onto tensor coordinate
-    typename Layout::TensorCoord coord;
-    cutlass::reference::detail::LinearToCoordinate<Layout::kRank>()(coord, idx, view_A.extent());
-    if (view_A.contains(coord)) {
-      // Fetch elements
-      Element a = view_A.at(coord);
-      Element b = view_B.at(coord);
-      // Transform
-      identity = reduce(identity, transform(a, b));
-    }
-  }
-  scratchpad[threadIdx.x] = identity;
-  __syncthreads();
-  // One thread performs the final reduction and stores out. This could be enhanced via
-  // a tree reduction and pipelining.
-  if (threadIdx.x == 0) {
-    // Thread 0's own contribution is already in `identity`, so start at entry 1.
-    for (int i = 1; i < kBlockSize; ++i) {
-      identity = reduce(identity, scratchpad[i]);
-    }
-    workspace[blockIdx.x] = identity;
-  }
-}
-/// Combines the first `workspace_size` entries of `workspace` and stores the result in
-/// workspace[0].
-///
-/// Every thread folds the entries at threadIdx.x, threadIdx.x + kBlockSize, ... into its private
-/// copy of `identity`; thread 0 then combines the kBlockSize per-thread values staged in shared
-/// memory.
-template <
-  typename ComputeType,
-  typename ReduceOp,
-  int kBlockSize = 32
->
-__global__ void TensorTransformReduceFinalize(
-  ComputeType *workspace,
-  ComputeType identity,
-  int workspace_size,
-  ReduceOp reduce) {
-  __shared__ ComputeType scratchpad[kBlockSize];
-  // Strided pass over the workspace.
-  int entry = threadIdx.x;
-  while (entry < workspace_size) {
-    identity = reduce(identity, workspace[entry]);
-    entry += kBlockSize;
-  }
-  scratchpad[threadIdx.x] = identity;
-  __syncthreads();
-  // Only thread 0 performs the final combination.
-  if (threadIdx.x != 0) {
-    return;
-  }
-  // Thread 0's own contribution is already in `identity`, so start at entry 1.
-  for (int lane = 1; lane < kBlockSize; ++lane) {
-    identity = reduce(identity, scratchpad[lane]);
-  }
-  workspace[0] = identity;
-}
-} // namespace kernel
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Transform-reduce operation over the elements of a tensor
-///
-/// Launches the partial-reduction kernel with `workspace_size` thread blocks of 128 threads,
-/// then a single 32-thread finalize kernel, and waits for `stream` to finish.
-///
-/// Throws std::runtime_error if `workspace_size` is not positive, if synchronizing the stream
-/// fails, or (when `copy_out` is true) if copying the result back to the host fails.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType,
-  typename ReduceOp,
-  typename TransformOp
->
-ComputeType TensorTransformReduce(
-  TensorView<Element, Layout> view,     /// View of the tensor to reduce over
-  ComputeType identity,                 /// Identity element of the reduction operation
-  ReduceOp reduce,                      /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType
-  TransformOp transform,                /// Transforms the tensor element to ComputeType: g(Element) => ComputeType
-  ComputeType *workspace,               /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0]
-  int workspace_size,                   /// Number of elements in workspace
-  cudaStream_t stream = nullptr,        /// CUDA stream to launch into
-  bool copy_out = true                  /// If true, the value of workspace[0] is copied to host and returned. Otherwise, `identity` is returned.
-) {
-  // workspace_size is also used as the grid size of the partial kernel, so it must be positive.
-  if (workspace_size <= 0) {
-    throw std::runtime_error("workspace_size must be positive");
-  }
-  int const kBlockSize = 128;
-  dim3 block(kBlockSize, 1);
-  dim3 grid(workspace_size, 1);
-  kernel::TensorTransformReducePartial<
-    Element, Layout, ComputeType, ReduceOp, TransformOp, kBlockSize
-  ><<< grid, block, 0, stream >>>(
-    view, identity, reduce, transform, workspace
-  );
-  int const kFinalizeBlockSize = 32;
-  kernel::TensorTransformReduceFinalize<
-    ComputeType, ReduceOp, kFinalizeBlockSize
-  ><<< dim3(1, 1), dim3(kFinalizeBlockSize, 1), 0, stream >>>(
-    workspace, identity, workspace_size, reduce
-  );
-  // Report a failed synchronization instead of silently discarding its status.
-  cudaError_t result = cudaStreamSynchronize(stream);
-  if (result != cudaSuccess) {
-    throw std::runtime_error("cudaStreamSynchronize() failed");
-  }
-  if (copy_out) {
-    result = cudaMemcpy(&identity, workspace, sizeof(identity), cudaMemcpyDeviceToHost);
-    if (result != cudaSuccess) {
-      throw std::runtime_error("cudaMemcpy() failed");
-    }
-  }
-  return identity;
-}
-/// Transform-reduce operation over the elements of two tensors, zipped together
-///
-/// Launches the two-tensor partial-reduction kernel with `workspace_size` thread blocks of 128
-/// threads, then a single 32-thread finalize kernel, and waits for `stream` to finish.
-///
-/// Throws std::runtime_error if the extents of the two views differ, if `workspace_size` is not
-/// positive, if synchronizing the stream fails, or (when `copy_out` is true) if copying the
-/// result back to the host fails.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType,
-  typename ReduceOp,
-  typename TransformOp
->
-ComputeType TensorTransformReduce(
-  TensorView<Element, Layout> view_A,   /// View of the first tensor to reduce over
-  TensorView<Element, Layout> view_B,   /// View of the second tensor to reduce over
-  ComputeType identity,                 /// Identity element of the reduction operation
-  ReduceOp reduce,                      /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType
-  TransformOp transform,                /// Transforms a pair of tensor elements to ComputeType: g(Element, Element) => ComputeType
-  ComputeType *workspace,               /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0]
-  int workspace_size,                   /// Number of elements in workspace
-  cudaStream_t stream = nullptr,        /// CUDA stream to launch into
-  bool copy_out = true                  /// If true, the value of workspace[0] is copied to host and returned. Otherwise, `identity` is returned.
-) {
-  if (view_A.extent() != view_B.extent()) {
-    throw std::runtime_error("Extents must be equal.");
-  }
-  // workspace_size is also used as the grid size of the partial kernel, so it must be positive.
-  if (workspace_size <= 0) {
-    throw std::runtime_error("workspace_size must be positive");
-  }
-  int const kBlockSize = 128;
-  dim3 block(kBlockSize, 1);
-  dim3 grid(workspace_size, 1);
-  kernel::TensorTransformReducePartial<
-    Element, Layout, ComputeType, ReduceOp, TransformOp, kBlockSize
-  ><<< grid, block, 0, stream >>>(
-    view_A, view_B, identity, reduce, transform, workspace
-  );
-  int const kFinalizeBlockSize = 32;
-  kernel::TensorTransformReduceFinalize<
-    ComputeType, ReduceOp, kFinalizeBlockSize
-  ><<< dim3(1, 1), dim3(kFinalizeBlockSize, 1), 0, stream >>>(
-    workspace, identity, workspace_size, reduce
-  );
-  // Report a failed synchronization instead of silently discarding its status.
-  cudaError_t result = cudaStreamSynchronize(stream);
-  if (result != cudaSuccess) {
-    throw std::runtime_error("cudaStreamSynchronize() failed");
-  }
-  if (copy_out) {
-    result = cudaMemcpy(&identity, workspace, sizeof(identity), cudaMemcpyDeviceToHost);
-    if (result != cudaSuccess) {
-      throw std::runtime_error("cudaMemcpy() failed");
-    }
-  }
-  return identity;
-}
-/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side
-/// workspace
-///
-/// When `workspace_size` is zero, the multiprocessor count of the current device is used as the
-/// workspace size. The reduced value is always copied back to the host and returned.
-///
-/// Throws std::runtime_error if the device query fails.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType,
-  typename ReduceOp,
-  typename TransformOp
->
-ComputeType TensorTransformReduce(
-  TensorView<Element, Layout> view,
-  ComputeType identity,
-  ReduceOp reduce,
-  TransformOp transform,
-  cudaStream_t stream = nullptr,
-  int workspace_size = 0
-) {
-  // Optionally query for the SM count to size the workspace.
-  if (!workspace_size) {
-    int device_idx = 0;
-    cudaError_t result = cudaGetDevice(&device_idx);
-    if (result != cudaSuccess) {
-      throw std::runtime_error("cudaGetDevice() failed");
-    }
-    // Only the SM count is needed, so query that single attribute rather than filling in an
-    // entire cudaDeviceProp structure.
-    int sm_count = 0;
-    result = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_idx);
-    if (result != cudaSuccess) {
-      throw std::runtime_error("cudaDeviceGetAttribute() failed");
-    }
-    workspace_size = sm_count;
-  }
-  DeviceAllocation<ComputeType> workspace(workspace_size);
-  return TensorTransformReduce(
-    view,
-    identity,
-    reduce,
-    transform,
-    workspace.get(),
-    workspace_size,
-    stream,
-    true);
-}
-/// Transform-reduce operation over the elements of two tensors, zipped together. This helper
-/// allocates the device-side workspace
-///
-/// When `workspace_size` is zero, the multiprocessor count of the current device is used as the
-/// workspace size. The reduced value is always copied back to the host and returned.
-///
-/// Throws std::runtime_error if the device query fails.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType,
-  typename ReduceOp,
-  typename TransformOp
->
-ComputeType TensorTransformReduce(
-  TensorView<Element, Layout> view_A,
-  TensorView<Element, Layout> view_B,
-  ComputeType identity,
-  ReduceOp reduce,
-  TransformOp transform,
-  cudaStream_t stream = nullptr,
-  int workspace_size = 0
-) {
-  // Optionally query for the SM count to size the workspace.
-  if (!workspace_size) {
-    int device_idx = 0;
-    cudaError_t result = cudaGetDevice(&device_idx);
-    if (result != cudaSuccess) {
-      throw std::runtime_error("cudaGetDevice() failed");
-    }
-    // Only the SM count is needed, so query that single attribute rather than filling in an
-    // entire cudaDeviceProp structure.
-    int sm_count = 0;
-    result = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_idx);
-    if (result != cudaSuccess) {
-      throw std::runtime_error("cudaDeviceGetAttribute() failed");
-    }
-    workspace_size = sm_count;
-  }
-  DeviceAllocation<ComputeType> workspace(workspace_size);
-  return TensorTransformReduce(
-    view_A,
-    view_B,
-    identity,
-    reduce,
-    transform,
-    workspace.get(),
-    workspace_size,
-    stream,
-    true);
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Helper to compute the sum of the elements of a tensor
-///
-/// Elements are converted to ComputeType with NumericConverter and combined with
-/// plus<ComputeType>; `identity`, `stream` and `workspace_size` are forwarded unchanged to
-/// TensorTransformReduce.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType = Element
->
-ComputeType TensorSum(
-  TensorView<Element, Layout> view,
-  ComputeType identity = ComputeType(),
-  cudaStream_t stream = nullptr,
-  int workspace_size = 0
-) {
-  using Accumulate = plus<ComputeType>;
-  using Convert = NumericConverter<ComputeType, Element>;
-  return TensorTransformReduce(
-    view, identity, Accumulate(), Convert(), stream, workspace_size);
-}
-/// Helper to compute the sum of the squares of the elements of a tensor
-///
-/// Elements are mapped through magnitude_squared<Element, ComputeType> and combined with
-/// plus<ComputeType>; `identity`, `stream` and `workspace_size` are forwarded unchanged to
-/// TensorTransformReduce.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType = Element
->
-ComputeType TensorSumSq(
-  TensorView<Element, Layout> view,
-  ComputeType identity = ComputeType(),
-  cudaStream_t stream = nullptr,
-  int workspace_size = 0
-) {
-  using Accumulate = plus<ComputeType>;
-  using Square = magnitude_squared<Element, ComputeType>;
-  return TensorTransformReduce(
-    view, identity, Accumulate(), Square(), stream, workspace_size);
-}
-/// Helper to compute the norm of the elements of a tensor.
-///
-/// Returns the square root of TensorSumSq() evaluated with the same arguments.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType = double
->
-ComputeType TensorNorm(
-  TensorView<Element, Layout> view,
-  ComputeType identity = ComputeType(),
-  cudaStream_t stream = nullptr,
-  int workspace_size = 0
-) {
-  auto const sum_of_squares = TensorSumSq(view, identity, stream, workspace_size);
-  return std::sqrt(sum_of_squares);
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-/// Helper to compute the sum of the squares of the differences of two tensors
-///
-/// Element pairs are mapped through magnitude_squared_difference<Element, ComputeType> and
-/// combined with plus<ComputeType>; `identity`, `stream` and `workspace_size` are forwarded
-/// unchanged to the two-tensor TensorTransformReduce.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType = double
->
-ComputeType TensorSumSqDiff(
-  TensorView<Element, Layout> view_A,
-  TensorView<Element, Layout> view_B,
-  ComputeType identity = ComputeType(),
-  cudaStream_t stream = nullptr,
-  int workspace_size = 0
-) {
-  using Accumulate = plus<ComputeType>;
-  using SquaredDifference = magnitude_squared_difference<Element, ComputeType>;
-  return TensorTransformReduce(
-    view_A, view_B, identity, Accumulate(), SquaredDifference(), stream, workspace_size);
-}
-/// Helper to compute the norm of the tensor computed as the difference of two tensors in memory
-///
-/// Returns the square root of TensorSumSqDiff() evaluated with the same arguments.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType = double
->
-ComputeType TensorNormDiff(
-  TensorView<Element, Layout> view_A,
-  TensorView<Element, Layout> view_B,
-  ComputeType identity = ComputeType(),
-  cudaStream_t stream = nullptr,
-  int workspace_size = 0
-) {
-  auto const sum_of_squares = TensorSumSqDiff(view_A, view_B, identity, stream, workspace_size);
-  return std::sqrt(sum_of_squares);
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace device
-} // namespace reference
-} // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/tensor_relu.h DELETED Viewed

@@ -1,141 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-  \brief Defines device-side elementwise operations on TensorView. Note, the operations defined
-    in this header are not specialized for any particular data layout and are therefore not
-    intended to offer the best possible performance. Rather, they are intended to be generic
-    reference implementations to support the CUTLASS unit tests.
-*/
-#pragma once
-// Cutlass includes
-#include "cutlass/cutlass.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/util/reference/device/tensor_foreach.h"
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass {
-namespace reference {
-namespace device {
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-/// Functor applied per coordinate: replaces every element that is below the threshold with the
-/// threshold itself.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-struct TensorReLuFunc {
-  /// View type
-  using TensorView = TensorView<Element, Layout>;
-  /// Coordinate in tensor's index space
-  using TensorCoord = typename TensorView::TensorCoord;
-  /// Parameters structure
-  struct Params {
-    /// Tensor that is modified in place
-    TensorView view;
-    /// Lower bound applied to every element
-    Element threshold;
-    /// Stores the view and the threshold; both arguments are optional.
-    Params(
-      TensorView view_ = TensorView(),
-      Element threshold_ = Element(0)
-    ):
-      view(view_), threshold(threshold_) {
-    }
-  };
-  /// Copy of the parameters this functor operates on
-  Params params;
-  /// Captures the parameters.
-  CUTLASS_DEVICE
-  TensorReLuFunc(Params const &params): params(params) {
-  }
-  /// Writes max(element, threshold) back to the element at `coord`.
-  CUTLASS_DEVICE
-  void operator()(TensorCoord const &coord) {
-    Element const & current = params.view.at(coord);
-    if (current < params.threshold) {
-      params.view.at(coord) = params.threshold;
-    }
-    else {
-      params.view.at(coord) = current;
-    }
-  }
-};
-} // namespace detail
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Apply ReLu on a tensor
-///
-/// Runs detail::TensorReLuFunc over every coordinate of `view` via TensorForEach, so that
-/// elements below `threshold` are replaced by `threshold`.
-template <
-  typename Element,               ///< Element type
-  typename Layout>                ///< Layout function
-void TensorReLu(
-  TensorView<Element, Layout> view,       ///< destination tensor
-  Element threshold = Element(0)) {         ///< ReLu threshold
-  using ReLuFunc = detail::TensorReLuFunc<Element, Layout>;
-  using ReLuParams = typename ReLuFunc::Params;
-  // Constructing the TensorForEach temporary performs the operation.
-  TensorForEach<ReLuFunc, Layout::kRank, ReLuParams>(view.extent(), ReLuParams(view, threshold));
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/device/thread/gemm.h DELETED Viewed

@@ -1,186 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Thread-level reference implementation for GEMM, callable from host or device code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-namespace cutlass {
-namespace reference {
-namespace device {
-namespace thread {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Thread-level blocked general matrix product.
-//
-// Note, this is a reference implementation. Performance is not expected to approach peak.
-//
-// Tile naming as used by this class: OutputTile::kColumn is the number of elements fetched from
-// A (offsets added to the output row) and OutputTile::kRow is the number of elements fetched
-// from B (offsets added to the output column). Accumulators are addressed as accum[j][i] with
-// j < OutputTile::kRow and i < OutputTile::kColumn, and the array is declared with matching
-// extents so that non-square tiles stay in bounds.
-//
-template <
-  typename TensorRefA,
-  typename TensorRefB,
-  typename TensorRefC,
-  typename ScalarType,
-  typename AccumulatorType,
-  typename OutputTile,
-  typename InnerProductOp = multiply_add<AccumulatorType>,
-  typename ConvertOp = NumericConverter<typename TensorRefC::Element, ScalarType>
->
-struct Gemm {
-  using ElementA = typename TensorRefA::Element;
-  using ElementB = typename TensorRefB::Element;
-  using ElementC = typename TensorRefC::Element;
-  //
-  // Data members
-  //
-  /// Tile for A operand
-  ElementA A_tile[OutputTile::kColumn];
-  /// Tile for B operand
-  ElementB B_tile[OutputTile::kRow];
-  /// Tile for Accumulator, indexed as accum[j][i] (j: B/column offset, i: A/row offset)
-  AccumulatorType accum[OutputTile::kRow][OutputTile::kColumn];
-  //
-  // Methods
-  //
-  /// Constructor: zeroes the operand tiles and sets every accumulator to `initial_accum`.
-  CUTLASS_HOST_DEVICE
-  Gemm(AccumulatorType initial_accum = AccumulatorType(0)) {
-    // Clear fetch registers
-    for (int i = 0; i < OutputTile::kColumn; ++i) {
-      A_tile[i] = ElementA(0);
-    }
-    for (int j = 0; j < OutputTile::kRow; ++j) {
-      B_tile[j] = ElementB(0);
-    }
-    // Clear accumulators, using the same index ranges as multiply_add() and epilogue()
-    CUTLASS_PRAGMA_UNROLL
-    for (int j = 0; j < OutputTile::kRow; ++j) {
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < OutputTile::kColumn; ++i) {
-        accum[j][i] = initial_accum;
-      }
-    }
-  }
-  /// Computes a matrix product
-  ///
-  /// For each k in [0, problem_size.k()), fetches the in-bounds slice of A and B for the tile
-  /// anchored at `output_coord` and accumulates their outer product into `accum`. Tile entries
-  /// that fall outside the problem keep their previous value (zero after construction).
-  /// Returns *this so that calls can be chained.
-  CUTLASS_HOST_DEVICE
-  Gemm & multiply_add(
-    gemm::GemmCoord problem_size,
-    TensorRefA tensor_a,
-    TensorRefB tensor_b,
-    MatrixCoord output_coord = MatrixCoord()) {
-    InnerProductOp inner_product_op;
-    // Loop over the GEMM K dimension
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (int k = 0; k < problem_size.k(); ++k) {
-      // Fetch a slice of the A matrix
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < OutputTile::kColumn; ++i) {
-        if (output_coord.row() + i < problem_size.m()) {
-          A_tile[i] = tensor_a.at(make_Coord(output_coord.row() + i, k));
-        }
-      }
-      // Fetch a slice of the B matrix
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < OutputTile::kRow; ++j) {
-        if (output_coord.column() + j < problem_size.n()) {
-          B_tile[j] = tensor_b.at(make_Coord(k, output_coord.column() + j));
-        }
-      }
-      // Compute an accumulated matrix product
-      CUTLASS_PRAGMA_UNROLL
-      for (int j = 0; j < OutputTile::kRow; ++j) {
-        CUTLASS_PRAGMA_UNROLL
-        for (int i = 0; i < OutputTile::kColumn; ++i) {
-          accum[j][i] = inner_product_op(A_tile[i], B_tile[j], accum[j][i]);
-        }
-      }
-    }
-    return *this;
-  }
-  /// Performs linear scaling of matrix product and updates output tensor
-  ///
-  /// For every in-bounds coordinate of the tile anchored at `output_coord`, writes
-  /// convert_op(alpha * accum + beta * tensor_c) to `tensor_d`. Returns *this.
-  CUTLASS_HOST_DEVICE
-  Gemm & epilogue(
-    gemm::GemmCoord problem_size,
-    ScalarType alpha,
-    ScalarType beta,
-    TensorRefC tensor_c,
-    TensorRefC tensor_d,
-    MatrixCoord output_coord = MatrixCoord()) {
-    ConvertOp convert_op;
-    // Update the output tensor
-    for (int j = 0; j < OutputTile::kRow; ++j) {
-      for (int i = 0; i < OutputTile::kColumn; ++i) {
-        MatrixCoord coord = output_coord + MatrixCoord(i, j);
-        if (coord.row() < problem_size.m() && coord.column() < problem_size.n()) {
-          tensor_d.at(coord) = convert_op(
-            alpha * ScalarType(accum[j][i]) +
-            beta * ScalarType(tensor_c.at(coord))
-          );
-        }
-      }
-    }
-    return *this;
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace thread
-} // namespace device
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/conv.hpp DELETED Viewed

@@ -1,782 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for CONV in host-side code.
-*/
-#pragma once
-/////////////////////////////////////////////////////////////////////////////////////////////////
-#include "cutlass/complex.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/epilogue/thread/activation.h"
-#include "cute/tensor.hpp"
-#include <cuda_runtime.h>
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace cutlass::reference::host {
-/////////////////////////////////////////////////////////////////////////////////////////////////
-namespace detail {
-template<class EngineAct, class LayoutAct>
-bool
-is_activation_in_bounds(  // 3-spatial-dim overload: true iff every index lies in [0, size<mode>(activation)) for its mode
-    cute::Tensor<EngineAct, LayoutAct> const& activation,
-    int32_t n_, int32_t d_, int32_t h_, int32_t w_, int32_t c_, int32_t g_) {  // note: argument order (n, d, h, w, c, g) differs from the mode order checked below (c, w, h, d, n, g)
-  return ((g_ >= 0 && g_ < size<5>(activation)) &&
-          (n_ >= 0 && n_ < size<4>(activation)) &&
-          (d_ >= 0 && d_ < size<3>(activation)) &&
-          (h_ >= 0 && h_ < size<2>(activation)) &&
-          (w_ >= 0 && w_ < size<1>(activation)) &&
-          (c_ >= 0 && c_ < size<0>(activation)));
-}
-template<class EngineAct, class LayoutAct>
-bool
-is_activation_in_bounds(  // 2-spatial-dim overload: true iff every index lies in [0, size<mode>(activation)) for its mode
-    cute::Tensor<EngineAct, LayoutAct> const& activation,
-    int32_t n_, int32_t h_, int32_t w_, int32_t c_, int32_t g_) {  // note: argument order (n, h, w, c, g) differs from the mode order checked below (c, w, h, n, g)
-  return ((g_ >= 0 && g_ < size<4>(activation)) &&
-          (n_ >= 0 && n_ < size<3>(activation)) &&
-          (h_ >= 0 && h_ < size<2>(activation)) &&
-          (w_ >= 0 && w_ < size<1>(activation)) &&
-          (c_ >= 0 && c_ < size<0>(activation)));
-}
-template<class EngineAct, class LayoutAct>
-bool
-is_activation_in_bounds(  // 1-spatial-dim overload: true iff every index lies in [0, size<mode>(activation)) for its mode
-    cute::Tensor<EngineAct, LayoutAct> const& activation,
-    int32_t n_, int32_t w_, int32_t c_, int32_t g_) {  // note: argument order (n, w, c, g) differs from the mode order checked below (c, w, n, g)
-  return ((g_ >= 0 && g_ < size<3>(activation)) &&
-          (n_ >= 0 && n_ < size<2>(activation)) &&
-          (w_ >= 0 && w_ < size<1>(activation)) &&
-          (c_ >= 0 && c_ < size<0>(activation)));
-}
-} // namespace detail
-template<
-  class ElementAcc_,
-  class ElementScalar_,
-  class ElementCompute_,
-  class ElementC_,
-  class ElementOut_,
-  bool ResidualAdd_,
-  class TensorAlpha_,
-  class TensorBeta_,
-  class TensorBias_,
-  class ActivationFunctor_ = cutlass::epilogue::thread::Identity<ElementCompute_>  // defaults to the Identity functor on ElementCompute_
->
-struct ConvEpilogueFusionParams {  // Bundles the epilogue element types, the ResidualAdd flag, and the alpha/beta/bias values read by ConvReferenceImpl
-  using ElementAcc = ElementAcc_;
-  using ElementScalar = ElementScalar_;
-  using ElementCompute = ElementCompute_;
-  using ElementC = ElementC_;
-  using ElementOut = ElementOut_;
-  using TensorAlpha = TensorAlpha_;
-  using TensorBeta = TensorBeta_;
-  using TensorBias = TensorBias_;
-  using ActivationFunctor = ActivationFunctor_;
-  static constexpr bool ResidualAdd = ResidualAdd_; // true: beta * C is added after the activation; false: it is added before bias and activation
-  ElementScalar alpha = ElementScalar(1);  // scalar scale for the accumulator; ConvReferenceImpl uses it when tensor_alpha has a null data pointer
-  ElementScalar beta = ElementScalar(0);  // scalar scale for the source C; ConvReferenceImpl uses it when tensor_beta has a null data pointer
-  TensorAlpha tensor_alpha{};  // optional alpha tensor, indexed with a single channel index by ConvReferenceImpl
-  TensorBeta tensor_beta{};  // optional beta tensor, indexed the same way as tensor_alpha
-  TensorBias tensor_bias{};  // optional bias tensor; ConvReferenceImpl adds it only when its data pointer is non-null
-};
-template<
-  cutlass::conv::Operator ConvOp,
-  int NumSpatialDims,
-  class TensorA,
-  class TensorB,
-  class TensorC,
-  class TensorD,
-  class ShapePadding,
-  class StrideTraversal,
-  class ShapeDilation,
-  class EpilogueFusionParams
->
-struct ConvReferenceImpl {
-  // Accumulate in float (or in double when EpilogueFusionParams::ElementAcc is double) to avoid losing precision in the accumulating adds.
-  using ElementAcc = cutlass::platform::conditional_t<cutlass::platform::is_same_v<typename EpilogueFusionParams::ElementAcc, double>, double, float>;
-  using ElementC = typename EpilogueFusionParams::ElementC;
-  using ElementOut = typename EpilogueFusionParams::ElementOut;
-  using ElementScalar = typename EpilogueFusionParams::ElementScalar;
-  using ElementCompute = typename EpilogueFusionParams::ElementCompute;
-  using ElementBias = typename EpilogueFusionParams::TensorBias::value_type;
-  using ActivationFunctor = typename EpilogueFusionParams::ActivationFunctor;
-  // Input related converter
-  NumericConverter<ElementCompute, ElementAcc> acc_converter;
-  NumericConverter<ElementCompute, ElementC> residual_converter;
-  NumericConverter<ElementCompute, ElementBias> bias_converter;
-  // Scale related converter
-  NumericConverter<ElementCompute, ElementScalar> scale_converter;
-  // Output related converter
-  NumericConverter<ElementOut, ElementCompute> output_converter;
-  EpilogueFusionParams& epi_fusion_params_;
-  TensorA const& tensor_a_;
-  TensorB const& tensor_b_;
-  TensorC const& tensor_c_;
-  TensorD& tensor_d_;
-  ShapePadding const& padding_;
-  StrideTraversal const& tstride_;
-  ShapeDilation const& dilation_;
-  // Epilogue activation operation
-  ActivationFunctor epi_activation;
-  ConvReferenceImpl(  // Binds all arguments to reference members; nothing is copied, so the referenced objects must outlive this instance
-    TensorA const& tensor_a,
-    TensorB const& tensor_b,
-    TensorC const& tensor_c,
-    TensorD& tensor_d,
-    ShapePadding const& padding,
-    StrideTraversal const& tstride,
-    ShapeDilation const& dilation,
-    EpilogueFusionParams& epi_fusion_params)
-  : tensor_a_(tensor_a),
-    tensor_b_(tensor_b),
-    tensor_c_(tensor_c),
-    tensor_d_(tensor_d),
-    padding_(padding),
-    tstride_(tstride),
-    dilation_(dilation),
-    epi_fusion_params_(epi_fusion_params)  // listed last here but declared before the tensor members, so it is initialized first; harmless because each initializer only binds a reference
-  {
-    static_assert(rank(ShapePadding{}) == rank(ShapeDilation{}));  // padding and dilation must have the same rank
-    static_assert(rank(ShapePadding{}) == rank(StrideTraversal{}));  // padding and traversal stride must have the same rank
-  }
-  void compute_reference() {  // Entry point: selects the fprop/dgrad/wgrad overload for ConvOp and NumSpatialDims at compile time
-    if constexpr (ConvOp == cutlass::conv::Operator::kFprop) {
-      fprop_reference(cute::Int<NumSpatialDims>{});
-    }
-    else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) {
-      dgrad_reference(cute::Int<NumSpatialDims>{});
-    }
-    else {  // any ConvOp other than kFprop or kDgrad is handled as wgrad
-      wgrad_reference(cute::Int<NumSpatialDims>{});
-    }
-  }
-private:
-  // Specialization for 1D fprop kernel: for each (g, n, q, k) of tensor_d_, accumulates tensor_a_(c, w, n, g) * tensor_b_(c, s, k, g) over (s, c), then applies the epilogue
-  void fprop_reference(cute::Int<1> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<3>(tensor_d_);
-    int32_t N = size<2>(tensor_d_);
-    int32_t Q = size<1>(tensor_d_);
-    int32_t K = size<0>(tensor_d_);
-    int32_t S = size<1>(tensor_b_);
-    int32_t C = size<0>(tensor_b_);
-#if defined(_OPENMP)
-  #pragma omp parallel for collapse(2)
-#endif
-    for (int32_t g = 0; g < G; ++g) {
-      for (int32_t n = 0; n < N; ++n) {
-        for (int32_t q = 0; q < Q; ++q) {
-          for (int32_t k = 0; k < K; ++k) {
-            auto accumulator = ElementAcc(0);
-            for (int32_t s = 0; s < S; ++s) {
-              for (int32_t c = 0; c < C; ++c) {
-                int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);  // coordinate in tensor_a_ read by filter tap s for output position q
-                if (detail::is_activation_in_bounds(tensor_a_, n, w, c, g)) {  // coordinates outside tensor_a_ contribute nothing to the sum
-                  auto a = tensor_a_(c, w, n, g);
-                  auto b = tensor_b_(c, s, k, g);
-                  accumulator += ElementAcc(a * b);
-                }
-              }
-            }
-            ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?  // per-k alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-              epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-            ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?  // same selection rule for beta
-              epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-            if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-              output += scale_converter(beta) * residual_converter(tensor_c_(k, q, n, g));
-            }
-            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-              output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-            }
-            output = epi_activation(output);
-            if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-              output += scale_converter(beta) * residual_converter(tensor_c_(k, q, n, g));
-            }
-            tensor_d_(k, q, n, g) = output_converter(output);
-          }
-        }
-      }
-    }
-  }
-  // Specialization for 2D fprop kernel: for each (g, n, p, q, k) of tensor_d_, accumulates tensor_a_(c, w, h, n, g) * tensor_b_(c, s, r, k, g) over (r, s, c), then applies the epilogue
-  void fprop_reference(cute::Int<2> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<4>(tensor_d_);
-    int32_t N = size<3>(tensor_d_);
-    int32_t P = size<2>(tensor_d_);
-    int32_t Q = size<1>(tensor_d_);
-    int32_t K = size<0>(tensor_d_);
-    int32_t R = size<2>(tensor_b_);
-    int32_t S = size<1>(tensor_b_);
-    int32_t C = size<0>(tensor_b_);
-#if defined(_OPENMP)
-    #pragma omp parallel for collapse(3)
-#endif
-    for (int32_t g = 0; g < G; ++g) {
-      for (int32_t n = 0; n < N; ++n) {
-        for (int32_t p = 0; p < P; ++p) {
-          for (int32_t q = 0; q < Q; ++q) {
-            for (int32_t k = 0; k < K; ++k) {
-              auto accumulator = ElementAcc(0);
-              for (int32_t r = 0; r < R; ++r) {
-                for (int32_t s = 0; s < S; ++s) {
-                  for (int32_t c = 0; c < C; ++c) {
-                    int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);  // coordinate in mode 1 of tensor_a_ for output q and filter tap s
-                    int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);  // coordinate in mode 2 of tensor_a_ for output p and filter tap r
-                    if (detail::is_activation_in_bounds(tensor_a_, n, h, w, c, g)) {  // coordinates outside tensor_a_ contribute nothing to the sum
-                      auto a = tensor_a_(c, w, h, n, g);
-                      auto b = tensor_b_(c, s, r, k, g);
-                      accumulator += ElementAcc(a * b);
-                    }
-                  }
-                }
-              }
-              ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?  // per-k alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-                epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-              ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?  // same selection rule for beta
-                epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-              if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-                output += scale_converter(beta) * residual_converter(tensor_c_(k, q, p, n, g));
-              }
-              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-                output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-              }
-              output = epi_activation(output);
-              if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-                output += scale_converter(beta) * residual_converter(tensor_c_(k, q, p, n, g));
-              }
-              tensor_d_(k, q, p, n, g) = output_converter(output);
-            }
-          }
-        }
-      }
-    }
-  }
-  // Specialization for 3D fprop kernel: for each (g, n, z, p, q, k) of tensor_d_, accumulates tensor_a_(c, w, h, d, n, g) * tensor_b_(c, s, r, t, k, g) over (t, r, s, c), then applies the epilogue
-  void fprop_reference(cute::Int<3> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<5>(tensor_d_);
-    int32_t N = size<4>(tensor_d_);
-    int32_t Z = size<3>(tensor_d_);
-    int32_t P = size<2>(tensor_d_);
-    int32_t Q = size<1>(tensor_d_);
-    int32_t K = size<0>(tensor_d_);
-    int32_t T = size<3>(tensor_b_);
-    int32_t R = size<2>(tensor_b_);
-    int32_t S = size<1>(tensor_b_);
-    int32_t C = size<0>(tensor_b_);
-#if defined(_OPENMP)
-    #pragma omp parallel for collapse(3)
-#endif
-    for (int32_t g = 0; g < G; ++g) {
-      for (int32_t n = 0; n < N; ++n) {
-        for (int32_t z = 0; z < Z; ++z) {
-          for (int32_t p = 0; p < P; ++p) {
-            for (int32_t q = 0; q < Q; ++q) {
-              for (int32_t k = 0; k < K; ++k) {
-                auto accumulator = ElementAcc(0);
-                for (int32_t t = 0; t < T; ++t) {
-                  for (int32_t r = 0; r < R; ++r) {
-                    for (int32_t s = 0; s < S; ++s) {
-                      for (int32_t c = 0; c < C; ++c) {
-                        int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);  // coordinate in mode 1 of tensor_a_ for output q and filter tap s
-                        int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);  // coordinate in mode 2 of tensor_a_ for output p and filter tap r
-                        int32_t d =  z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_);  // coordinate in mode 3 of tensor_a_ for output z and filter tap t
-                        if (detail::is_activation_in_bounds(tensor_a_, n, d, h, w, c, g)) {  // coordinates outside tensor_a_ contribute nothing to the sum
-                          auto a = tensor_a_(c, w, h, d, n, g);
-                          auto b = tensor_b_(c, s, r, t, k, g);
-                          accumulator += ElementAcc(a * b);
-                        }
-                      }
-                    }
-                  }
-                }
-                ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?  // per-k alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-                  epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-                ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?  // same selection rule for beta
-                  epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-                ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-                if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-                  output += scale_converter(beta) * residual_converter(tensor_c_(k, q, p, z, n, g));
-                }
-                if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-                  output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-                }
-                output = epi_activation(output);
-                if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-                  output += scale_converter(beta) * residual_converter(tensor_c_(k, q, p, z, n, g));
-                }
-                tensor_d_(k, q, p, z, n, g) = output_converter(output);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  // Specialization for 1D dgrad kernel: for each (g, n, w, c) of tensor_d_, accumulates tensor_a_(k, q, n, g) * tensor_b_(c, s, k, g) over (k, s), then applies the epilogue
-  void dgrad_reference(cute::Int<1> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<3>(tensor_d_);
-    int32_t N = size<2>(tensor_d_);
-    int32_t W = size<1>(tensor_d_);
-    int32_t C = size<0>(tensor_d_);
-    int32_t K = size<2>(tensor_b_);
-    int32_t S = size<1>(tensor_b_);
-#if defined(_OPENMP)
-   #pragma omp parallel for collapse(2)
-#endif
-    for (int32_t g = 0; g < G; ++g) {
-      for (int32_t n = 0; n < N; ++n) {
-        for (int32_t w = 0; w < W; ++w) {
-          for (int32_t c = 0; c < C; ++c) {
-            auto accumulator = ElementAcc(0);
-            for (int32_t k = 0; k < K; ++k) {
-              for (int32_t s = 0; s < S; ++s) {
-                int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);  // candidate position, still multiplied by the traversal stride
-                if (q % cute::get<0>(tstride_) == 0) {  // only exact multiples of the stride correspond to an index of tensor_a_
-                  q /= cute::get<0>(tstride_);
-                } else {
-                  continue;
-                }
-                if (detail::is_activation_in_bounds(tensor_a_, n, q, k, g)) {  // coordinates outside tensor_a_ contribute nothing to the sum
-                  accumulator += ElementAcc(tensor_a_(k, q, n, g) * tensor_b_(c, s, k, g));
-                }
-              }
-            }
-            ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())  // per-c alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-              ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-            ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())  // same selection rule for beta
-              ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-            if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-              output += scale_converter(beta) * residual_converter(tensor_c_(c, w, n, g));
-            }
-            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-              output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-            }
-            output = epi_activation(output);
-            if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-              output += scale_converter(beta) * residual_converter(tensor_c_(c, w, n, g));
-            }
-            tensor_d_(c, w, n, g) = output_converter(output);
-          }
-        }
-      }
-    }
-  }
-  // Specialization for 2D dgrad kernel: for each (g, n, h, w, c) of tensor_d_, accumulates tensor_a_(k, q, p, n, g) * tensor_b_(c, s, r, k, g) over (k, r, s), then applies the epilogue
-  void dgrad_reference(cute::Int<2> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<4>(tensor_d_);
-    int32_t N = size<3>(tensor_d_);
-    int32_t H = size<2>(tensor_d_);
-    int32_t W = size<1>(tensor_d_);
-    int32_t C = size<0>(tensor_d_);
-    int32_t K = size<3>(tensor_b_);
-    int32_t R = size<2>(tensor_b_);
-    int32_t S = size<1>(tensor_b_);
-#if defined(_OPENMP)
-    #pragma omp parallel for collapse(3)
-#endif
-    for (int32_t g = 0; g < G; ++g) {
-      for (int32_t n = 0; n < N; ++n) {
-        for (int32_t h = 0; h < H; ++h) {
-          for (int32_t w = 0; w < W; ++w) {
-            for (int32_t c = 0; c < C; ++c) {
-              auto accumulator = ElementAcc(0);
-              for (int32_t k = 0; k < K; ++k) {
-                for (int32_t r = 0; r < R; ++r) {
-                  for (int32_t s = 0; s < S; ++s) {
-                    int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);  // candidate position, still multiplied by the traversal stride
-                    int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_);  // same for the second spatial mode
-                    if (q % cute::get<0>(tstride_) == 0) {  // only exact multiples of the stride correspond to an index of tensor_a_
-                      q /= cute::get<0>(tstride_);
-                    } else {
-                      continue;
-                    }
-                    if (p % cute::get<1>(tstride_) == 0) {
-                      p /= cute::get<1>(tstride_);
-                    } else {
-                      continue;
-                    }
-                    if (detail::is_activation_in_bounds(tensor_a_, n, p, q, k, g)) {  // coordinates outside tensor_a_ contribute nothing to the sum
-                      accumulator += ElementAcc(tensor_a_(k, q, p, n, g) * tensor_b_(c, s, r, k, g));
-                    }
-                  }
-                }
-              }
-              ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())  // per-c alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-                ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-              ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())  // same selection rule for beta
-                ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-              if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-                output += scale_converter(beta) * residual_converter(tensor_c_(c, w, h, n, g));
-              }
-              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-                output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-              }
-              output = epi_activation(output);
-              if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-                output += scale_converter(beta) * residual_converter(tensor_c_(c, w, h, n, g));
-              }
-              tensor_d_(c, w, h, n, g) = output_converter(output);
-            }
-          }
-        }
-      }
-    }
-  }
-  // Specialization for 3D dgrad kernel: for each (g, n, d, h, w, c) of tensor_d_, accumulates tensor_a_(k, q, p, z, n, g) * tensor_b_(c, s, r, t, k, g) over (k, t, r, s), then applies the epilogue
-  void dgrad_reference(cute::Int<3> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<5>(tensor_d_);
-    int32_t N = size<4>(tensor_d_);
-    int32_t D = size<3>(tensor_d_);
-    int32_t H = size<2>(tensor_d_);
-    int32_t W = size<1>(tensor_d_);
-    int32_t C = size<0>(tensor_d_);
-    int32_t K = size<4>(tensor_b_);
-    int32_t T = size<3>(tensor_b_);
-    int32_t R = size<2>(tensor_b_);
-    int32_t S = size<1>(tensor_b_);
-#if defined(_OPENMP)
-    #pragma omp parallel for collapse(3)
-#endif
-    for (int32_t g = 0; g < G; ++g) {
-      for (int32_t n = 0; n < N; ++n) {
-        for (int32_t d = 0; d < D; ++d) {
-          for (int32_t h = 0; h < H; ++h) {
-            for (int32_t w = 0; w < W; ++w) {
-              for (int32_t c = 0; c < C; ++c) {
-                auto accumulator = ElementAcc(0);
-                for (int32_t k = 0; k < K; ++k) {
-                  for (int32_t t = 0; t < T; ++t) {
-                    for (int32_t r = 0; r < R; ++r) {
-                      for (int32_t s = 0; s < S; ++s) {
-                        int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);  // candidate position, still multiplied by the traversal stride
-                        int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_);  // same for the second spatial mode
-                        int32_t z = d + cute::get<2>(padding_) - t * cute::get<2>(dilation_);  // same for the third spatial mode
-                        if (q % cute::get<0>(tstride_) == 0) {  // only exact multiples of the stride correspond to an index of tensor_a_
-                          q /= cute::get<0>(tstride_);
-                        } else {
-                          continue;
-                        }
-                        if (p % cute::get<1>(tstride_) == 0) {
-                          p /= cute::get<1>(tstride_);
-                        } else {
-                          continue;
-                        }
-                        if (z % cute::get<2>(tstride_) == 0) {
-                          z /= cute::get<2>(tstride_);
-                        } else {
-                          continue;
-                        }
-                        if (detail::is_activation_in_bounds(tensor_a_, n, z, p, q, k, g)) {  // coordinates outside tensor_a_ contribute nothing to the sum
-                          accumulator += ElementAcc(tensor_a_(k, q, p, z, n, g) * tensor_b_(c, s, r, t, k, g));
-                        }
-                      }
-                    }
-                  }
-                }
-                ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())  // per-c alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-                  ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-                ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())  // same selection rule for beta
-                  ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-                ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-                if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-                  output += scale_converter(beta) * residual_converter(tensor_c_(c, w, h, d, n, g));
-                }
-                if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-                  output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-                }
-                output = epi_activation(output);
-                if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-                  output += scale_converter(beta) * residual_converter(tensor_c_(c, w, h, d, n, g));
-                }
-                tensor_d_(c, w, h, d, n, g) = output_converter(output);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  // Specialization for 1D wgrad kernel: for each (g, k, s, c) of tensor_d_, accumulates tensor_b_(c, w, n, g) * tensor_a_(k, q, n, g) over (n, q), then applies the epilogue
-  void wgrad_reference(cute::Int<1> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<3>(tensor_d_);
-    int32_t N =
-        size<2>(tensor_a_);
-    int32_t Q =
-        size<1>(tensor_a_);
-    int32_t K =
-        size<0>(tensor_a_);
-    int32_t S = size<1>(tensor_d_);
-    int32_t C = size<0>(tensor_d_);
-#if defined(_OPENMP)
-    #pragma omp parallel for collapse(2)
-#endif
-    for (int32_t g = 0; g < G; ++g) {
-      for (int32_t k = 0; k < K; ++k) {
-        for (int32_t s = 0; s < S; ++s) {
-          for (int32_t c = 0; c < C; ++c) {
-            auto accumulator = ElementAcc(0);
-            for (int32_t n = 0; n < N; ++n) {
-              for (int32_t q = 0; q < Q; ++q) {
-                int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);  // coordinate in tensor_b_ paired with position q of tensor_a_ and filter tap s
-                bool is_in_bounds =
-                    detail::is_activation_in_bounds(tensor_b_, n, w, c, g);  // in this overload the bounds check is applied to tensor_b_
-                if (is_in_bounds) {
-                  auto act =
-                      tensor_b_(c, w, n, g);
-                  auto xformed_act =
-                      tensor_a_(k, q, n, g);
-                  accumulator += ElementAcc(act * xformed_act);
-                }
-              }
-            }
-            ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?  // per-c alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-              epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-            ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?  // same selection rule for beta
-              epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-            if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-              output += scale_converter(beta) * residual_converter(tensor_c_(c, s, k, g));
-            }
-            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-              output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-            }
-            output = epi_activation(output);
-            if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-              output += scale_converter(beta) * residual_converter(tensor_c_(c, s, k, g));
-            }
-            tensor_d_(c, s, k, g) = output_converter(output);
-          }
-        }
-      }
-    }
-  }
-  // Specialization for 2D wgrad kernel: for each (g, k, r, s, c) of tensor_d_, accumulates tensor_b_(c, w, h, n, g) * tensor_a_(k, q, p, n, g) over (n, p, q), then applies the epilogue
-  void wgrad_reference(cute::Int<2> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<4>(tensor_d_);
-    int32_t N =
-        size<3>(tensor_a_);
-    int32_t P =
-        size<2>(tensor_a_);
-    int32_t Q =
-        size<1>(tensor_a_);
-    int32_t K =
-        size<0>(tensor_a_);
-    int32_t R = size<2>(tensor_d_);
-    int32_t S = size<1>(tensor_d_);
-    int32_t C = size<0>(tensor_d_);
-#if defined(_OPENMP)
-    #pragma omp parallel for collapse(3)
-#endif
-    for (int32_t g = 0; g < G; ++g) {
-      for (int32_t k = 0; k < K; ++k) {
-        for (int32_t r = 0; r < R; ++r) {
-          for (int32_t s = 0; s < S; ++s) {
-            for (int32_t c = 0; c < C; ++c) {
-              auto accumulator = ElementAcc(0);
-              for (int32_t n = 0; n < N; ++n) {
-                for (int32_t p = 0; p < P; ++p) {
-                  for (int32_t q = 0; q < Q; ++q) {
-                    int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);  // coordinate in mode 1 of tensor_b_ paired with position q and filter tap s
-                    int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);  // coordinate in mode 2 of tensor_b_ paired with position p and filter tap r
-                    bool is_in_bounds =
-                        detail::is_activation_in_bounds(tensor_b_, n, h, w, c, g);  // in this overload the bounds check is applied to tensor_b_
-                    if (is_in_bounds) {
-                      auto act =
-                          tensor_b_(c, w, h, n, g);
-                      auto xformed_act =
-                          tensor_a_(k, q, p, n, g);
-                      accumulator += ElementAcc(act * xformed_act);
-                    }
-                  }
-                }
-              }
-              ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?  // per-c alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-                epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-              ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?  // same selection rule for beta
-                epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-              if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-                output += scale_converter(beta) * residual_converter(tensor_c_(c, s, r, k, g));
-              }
-              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-                output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-              }
-              output = epi_activation(output);
-              if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-                output += scale_converter(beta) * residual_converter(tensor_c_(c, s, r, k, g));
-              }
-              tensor_d_(c, s, r, k, g) = output_converter(output);
-            }
-          }
-        }
-      }
-    }
-  }
-  // Specialization for 3D wgrad kernel: for each (g, k, t, r, s, c) of tensor_d_, accumulates tensor_b_(c, w, h, d, n, g) * tensor_a_(k, q, p, z, n, g) over (n, z, p, q), then applies the epilogue
-  void wgrad_reference(cute::Int<3> spatial_dims) {  // the tag argument only selects this overload; its value is unused
-    int32_t G = size<5>(tensor_d_);
-    int32_t N =
-        size<4>(tensor_a_);
-    int32_t Z =
-        size<3>(tensor_a_);
-    int32_t P =
-        size<2>(tensor_a_);
-    int32_t Q =
-        size<1>(tensor_a_);
-    int32_t K =
-        size<0>(tensor_a_);
-    int32_t T = size<3>(tensor_d_);
-    int32_t R = size<2>(tensor_d_);
-    int32_t S = size<1>(tensor_d_);
-    int32_t C = size<0>(tensor_d_);
-#if defined(_OPENMP)
-    #pragma omp parallel for collapse(3)
-#endif
-    for (int32_t g = 0 ; g < G; ++g) {
-      for (int32_t k = 0; k < K; ++k) {
-        for (int32_t t = 0; t < T; ++t) {
-          for (int32_t r = 0; r < R; ++r) {
-            for (int32_t s = 0; s < S; ++s) {
-              for (int32_t c = 0; c < C; ++c) {
-                auto accumulator = ElementAcc(0);
-                for (int32_t n = 0; n < N; ++n) {
-                  for (int32_t z = 0; z < Z; ++z) {
-                    for (int32_t p = 0; p < P; ++p) {
-                      for (int32_t q = 0; q < Q; ++q) {
-                        int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);  // coordinate in mode 1 of tensor_b_ paired with position q and filter tap s
-                        int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);  // coordinate in mode 2 of tensor_b_ paired with position p and filter tap r
-                        int32_t d =  z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_);  // coordinate in mode 3 of tensor_b_ paired with position z and filter tap t
-                        bool is_in_bounds =
-                            detail::is_activation_in_bounds(tensor_b_, n, d, h, w, c, g);  // in this overload the bounds check is applied to tensor_b_
-                        if (is_in_bounds) {
-                          auto act =
-                              tensor_b_(c, w, h, d, n, g);
-                          auto xformed_act =
-                              tensor_a_(k, q, p, z, n, g);
-                          accumulator += ElementAcc(act * xformed_act);
-                        }
-                      }
-                    }
-                  }
-                }
-                ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?  // per-c alpha when tensor_alpha has a non-null data pointer, else the scalar alpha
-                  epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-                ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?  // same selection rule for beta
-                  epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-                ElementCompute output = scale_converter(alpha) * acc_converter(accumulator);
-                if (not EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == false: scaled source is added before bias and activation
-                  output += scale_converter(beta) * residual_converter(tensor_c_(c, s, r, t, k, g));
-                }
-                if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {  // bias is optional: added only when its data pointer is non-null
-                  output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-                }
-                output = epi_activation(output);
-                if (EpilogueFusionParams::ResidualAdd) {  // ResidualAdd == true: scaled source is added after the activation
-                  output += scale_converter(beta) * residual_converter(tensor_c_(c, s, r, t, k, g));
-                }
-                tensor_d_(c, s, r, t, k, g) = output_converter(output);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-} // cutlass::reference::host
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/convolution.h DELETED Viewed

@@ -1,802 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for convolution in host-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/functional.h"
-#include "cutlass/layout/tensor.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/tensor_ref.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/conv/convolution.h"
-#include "cutlass/conv/conv2d_problem_size.h"
-#include "cutlass/conv/conv3d_problem_size.h"
-#include <iostream>
-namespace cutlass {
-namespace reference {
-namespace host {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Forward propagation
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host reference for the forward pass of a 2-D convolution: y = conv2d(x, w).
-///
-/// Each output element (n, p, q, k) accumulates, in ElementAccumulator, the products of
-/// activations and filter taps over (r, s) and over the C / groups channels of the group
-/// that output channel k belongs to. The stored value is
-/// convert_op(alpha * acc + beta * y_in); tensor_y_in is read only when beta is non-zero.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ElementD = ElementC,
-  typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-void Conv2dFprop(
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_x,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_y_in,
-  TensorRef<ElementD, LayoutC> tensor_y_out,
-  ElementCompute alpha,
-  ElementCompute beta) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  // In kConvolution mode the activation offset is derived from the mirrored tap index,
-  // while the filter tensor itself is still indexed with the unmirrored (r, s).
-  bool const flip_filter = (problem_size.mode == cutlass::conv::Mode::kConvolution);
-  for (int n = 0; n < problem_size.N; ++n) {
-    for (int p = 0; p < problem_size.P; ++p) {
-      for (int q = 0; q < problem_size.Q; ++q) {
-        for (int k = 0; k < problem_size.K; ++k) {
-          int const group_idx = k / (problem_size.K / problem_size.groups);
-          int const channels_per_group = problem_size.C / problem_size.groups;
-          ElementAccumulator acc = ElementAccumulator();
-          for (int r = 0; r < problem_size.R; ++r) {
-            for (int s = 0; s < problem_size.S; ++s) {
-              for (int c = 0; c < channels_per_group; ++c) {
-                int const tap_r = flip_filter ? (problem_size.R - 1 - r) : r;
-                int const tap_s = flip_filter ? (problem_size.S - 1 - s) : s;
-                int const h = p * problem_size.stride_h - problem_size.pad_h + tap_r * problem_size.dilation_h;
-                int const w = q * problem_size.stride_w - problem_size.pad_w + tap_s * problem_size.dilation_w;
-                // Taps that fall outside the H x W activation extent contribute nothing.
-                if (h < 0 || h >= problem_size.H || w < 0 || w >= problem_size.W) {
-                  continue;
-                }
-                ElementA activation = tensor_x.at({n, h, w, c + group_idx * channels_per_group});
-                ElementB weight = tensor_w.at({k, r, s, c});
-                acc = inner_product_op(ElementAccumulator(activation), ElementAccumulator(weight), acc);
-              }
-            }
-          }
-          // Epilogue: the source tensor is left untouched when beta is zero.
-          ElementC source = ElementC();
-          if (beta != ElementCompute()) {
-            source = tensor_y_in.at(cutlass::make_Coord(n, p, q, k));
-          }
-          tensor_y_out.at(cutlass::make_Coord(n, p, q, k)) =
-              convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(source));
-        }
-      }
-    }
-  }
-}
-/// Host reference for a depthwise-separable forward convolution.
-///
-/// Every channel g is processed independently: output element (n, p, q, g) accumulates
-/// tensor_A(n, h, w, g) * tensor_B(g, r, s, 0) over the filter taps (r, s). Unless mode is
-/// kCrossCorrelation the filter is read mirrored in both spatial dimensions. The stored
-/// value is convert_op(alpha * acc + beta * C).
-///
-/// The iteration space comes from the tensor extents: N, P, Q and the channel count from
-/// tensor_C, the filter height/width from tensor_B, and the activation bounds from tensor_A.
-/// padding[0] is applied along H and padding[2] along W; the other two entries are not read.
-///
-/// tensor_C is read only when beta is non-zero (as Conv2dFprop does), so an uninitialised
-/// or NaN-filled source tensor cannot leak into the result when beta is zero.
-///
-/// NOTE(review): conv_stride and dilation default to default-constructed coordinates,
-/// presumably all zeros; callers most likely need to pass explicit values. Confirm.
-template <typename ElementA,
-          typename LayoutA,
-          typename ElementB,
-          typename LayoutB,
-          typename ElementC,
-          typename LayoutC,
-          typename ElementCompute,
-          typename ElementAccumulator = ElementCompute,
-          typename ElementD = ElementC,
-          typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
-          typename InnerProductOp = multiply_add<ElementAccumulator>>
-void Depsep_Fprop(cutlass::TensorView<ElementA, LayoutA> tensor_A,
-                  cutlass::TensorView<ElementB, LayoutB> tensor_B,
-                  cutlass::TensorView<ElementC, LayoutC> tensor_C,
-                  cutlass::TensorView<ElementD, LayoutC> tensor_D,
-                  ElementCompute alpha,
-                  ElementCompute beta,
-                  cutlass::Tensor4DCoord padding = cutlass::Tensor4DCoord(),
-                  cutlass::Coord<2> conv_stride = cutlass::Coord<2>(),
-                  cutlass::Coord<2> dilation = cutlass::Coord<2>(),
-                  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  // Apply MMA and accumulate ElementAccumulator
-  for (int n = 0; n < tensor_C.extent().n(); ++n) {
-    for (int p = 0; p < tensor_C.extent().h(); ++p) {
-      for (int q = 0; q < tensor_C.extent().w(); ++q) {
-        for (int g = 0; g < tensor_C.extent().c(); ++g) {
-          ElementAccumulator acc = ElementAccumulator();
-          for (int r = 0; r < tensor_B.extent().h(); ++r) {
-            for (int s = 0; s < tensor_B.extent().w(); ++s) {
-              // input activation H and W
-              int h = p * conv_stride[0] - padding[0] + r * dilation[0];
-              int w = q * conv_stride[1] - padding[2] + s * dilation[1];
-              if (h < tensor_A.extent().h() && h >= 0 && w < tensor_A.extent().w() && w >= 0) {
-                ElementA a = tensor_A.at(cutlass::make_Coord(n, h, w, g));
-                ElementB b = (mode == cutlass::conv::Mode::kCrossCorrelation)
-                                   ? tensor_B.at(cutlass::make_Coord(g, r, s, 0))
-                                   : tensor_B.at(cutlass::make_Coord(
-                                         g, tensor_B.extent().h() - r - 1, tensor_B.extent().w() - s - 1, 0));
-                acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
-              }
-            }
-          }
-          // Apply Epilogue, compute ElementCompute, convert and store ElementC.
-          // Skip the source read when beta is zero so its contents cannot affect the output.
-          ElementC c_ref = ElementC();
-          if (beta != ElementCompute()) {
-            c_ref = tensor_C.at(cutlass::make_Coord(n, p, q, g));
-          }
-          tensor_D.at(cutlass::make_Coord(n, p, q, g)) =
-              convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
-        }
-      }
-    }
-  }
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Dgrad / Deconv
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host reference for the data gradient of a 2-D convolution: dx = dgrad(dy, w).
-///
-/// For every input-gradient element (n, h, w, c) the contributions of all filter taps
-/// (r, s) and all K output channels are accumulated in ElementAccumulator. A tap
-/// contributes only when the output position it maps back to is non-negative, an exact
-/// multiple of the stride, and inside the P x Q output extent.
-///
-/// When is_deconv is true the filter is indexed as (c, r, s, k) instead of (k, r, s, c).
-/// The stored value is convert_op(alpha * acc + beta * dx_in); tensor_dx_in is read only
-/// when beta is non-zero. problem_size.groups is not consulted by this routine.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ElementD = ElementC,
-  typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-void Conv2dDgrad(
-  cutlass::conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_dx_in,
-  TensorRef<ElementD, LayoutC> tensor_dx_out,
-  ElementCompute alpha,
-  ElementCompute beta,
-  bool is_deconv = false) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  // Apply MMA and accumulate ElementAccumulator
-  for (int n = 0; n < problem_size.N; ++n) {
-    for (int h = 0; h < problem_size.H; ++h) {
-      for (int w = 0; w < problem_size.W; ++w) {
-        for (int c = 0; c < problem_size.C; ++c) {
-          ElementAccumulator acc = ElementAccumulator();
-          for (int r = 0; r < problem_size.R; ++r) {
-            for (int s = 0; s < problem_size.S; ++s) {
-              for (int k = 0; k < problem_size.K; ++k) {
-                // The mirrored tap index is used only to locate the output position;
-                // the filter tensor is indexed with the unmirrored (r, s).
-                int filter_r = r;
-                int filter_s = s;
-                if (problem_size.mode == cutlass::conv::Mode::kConvolution) {
-                  filter_r = problem_size.R - 1 - r;
-                  filter_s = problem_size.S - 1 - s;
-                }
-                int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h;
-                int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w;
-                // Only positions that sit exactly on the stride grid map to an output element.
-                if (p >= 0 && (p % problem_size.stride_h) == 0 &&
-                    q >= 0 && (q % problem_size.stride_w) == 0) {
-                  p = p / problem_size.stride_h;
-                  q = q / problem_size.stride_w;
-                  if (p < problem_size.P && q < problem_size.Q) {
-                    ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k));
-                    ElementB b = is_deconv ? tensor_w.at(cutlass::make_Coord(c, r, s, k))
-                        : tensor_w.at(cutlass::make_Coord(k, r, s, c));
-                    acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc);
-                  }
-                }
-              } // for (K)
-            } // for (S)
-          } // for (R)
-          // Apply Epilogue, compute ElementCompute, convert and store ElementC
-          ElementC c_ref = ElementC();
-          if (beta != ElementCompute()) {
-            c_ref = tensor_dx_in.at(cutlass::make_Coord(n, h, w, c));
-          }
-          tensor_dx_out.at(cutlass::make_Coord(n, h, w, c)) =
-              convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref));
-        } // for (C)
-      } // for (W)
-    } // for (H)
-  } // for (N)
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Wgrad
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host reference for the weight gradient of a 2-D convolution: dw = wgrad(dy, x).
-///
-/// For every filter element (k, r, s, c) the products dy(n, p, q, k) * x(n, h, w, c) are
-/// accumulated over all output positions (n, p, q) whose corresponding activation
-/// coordinate lies inside the H x W extent. The stored value is
-/// convert_op(alpha * acc + beta * dw_in); tensor_dw_in is read only when beta is non-zero.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ElementD = ElementC,
-  typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-void Conv2dWgrad(
-  cutlass::conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_x,
-  TensorRef<ElementC, LayoutC> tensor_dw_in,
-  TensorRef<ElementD, LayoutC> tensor_dw_out,
-  ElementCompute alpha,
-  ElementCompute beta) {
-  InnerProductOp inner_product_op;
-  ConvertOp convert_op;
-  // In kConvolution mode the activation is located through the mirrored tap index.
-  bool const flip_filter = (problem_size.mode == cutlass::conv::Mode::kConvolution);
-  for (int k = 0; k < problem_size.K; ++k) {
-    for (int r = 0; r < problem_size.R; ++r) {
-      for (int s = 0; s < problem_size.S; ++s) {
-        for (int c = 0; c < problem_size.C; ++c) {
-          ElementAccumulator acc = ElementAccumulator();
-          for (int n = 0; n < problem_size.N; ++n) {
-            for (int p = 0; p < problem_size.P; ++p) {
-              for (int q = 0; q < problem_size.Q; ++q) {
-                int const tap_r = flip_filter ? (problem_size.R - 1 - r) : r;
-                int const tap_s = flip_filter ? (problem_size.S - 1 - s) : s;
-                cutlass::Tensor4DCoord x_coord = make_Coord(
-                    n,
-                    p * problem_size.stride_h - problem_size.pad_h + tap_r * problem_size.dilation_h,
-                    q * problem_size.stride_w - problem_size.pad_w + tap_s * problem_size.dilation_w,
-                    c);
-                // Output positions whose activation falls outside H x W are skipped.
-                if (x_coord.h() >= problem_size.H || x_coord.h() < 0 ||
-                    x_coord.w() >= problem_size.W || x_coord.w() < 0) {
-                  continue;
-                }
-                ElementAccumulator grad_out = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, p, q, k)));
-                ElementAccumulator activation = ElementAccumulator(tensor_x.at(x_coord));
-                acc = inner_product_op(grad_out, activation, acc);
-              }
-            }
-          }
-          // Epilogue: the source tensor is left untouched when beta is zero.
-          ElementC source = ElementC();
-          if (beta != ElementCompute()) {
-            source = tensor_dw_in.at(cutlass::make_Coord(k, r, s, c));
-          }
-          tensor_dw_out.at(cutlass::make_Coord(k, r, s, c)) =
-              convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(source));
-        } // for (C)
-      } // for (S)
-    } // for (R)
-  } // for (K)
-}
-/// Runs the 2-D host reference convolution selected by convolutional_operator.
-///
-/// kFprop forwards to Conv2dFprop and kWgrad to Conv2dWgrad. Both kDgrad and kDeconv
-/// forward to Conv2dDgrad, with its is_deconv flag set only for kDeconv. Any other
-/// operator value does nothing.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ElementD = ElementC,
-  typename ConvertOp = NumericConverter<ElementD, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-void Conv2d(
-  conv::Operator convolutional_operator,
-  conv::Conv2dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_A,
-  TensorRef<ElementB, LayoutB> tensor_B,
-  TensorRef<ElementC, LayoutC> tensor_C,
-  TensorRef<ElementD, LayoutC> tensor_D,
-  ElementCompute alpha,
-  ElementCompute beta) {
-  bool const is_deconv = (convolutional_operator == conv::Operator::kDeconv);
-  if (convolutional_operator == conv::Operator::kFprop) {
-    Conv2dFprop<
-      ElementA, LayoutA,
-      ElementB, LayoutB,
-      ElementC, LayoutC,
-      ElementCompute,
-      ElementAccumulator,
-      ElementD,
-      ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
-    return;
-  }
-  if (is_deconv || convolutional_operator == conv::Operator::kDgrad) {
-    Conv2dDgrad<
-      ElementA, LayoutA,
-      ElementB, LayoutB,
-      ElementC, LayoutC,
-      ElementCompute,
-      ElementAccumulator,
-      ElementD,
-      ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, is_deconv);
-    return;
-  }
-  if (convolutional_operator == conv::Operator::kWgrad) {
-    Conv2dWgrad<
-      ElementA, LayoutA,
-      ElementB, LayoutB,
-      ElementC, LayoutC,
-      ElementCompute,
-      ElementAccumulator,
-      ElementD,
-      ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
-  }
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// 3D convolution
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host reference for the forward pass of a 3-D convolution: y = conv3d(x, w).
-///
-/// Each output element (n, z, p, q, k) accumulates, in ElementAccumulator, the products of
-/// activations and filter taps over (t, r, s) and all C input channels. The stored value
-/// is convert_op(alpha * acc + beta * y_in); tensor_y_in is read only when beta is non-zero.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-void Conv3dFprop(
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_x,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_y_in,
-  TensorRef<ElementC, LayoutC> tensor_y_out,
-  ElementCompute alpha,
-  ElementCompute beta) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  // In kConvolution mode the activation offset is derived from the mirrored tap index,
-  // while the filter tensor itself is still indexed with the unmirrored (t, r, s).
-  bool const flip_filter = (problem_size.mode == cutlass::conv::Mode::kConvolution);
-  for (int n = 0; n < problem_size.N; ++n) {
-    for (int z = 0; z < problem_size.Z; ++z) {
-      for (int p = 0; p < problem_size.P; ++p) {
-        for (int q = 0; q < problem_size.Q; ++q) {
-          for (int k = 0; k < problem_size.K; ++k) {
-            ElementAccumulator acc = ElementAccumulator();
-            for (int t = 0; t < problem_size.T; ++t) {
-              for (int r = 0; r < problem_size.R; ++r) {
-                for (int s = 0; s < problem_size.S; ++s) {
-                  for (int c = 0; c < problem_size.C; ++c) {
-                    int const tap_t = flip_filter ? (problem_size.T - 1 - t) : t;
-                    int const tap_r = flip_filter ? (problem_size.R - 1 - r) : r;
-                    int const tap_s = flip_filter ? (problem_size.S - 1 - s) : s;
-                    int const d = z * problem_size.stride_d - problem_size.pad_d + tap_t * problem_size.dilation_d;
-                    int const h = p * problem_size.stride_h - problem_size.pad_h + tap_r * problem_size.dilation_h;
-                    int const w = q * problem_size.stride_w - problem_size.pad_w + tap_s * problem_size.dilation_w;
-                    // Taps that fall outside the D x H x W activation extent contribute nothing.
-                    if (d < 0 || d >= problem_size.D ||
-                        h < 0 || h >= problem_size.H ||
-                        w < 0 || w >= problem_size.W) {
-                      continue;
-                    }
-                    ElementA activation = tensor_x.at({n, d, h, w, c});
-                    ElementB weight = tensor_w.at({k, t, r, s, c});
-                    acc = inner_product_op(ElementAccumulator(activation), ElementAccumulator(weight), acc);
-                  }
-                }
-              }
-            }
-            // Epilogue: the source tensor is left untouched when beta is zero.
-            ElementC source = ElementC();
-            if (beta != ElementCompute()) {
-              source = tensor_y_in.at(cutlass::make_Coord(n, z, p, q, k));
-            }
-            tensor_y_out.at(cutlass::make_Coord(n, z, p, q, k)) =
-                convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(source));
-          }
-        }
-      }
-    }
-  }
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Dgrad / Deconv
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host reference for the data gradient of a 3-D convolution: dx = dgrad(dy, w).
-///
-/// For every input-gradient element (n, d, h, w, c) the contributions of all filter taps
-/// (t, r, s) and all K output channels are accumulated in ElementAccumulator. A tap
-/// contributes only when the output position it maps back to is non-negative, an exact
-/// multiple of the stride, and inside the Z x P x Q output extent.
-///
-/// When is_deconv is true the filter is indexed as (c, t, r, s, k) instead of
-/// (k, t, r, s, c). The stored value is convert_op(alpha * acc + beta * dx_in);
-/// tensor_dx_in is read only when beta is non-zero.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-void Conv3dDgrad(
-  cutlass::conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_w,
-  TensorRef<ElementC, LayoutC> tensor_dx_in,
-  TensorRef<ElementC, LayoutC> tensor_dx_out,
-  ElementCompute alpha,
-  ElementCompute beta,
-  bool is_deconv = false) {
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  // The mirrored tap index (kConvolution mode) is used only to locate the output
-  // position; the filter tensor is indexed with the unmirrored (t, r, s).
-  bool const flip_filter = (problem_size.mode == cutlass::conv::Mode::kConvolution);
-  for (int n = 0; n < problem_size.N; ++n) {
-    for (int d = 0; d < problem_size.D; ++d) {
-      for (int h = 0; h < problem_size.H; ++h) {
-        for (int w = 0; w < problem_size.W; ++w) {
-          for (int c = 0; c < problem_size.C; ++c) {
-            ElementAccumulator acc = ElementAccumulator();
-            for (int t = 0; t < problem_size.T; ++t) {
-              for (int r = 0; r < problem_size.R; ++r) {
-                for (int s = 0; s < problem_size.S; ++s) {
-                  for (int k = 0; k < problem_size.K; ++k) {
-                    int const tap_t = flip_filter ? (problem_size.T - 1 - t) : t;
-                    int const tap_r = flip_filter ? (problem_size.R - 1 - r) : r;
-                    int const tap_s = flip_filter ? (problem_size.S - 1 - s) : s;
-                    int const z_scaled = d + problem_size.pad_d - tap_t * problem_size.dilation_d;
-                    int const p_scaled = h + problem_size.pad_h - tap_r * problem_size.dilation_h;
-                    int const q_scaled = w + problem_size.pad_w - tap_s * problem_size.dilation_w;
-                    // Only positions that sit exactly on the stride grid map to an output element.
-                    bool const on_stride_grid =
-                        z_scaled >= 0 && (z_scaled % problem_size.stride_d) == 0 &&
-                        p_scaled >= 0 && (p_scaled % problem_size.stride_h) == 0 &&
-                        q_scaled >= 0 && (q_scaled % problem_size.stride_w) == 0;
-                    if (!on_stride_grid) {
-                      continue;
-                    }
-                    int const z = z_scaled / problem_size.stride_d;
-                    int const p = p_scaled / problem_size.stride_h;
-                    int const q = q_scaled / problem_size.stride_w;
-                    if (z >= problem_size.Z || p >= problem_size.P || q >= problem_size.Q) {
-                      continue;
-                    }
-                    ElementA grad_out = tensor_dy.at(cutlass::make_Coord(n, z, p, q, k));
-                    ElementB weight = is_deconv ? tensor_w.at(cutlass::make_Coord(c, t, r, s, k))
-                        : tensor_w.at(cutlass::make_Coord(k, t, r, s, c));
-                    acc = inner_product_op(ElementAccumulator(grad_out), ElementAccumulator(weight), acc);
-                  } // for (K)
-                } // for (S)
-              } // for (R)
-            } // for (T)
-            // Epilogue: the source tensor is left untouched when beta is zero.
-            ElementC source = ElementC();
-            if (beta != ElementCompute()) {
-              source = tensor_dx_in.at(cutlass::make_Coord(n, d, h, w, c));
-            }
-            tensor_dx_out.at(cutlass::make_Coord(n, d, h, w, c)) =
-                convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(source));
-          } // for (C)
-        } // for (W)
-      } // for (H)
-    } // for (D)
-  } // for (N)
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Wgrad
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host reference for the weight gradient of a 3-D convolution: dw = wgrad(dy, x).
-///
-/// For every filter element (k, t, r, s, c) the products dy(n, z, p, q, k) * x(n, d, h, w, c)
-/// are accumulated over all output positions (n, z, p, q) whose corresponding activation
-/// coordinate lies inside the D x H x W extent. The stored value is
-/// convert_op(alpha * acc + beta * dw_in); tensor_dw_in is read only when beta is non-zero.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-void Conv3dWgrad(
-  cutlass::conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_dy,
-  TensorRef<ElementB, LayoutB> tensor_x,
-  TensorRef<ElementC, LayoutC> tensor_dw_in,
-  TensorRef<ElementC, LayoutC> tensor_dw_out,
-  ElementCompute alpha,
-  ElementCompute beta) {
-  InnerProductOp inner_product_op;
-  ConvertOp convert_op;
-  // In kConvolution mode the activation is located through the mirrored tap index.
-  bool const flip_filter = (problem_size.mode == cutlass::conv::Mode::kConvolution);
-  for (int k = 0; k < problem_size.K; ++k) {
-    for (int t = 0; t < problem_size.T; ++t) {
-      for (int r = 0; r < problem_size.R; ++r) {
-        for (int s = 0; s < problem_size.S; ++s) {
-          for (int c = 0; c < problem_size.C; ++c) {
-            ElementAccumulator acc = ElementAccumulator();
-            for (int n = 0; n < problem_size.N; ++n) {
-              for (int z = 0; z < problem_size.Z; ++z) {
-                for (int p = 0; p < problem_size.P; ++p) {
-                  for (int q = 0; q < problem_size.Q; ++q) {
-                    int const tap_t = flip_filter ? (problem_size.T - 1 - t) : t;
-                    int const tap_r = flip_filter ? (problem_size.R - 1 - r) : r;
-                    int const tap_s = flip_filter ? (problem_size.S - 1 - s) : s;
-                    Tensor5DCoord x_coord = make_Coord(
-                        n,
-                        z * problem_size.stride_d - problem_size.pad_d + tap_t * problem_size.dilation_d,
-                        p * problem_size.stride_h - problem_size.pad_h + tap_r * problem_size.dilation_h,
-                        q * problem_size.stride_w - problem_size.pad_w + tap_s * problem_size.dilation_w,
-                        c);
-                    // Output positions whose activation falls outside D x H x W are skipped.
-                    if (x_coord.d() >= problem_size.D || x_coord.d() < 0 ||
-                        x_coord.h() >= problem_size.H || x_coord.h() < 0 ||
-                        x_coord.w() >= problem_size.W || x_coord.w() < 0) {
-                      continue;
-                    }
-                    ElementAccumulator grad_out = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, z, p, q, k)));
-                    ElementAccumulator activation = ElementAccumulator(tensor_x.at(x_coord));
-                    acc = inner_product_op(grad_out, activation, acc);
-                  }
-                }
-              }
-            }
-            // Epilogue: the source tensor is left untouched when beta is zero.
-            ElementC source = ElementC();
-            if (beta != ElementCompute()) {
-              source = tensor_dw_in.at(cutlass::make_Coord(k, t, r, s, c));
-            }
-            tensor_dw_out.at(cutlass::make_Coord(k, t, r, s, c)) =
-                convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(source));
-          } // for (C)
-        } // for (S)
-      } // for (R)
-    } // for (T)
-  } // for (K)
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/// Runs the 3-D host reference convolution selected by convolutional_operator.
-///
-/// kFprop forwards to Conv3dFprop and kWgrad to Conv3dWgrad. Both kDgrad and kDeconv
-/// forward to Conv3dDgrad, with its is_deconv flag set only for kDeconv. Any other
-/// operator value does nothing.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementCompute,
-  typename ElementAccumulator = ElementCompute,
-  typename ConvertOp = NumericConverter<ElementC, ElementCompute>,
-  typename InnerProductOp = multiply_add<ElementAccumulator>
->
-void Conv3d(
-  conv::Operator convolutional_operator,
-  conv::Conv3dProblemSize problem_size,
-  TensorRef<ElementA, LayoutA> tensor_A,
-  TensorRef<ElementB, LayoutB> tensor_B,
-  TensorRef<ElementC, LayoutC> tensor_C,
-  TensorRef<ElementC, LayoutC> tensor_D,
-  ElementCompute alpha,
-  ElementCompute beta) {
-  bool const is_deconv = (convolutional_operator == conv::Operator::kDeconv);
-  if (convolutional_operator == conv::Operator::kFprop) {
-    Conv3dFprop<
-      ElementA, LayoutA,
-      ElementB, LayoutB,
-      ElementC, LayoutC,
-      ElementCompute,
-      ElementAccumulator,
-      ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
-    return;
-  }
-  if (is_deconv || convolutional_operator == conv::Operator::kDgrad) {
-    Conv3dDgrad<
-      ElementA, LayoutA,
-      ElementB, LayoutB,
-      ElementC, LayoutC,
-      ElementCompute,
-      ElementAccumulator,
-      ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, is_deconv);
-    return;
-  }
-  if (convolutional_operator == conv::Operator::kWgrad) {
-    Conv3dWgrad<
-      ElementA, LayoutA,
-      ElementB, LayoutB,
-      ElementC, LayoutC,
-      ElementCompute,
-      ElementAccumulator,
-      ConvertOp, InnerProductOp
-    >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta);
-  }
-}
-/////////////////////////////////////////////////////////////////////////////////////////////////
-}  // namespace host
-}  // namespace reference
-}  // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/error_metrics.h DELETED Viewed

@@ -1,66 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-#pragma once
-#include <cmath>
-#include "cutlass/cutlass.h"
-#include "cutlass/complex.h"
-#include "cutlass/util/reference/host/tensor_reduce.h"
-#include "cutlass/core_io.h"
-namespace cutlass  {
-namespace reference {
-namespace host {
-/// Relative error of view_A_computed with respect to view_B_reference, computed as
-/// TensorNormDiff(computed, reference) / TensorNorm(reference). `identity` is forwarded
-/// unchanged to both reductions.
-/// NOTE(review): the reference norm is used as a divisor without a zero check; presumably
-/// callers supply a non-zero reference tensor. Confirm.
-template <
-  typename Element,
-  typename Layout,
-  typename ComputeType = double
->
-ComputeType TensorRelativeErrorMetric(
-  TensorView<Element, Layout> view_A_computed,
-  TensorView<Element, Layout> view_B_reference,
-  ComputeType identity = ComputeType()
-) {
-  // `auto` keeps whatever type the reductions return, so the division is unchanged.
-  auto const difference_norm =
-    cutlass::reference::host::TensorNormDiff(view_A_computed, view_B_reference, identity);
-  auto const reference_norm =
-    cutlass::reference::host::TensorNorm(view_B_reference, identity);
-  return difference_norm / reference_norm;
-}
-///////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace host
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm.h DELETED Viewed

@@ -1,531 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for GEMM in host-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-#include "cutlass/arch/mma.h"
-#include "cutlass/util/host_tensor.h"
-namespace cutlass {
-namespace reference {
-namespace host {
-template<typename Out, typename In>  // Primary template: converts by constructing Out directly from the input value.
-struct CastIfScalar {
-  static Out cast(In in) {
-    return Out(in);
-  }
-};
-template<typename OutScalar, typename In>  // Specialization for a cutlass::complex<OutScalar> destination with any other source type In.
-struct CastIfScalar<cutlass::complex<OutScalar>, In> {
-  typedef cutlass::complex<OutScalar> Out;
-  static Out cast(In in) {
-    return Out(static_cast<OutScalar>(in));  // convert to the scalar type first, then build the complex from that one scalar (imaginary part presumably the constructor default - confirm in cutlass/complex.h)
-  }
-};
-template<typename OutScalar, typename InScalar>  // Specialization for complex-to-complex conversion.
-struct CastIfScalar<cutlass::complex<OutScalar>, cutlass::complex<InScalar>> {
-  typedef cutlass::complex<OutScalar> Out;
-  typedef cutlass::complex<InScalar> In;
-  static Out cast(In in) {
-    return Out(in);  // constructs Out from the whole complex value, without the static_cast to OutScalar used for non-complex inputs
-  }
-};
-template<typename Out, typename In>  // Function wrapper over CastIfScalar: callers name only Out, In is deduced from the argument.
-Out cast_if_scalar(In in) {
-  return CastIfScalar<Out, In>::cast(in);
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host reference GEMM on rank-2 TensorRef objects: for every (row, col) inside M x N it stores
-/// tensor_d = convert_op(alpha * ScalarType(accum) + beta * ScalarType(tensor_c)), where accum starts at initial_accum and folds in A(row, k), B(k, col) for k in [0, K) through InnerProductOp.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename InnerProductOp = multiply_add<ComputeType>,
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>
->
-void compute_gemm(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementC, LayoutC> tensor_d,
-  ComputeType initial_accum) {
-  static_assert(
-    LayoutA::kRank == 2 &&
-    LayoutB::kRank == 2 &&
-    LayoutC::kRank == 2, "Tensors must be of rank 2");
-  // Note: only m(), n() and k() of problem_size are read; this function has no batch dimension.
-  int const M = problem_size.m();
-  int const N = problem_size.n();
-  int const K = problem_size.k();
-  // The output is processed in Mblock x Nblock tiles, each with a local accumulator array; this blocking speeds up the reference implementation.
-  int const Mblock = 16;
-  int const Nblock = 16;
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  for (int row_block = 0; row_block < M; row_block += Mblock) {
-    for (int col_block = 0; col_block < N; col_block += Nblock) {
-      ComputeType accum[Mblock][Nblock];
-      for (int j = 0; j < Nblock; j++) {
-        for (int i = 0; i < Mblock; i++) {
-          accum[i][j] = initial_accum;
-        }
-      }
-      for (int k_block = 0; k_block < K; ++k_block) {  // despite the name, k advances one element at a time
-        for (int j = 0; j < Nblock; j++) {
-          for (int i = 0; i < Mblock; i++) {
-            int row = row_block + i;
-            int col = col_block + j;
-            if (row < M && col < N) {  // skip tile entries that fall outside the matrix on edge tiles
-              ElementA a = tensor_a.at(MatrixCoord(row, k_block));
-              ElementB b = tensor_b.at(MatrixCoord(k_block, col));
-              ComputeType compute_a(cast_if_scalar<ComputeType>(a));
-              ComputeType compute_b(cast_if_scalar<ComputeType>(b));
-              accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]);
-            }
-          }
-        }
-      }
-      for (int j = 0; j < Nblock; j++) {  // epilogue for this tile: scale, add the beta * C term, convert, store to D
-        for (int i = 0; i < Mblock; i++) {
-          int row = row_block + i;
-          int col = col_block + j;
-          MatrixCoord coord = MatrixCoord(row, col);
-          if (row < M && col < N) {
-            tensor_d.at(coord) = convert_op(
-              alpha * ScalarType(accum[i][j]) +
-              beta * ScalarType(tensor_c.at(coord)));
-          }
-        }
-      }
-    }
-  }
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Overload without a separate destination tensor: forwards to the compute_gemm overload that takes tensor_d,
-/// passing tensor_c as both the source C and the destination D, so tensor_c is overwritten with the result.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename InnerProductOp = multiply_add<ComputeType>,
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>
->
-void compute_gemm(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  ComputeType initial_accum) {
-  compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-               ScalarType, ComputeType, InnerProductOp, ConvertOp>(
-      problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c,
-      initial_accum);
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename InnerProductOp = cutlass::arch::OpMultiplyAdd  // tag type that selects one of the partial specializations
->
-struct Gemm;  // declared only; the call operators are provided by the partial specializations on InnerProductOp
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for arch::OpMultiplyAdd: both call operators forward to compute_gemm with multiply_add<ComputeType> and the default ConvertOp.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename ComputeType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
-            ComputeType, arch::OpMultiplyAdd> {
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  ComputeType initial_accum = ComputeType(0)) {  // in-place form: no tensor_d, the result overwrites tensor_c
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, multiply_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
-  }
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  ComputeType initial_accum = ComputeType(0)) {  // separate-destination form: reads tensor_c, writes tensor_d
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, multiply_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for arch::OpMultiplyAddFastBF16: on the host it forwards to compute_gemm with multiply_add<ComputeType>, exactly as the OpMultiplyAdd specialization does.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename ComputeType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
-            ComputeType, arch::OpMultiplyAddFastBF16> {
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  ComputeType initial_accum = ComputeType(0)) {  // in-place form: no tensor_d, the result overwrites tensor_c
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, multiply_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
-  }
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  ComputeType initial_accum = ComputeType(0)) {  // separate-destination form: reads tensor_c, writes tensor_d
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, multiply_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for arch::OpMultiplyAddSaturate: same multiply_add<ComputeType> inner product, but the result is converted with NumericConverterClamp<ElementC, ScalarType> instead of the default NumericConverter.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename ComputeType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
-            ComputeType, arch::OpMultiplyAddSaturate> {
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  ComputeType initial_accum = ComputeType(0)) {  // in-place form: no tensor_d, the result overwrites tensor_c
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, multiply_add<ComputeType>,
-                 NumericConverterClamp<ElementC, ScalarType>>(  // presumably clamps to the representable range of ElementC - confirm in cutlass/numeric_conversion.h
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
-  }
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  ComputeType initial_accum = ComputeType(0)) {  // separate-destination form: reads tensor_c, writes tensor_d
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, multiply_add<ComputeType>,
-                 NumericConverterClamp<ElementC, ScalarType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for arch::OpXorPopc: both call operators forward to compute_gemm with xor_popc_add<ComputeType> as the inner-product functor.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename ComputeType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
-            ComputeType, arch::OpXorPopc> {
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  ComputeType initial_accum = ComputeType(0)) {  // in-place form: no tensor_d, the result overwrites tensor_c
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, xor_popc_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
-  }
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  ComputeType initial_accum = ComputeType(0)) {  // separate-destination form: reads tensor_c, writes tensor_d
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, xor_popc_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-/// Partial specialization for arch::OpAndPopc: both call operators forward to compute_gemm with and_popc_add<ComputeType> as the inner-product functor.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename ComputeType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
-            ComputeType, arch::OpAndPopc> {
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  ComputeType initial_accum = ComputeType(0)) {  // in-place form: no tensor_d, the result overwrites tensor_c
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, and_popc_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
-  }
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  ComputeType initial_accum = ComputeType(0)) {  // separate-destination form: reads tensor_c, writes tensor_d
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, and_popc_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Partial specialization for arch::OpMultiplyAddFastF32: on the host it forwards to compute_gemm with multiply_add<ComputeType>, exactly as the OpMultiplyAdd specialization does.
-template <typename ElementA, typename LayoutA, typename ElementB,
-          typename LayoutB, typename ElementC, typename LayoutC,
-          typename ScalarType, typename ComputeType>
-struct Gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
-            ComputeType, arch::OpMultiplyAddFastF32> {
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  ComputeType initial_accum = ComputeType(0)) {  // in-place form: no tensor_d, the result overwrites tensor_c
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, multiply_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
-  }
-  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
-                  TensorRef<ElementA, LayoutA> tensor_a,
-                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
-                  TensorRef<ElementC, LayoutC> tensor_c,
-                  TensorRef<ElementC, LayoutC> tensor_d,
-                  ComputeType initial_accum = ComputeType(0)) {  // separate-destination form: reads tensor_c, writes tensor_d
-    static_assert(
-        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
-        "Tensors must be of rank 2");
-    compute_gemm<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
-                 ScalarType, ComputeType, multiply_add<ComputeType>>(
-        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
-  }
-};
-////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Batched GEMM
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Computes a batch of GEMMs over a set of matrices of common dimension: one host Gemm per batch index, advancing the A, B and C iterators together.
-// The Gemm functor is instantiated with TensorRefCollectionC::Element as both its scalar and its compute type, so alpha, beta and initial_accum are converted to that type at the call.
-// TensorRefCollection* is a type satisfying the TensorRefCollection concept.
-// NOTE(review): tensor_c is walked with a ConstIterator although each GEMM writes its result through the dereferenced TensorRef - confirm this is intended.
-template <
-  typename TensorRefCollectionA,
-  typename TensorRefCollectionB,
-  typename TensorRefCollectionC,
-  typename ScalarType,
-  typename AccumulatorType
->
-void BatchedGemm(
-  gemm::GemmCoord problem_size,
-  int batch_count,
-  ScalarType alpha,
-  TensorRefCollectionA const& tensor_a,
-  TensorRefCollectionB const& tensor_b,
-  ScalarType beta,
-  TensorRefCollectionC &tensor_c,
-  AccumulatorType initial_accum) {
-  typename TensorRefCollectionA::ConstIterator tensor_a_it = tensor_a.begin();
-  typename TensorRefCollectionB::ConstIterator tensor_b_it = tensor_b.begin();
-  typename TensorRefCollectionC::ConstIterator tensor_c_it = tensor_c.begin();
-  for (int batch = 0;
-    batch < batch_count;  // the loop is bounded by batch_count only; the iterators are never compared against end()
-    ++batch, ++tensor_a_it, ++tensor_b_it, ++tensor_c_it) {
-    Gemm<typename TensorRefCollectionA::Element,
-         typename TensorRefCollectionA::Layout,
-         typename TensorRefCollectionB::Element,
-         typename TensorRefCollectionB::Layout,
-         typename TensorRefCollectionC::Element,
-         typename TensorRefCollectionC::Layout,
-         typename TensorRefCollectionC::Element,
-         typename TensorRefCollectionC::Element>
-        gemm;  // InnerProductOp is left at its default, arch::OpMultiplyAdd
-    gemm(problem_size, alpha, *tensor_a_it, *tensor_b_it, beta, *tensor_c_it,
-         initial_accum);  // in-place call operator: the C entry of this batch is overwritten with the result
-  }
-}
-/// Computes a batch of GEMMs by forwarding to the BatchedGemm overload that takes initial_accum,
-/// passing ScalarType(0) as the initial accumulator value.
-// NOTE(review): AccumulatorType appears in no function parameter, so it cannot be deduced and callers would have to name the template arguments explicitly - confirm.
-// TensorRefCollection* is a type satisfying the TensorRefCollection concept.
-//
-template <
-  typename TensorRefCollectionA,
-  typename TensorRefCollectionB,
-  typename TensorRefCollectionC,
-  typename ScalarType,
-  typename AccumulatorType
->
-void BatchedGemm(
-  gemm::GemmCoord problem_size,
-  int batch_count,
-  ScalarType alpha,
-  TensorRefCollectionA const& tensor_a,
-  TensorRefCollectionB const& tensor_b,
-  ScalarType beta,
-  TensorRefCollectionC &tensor_c) {
-  BatchedGemm(problem_size, batch_count, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0));
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace host
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm_complex.h DELETED Viewed

@@ -1,210 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for complex-valued GEMM in host-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/matrix_coord.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-namespace cutlass {
-namespace reference {
-namespace host {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Computes an optionally batched general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
-/// objects, conjugating the A and/or B operand when its ComplexTransform argument is kConjugate.
-///
-/// Explicitly naming types needed by this template can be cumbersome, particularly for the
-/// accumulator type (ComputeType), so a function argument 'initial_accum' is exposed. Passing
-/// ComputeType(0) as the initial_accum argument can be easier than naming all template
-/// arguments explicitly.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ComputeType,
-  typename ElementD = ElementC,
-  typename ConvertOp = NumericConverter<ElementD, ScalarType>,
-  typename InnerProductOp = multiply_add<ComputeType>
->
-void GemmComplex(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementD, LayoutC> tensor_d,
-  ComputeType initial_accum,
-  int batch_count = 1,
-  int64_t batch_stride_A = 0,
-  int64_t batch_stride_B = 0,
-  int64_t batch_stride_C = 0,
-  int64_t batch_stride_D = 0) {
-  static_assert(
-    LayoutA::kRank == 2 &&
-    LayoutB::kRank == 2 &&
-    LayoutC::kRank == 2, "Tensors must be of rank 2");
-  // Note: only m(), n() and k() of problem_size are read; batching is driven by batch_count and the batch_stride_* arguments.
-  int const M = problem_size.m();
-  int const N = problem_size.n();
-  int const K = problem_size.k();
-  // The output is processed in Mblock x Nblock tiles, each with a local accumulator array; this blocking speeds up the reference implementation.
-  int const Mblock = 16;
-  int const Nblock = 16;
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
-    // Compute this batch's matrix product one output tile at a time
-    for (int row_block = 0; row_block < M; row_block += Mblock) {
-      for (int col_block = 0; col_block < N; col_block += Nblock) {
-        ComputeType accum[Mblock][Nblock];
-        for (int j = 0; j < Nblock; j++) {
-          for (int i = 0; i < Mblock; i++) {
-            accum[i][j] = initial_accum;
-          }
-        }
-        for (int k_block = 0; k_block < K; ++k_block) {  // despite the name, k advances one element at a time
-          for (int j = 0; j < Nblock; j++) {
-            for (int i = 0; i < Mblock; i++) {
-              int row = row_block + i;
-              int col = col_block + j;
-              if (row < M && col < N) {  // skip tile entries that fall outside the matrix on edge tiles
-                ElementA a = tensor_a.at(MatrixCoord(row, k_block));
-                ElementB b = tensor_b.at(MatrixCoord(k_block, col));
-                ComputeType a_ik = ComputeType(a);
-                ComputeType b_kj = ComputeType(b);
-                if (transform_a == ComplexTransform::kConjugate) {  // any other transform value leaves the operand unchanged
-                  a_ik = conj(a_ik);
-                }
-                if (transform_b == ComplexTransform::kConjugate) {
-                  b_kj = conj(b_kj);
-                }
-                accum[i][j] = inner_product_op(a_ik, b_kj,  accum[i][j]);
-              }
-            }
-          }
-        }
-        for (int j = 0; j < Nblock; j++) {  // epilogue for this tile: scale, add the beta * C term, convert, store to D
-          for (int i = 0; i < Mblock; i++) {
-            int row = row_block + i;
-            int col = col_block + j;
-            MatrixCoord coord = MatrixCoord(row, col);
-            if (row < M && col < N) {
-              tensor_d.at(coord) = convert_op(
-                alpha * ScalarType(accum[i][j]) +
-                beta * ScalarType(tensor_c.at(coord)));
-            }
-          }
-        }
-      } // for (col_block)
-    } // for (row_block)
-    tensor_a.add_pointer_offset(batch_stride_A);  // the TensorRefs are by-value parameters, so stepping them to the next batch only moves these local copies
-    tensor_b.add_pointer_offset(batch_stride_B);
-    tensor_c.add_pointer_offset(batch_stride_C);
-    tensor_d.add_pointer_offset(batch_stride_D);
-  } // for (batch_idx)
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
-/// objects, with optional conjugation of A and B selected by transform_a and transform_b.
-///
-/// This assumes the accumulator type is the same type as the scalars: it forwards to the GemmComplex overload taking initial_accum with ScalarType(0), leaving that overload's batch arguments at their defaults (batch_count = 1, zero strides).
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,
-  typename ElementD = ElementC
->
-void GemmComplex(
-  gemm::GemmCoord problem_size,
-  ScalarType alpha,
-  TensorRef<ElementA, LayoutA> tensor_a,
-  ComplexTransform transform_a,
-  TensorRef<ElementB, LayoutB> tensor_b,
-  ComplexTransform transform_b,
-  ScalarType beta,
-  TensorRef<ElementC, LayoutC> tensor_c,
-  TensorRef<ElementD, LayoutC> tensor_d) {
-  GemmComplex(problem_size, alpha, tensor_a, transform_a, tensor_b, transform_b, beta, tensor_c, tensor_d, ScalarType(0));
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace host
-} // namespace reference
-} // namespace cutlass

build/torch211-cxx11-cu126-aarch64-linux/include/third-party/cutlass/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h DELETED Viewed

@@ -1,228 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-/*! \file
-    \brief Reference implementation for planar complex-valued GEMM in host-side code.
-*/
-#pragma once
-#include "cutlass/coord.h"
-#include "cutlass/complex.h"
-#include "cutlass/numeric_types.h"
-#include "cutlass/functional.h"
-#include "cutlass/numeric_conversion.h"
-#include "cutlass/tensor_ref_planar_complex.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/gemm/gemm.h"
-namespace cutlass {
-namespace reference {
-namespace host {
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Host reference GEMM on planar complex tensors. For every (row, col) inside the M-by-N problem
-/// it folds inner_product_op(a, b, accum) over k in [0, K), starting from 'initial_accum', and
-/// stores alpha * accum + beta * C(row, col) into D(row, col) after convert_op on each component.
-///
-/// Explicitly naming types needed by this template can be cumbersome, particularly for the
-/// compute type, so a function argument 'initial_accum' is exposed. Passing a complex<ComputeType>
-/// value as the last function argument lets ComputeType be deduced instead of named explicitly.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType,  ///< component type of alpha and beta
-  typename ComputeType,  ///< component type of the accumulators
-  typename ConvertOp = NumericConverter<ElementC, ScalarType>,  ///< applied to real and imag parts
-  typename InnerProductOp = multiply_add<complex<ComputeType>>  ///< invoked as op(a, b, accum)
->
-void GemmPlanarComplex(
-  gemm::GemmCoord problem_size,  ///< supplies M, N and K through m(), n() and k()
-  complex<ScalarType> alpha,  ///< multiplies the accumulated product
-  TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,  ///< read at (row, k)
-  ComplexTransform transform_a,  ///< kConjugate conjugates each A element before use
-  TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,  ///< read at (k, col)
-  ComplexTransform transform_b,  ///< kConjugate conjugates each B element before use
-  complex<ScalarType> beta,  ///< multiplies the value read from tensor_c
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,  ///< source operand, read at (row, col)
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_d,  ///< destination, written at (row, col)
-  complex<ComputeType> initial_accum) {  ///< starting value of every accumulator
-  static_assert(
-    LayoutA::kRank == 2 &&
-    LayoutB::kRank == 2 &&
-    LayoutC::kRank == 2, "Tensors must be of rank 2");
-  using ComplexA = typename TensorRefPlanarComplex<ElementA, LayoutA>::ComplexElement;
-  using ComplexB = typename TensorRefPlanarComplex<ElementB, LayoutB>::ComplexElement;
-  using ComplexC = typename TensorRefPlanarComplex<ElementC, LayoutC>::ComplexElement;
-  // Note: batch is ignored; only m(), n() and k() of problem_size are read below.
-  int const M = problem_size.m();
-  int const N = problem_size.n();
-  int const K = problem_size.k();
-  // Output is computed in Mblock-by-Nblock tiles; blocking speeds up the reference implementation
-  int const Mblock = 16;
-  int const Nblock = 16;
-  ConvertOp convert_op;
-  InnerProductOp inner_product_op;
-  for (int row_block = 0; row_block < M; row_block += Mblock) {
-    for (int col_block = 0; col_block < N; col_block += Nblock) {
-      complex<ComputeType> accum[Mblock][Nblock];  // one accumulator per element of the tile
-      for (int j = 0; j < Nblock; j++) {
-        for (int i = 0; i < Mblock; i++) {
-          accum[i][j] = initial_accum;
-        }
-      }
-      for (int k_block = 0; k_block < K; ++k_block) {  // advances one k index per iteration
-        for (int j = 0; j < Nblock; j++) {
-          for (int i = 0; i < Mblock; i++) {
-            int row = row_block + i;
-            int col = col_block + j;
-            if (row < M && col < N) {  // tile entries past the matrix edge are skipped
-              ComplexA a_ik = tensor_a.at(MatrixCoord(row, k_block));
-              ComplexB b_kj = tensor_b.at(MatrixCoord(k_block, col));
-              complex<ComputeType> a = complex<ComputeType>{  // A element converted to ComputeType
-                ComputeType(a_ik.real()),
-                ComputeType(a_ik.imag())
-              };
-              complex<ComputeType> b = complex<ComputeType>{  // B element converted to ComputeType
-                ComputeType(b_kj.real()),
-                ComputeType(b_kj.imag())
-              };
-              if (transform_a == ComplexTransform::kConjugate) {
-                a = conj(a);
-              }
-              if (transform_b == ComplexTransform::kConjugate) {
-                b = conj(b);
-              }
-              accum[i][j] = inner_product_op(a, b,  accum[i][j]);  // presumably a * b + accum for multiply_add - confirm
-            }
-          }
-        }
-      }
-      for (int j = 0; j < Nblock; j++) {  // epilogue: scale, add the source and store the tile
-        for (int i = 0; i < Mblock; i++) {
-          int row = row_block + i;
-          int col = col_block + j;
-          MatrixCoord coord = MatrixCoord(row, col);
-          if (row < M && col < N) {
-            complex<ScalarType> acc{  // accumulator converted to ScalarType components
-              ScalarType(accum[i][j].real()),
-              ScalarType(accum[i][j].imag())
-            };
-            ComplexC d_ij = tensor_c.at(coord);  // first holds the C value, later the result
-            complex<ScalarType> src{
-              ScalarType(d_ij.real()),
-              ScalarType(d_ij.imag())
-            };
-            complex<ScalarType> result = alpha * acc + beta * src;
-            d_ij.real() = convert_op(result.real());  // components are converted separately
-            d_ij.imag() = convert_op(result.imag());
-            tensor_d.at(coord) = d_ij;
-          }
-        }
-      }
-    }
-  }
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Overload of GemmPlanarComplex that takes no 'initial_accum': it forwards every argument and
-/// passes a default-constructed complex<ScalarType> as the trailing initial accumulator.
-///
-/// The accumulator (compute) type is therefore the same type as the scalars, ScalarType.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ScalarType
->
-void GemmPlanarComplex(
-  gemm::GemmCoord problem_size,  ///< forwarded unchanged
-  complex<ScalarType> alpha,  ///< forwarded unchanged
-  TensorRefPlanarComplex<ElementA, LayoutA> tensor_a,  ///< forwarded unchanged
-  ComplexTransform transform_a,  ///< forwarded unchanged
-  TensorRefPlanarComplex<ElementB, LayoutB> tensor_b,  ///< forwarded unchanged
-  ComplexTransform transform_b,  ///< forwarded unchanged
-  complex<ScalarType> beta,  ///< forwarded unchanged
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_c,  ///< forwarded unchanged
-  TensorRefPlanarComplex<ElementC, LayoutC> tensor_d) {  ///< forwarded unchanged
-  GemmPlanarComplex(
-    problem_size,
-    alpha,
-    tensor_a, transform_a,
-    tensor_b, transform_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    complex<ScalarType>());  // fixes ComputeType to ScalarType in the forwarded call
-}
-////////////////////////////////////////////////////////////////////////////////////////////////////
-} // namespace host
-} // namespace reference
-} // namespace cutlass
-////////////////////////////////////////////////////////////////////////////////////////////////////