Kernels:

kernels-community
/

deep-gemm

Trusted publisher

Kernel card Files Files and versions

xet

Community

kernels-bot committed 2 days ago

Commit

abc8e91

verified ·

1 Parent(s): ba53917

Uploaded using `kernel-builder` (batch 32/32).

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/op/conv.py +997 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/op/gemm.py +725 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/op/gemm_grouped.py +269 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/op/op.py +431 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/shape.py +184 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/swizzle.py +65 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/__init__.py +41 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/check.py +262 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/datatypes.py +362 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/lazy_import.py +41 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/profiler.py +196 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/__init__.py +63 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/conv2d_operation.py +621 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/conv3d_operation.py +482 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/conv3x_emitter.py +250 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/emit_kernel_listing.py +868 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/gemm_operation.py +1613 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/generator.py +0 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/heuristics.py +415 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/heuristics_provider.py +175 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/library.py +1531 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/manifest.py +868 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/rank_2k_operation.py +438 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/rank_k_operation.py +427 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/sm100_shapes.py +342 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/sm100_utils.py +661 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/sm90_shapes.py +212 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/sm90_utils.py +753 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/symm_operation.py +440 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/trmm_operation.py +447 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/docs_src/source/conf.py +132 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/__init__.py +36 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/int_tuple.py +225 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/layout.py +367 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/swizzle.py +129 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/typing.py +42 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/setup_cutlass.py +74 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/setup_library.py +46 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/setup_pycute.py +46 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/conv2d/conv2d_problem_sizes.py +661 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/conv2d/conv2d_sm80.py +146 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/conv2d/conv2d_test_utils.py +428 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/conv2d/run_all_tests.py +44 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/emit/pytorch.py +309 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_compute_sm80_90.py +198 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_layout_sm80_90.py +173 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_load_sm80_90.py +142 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_mixed_sm80_90.py +319 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_store_sm80_90.py +180 -0
build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/run_all_tests.py +44 -0

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/op/conv.py ADDED Viewed

	@@ -0,0 +1,997 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+    Ease-of-use interface for constructing, compiling, and running CONVs
+    The ``Conv2d`` interface is meant to allow one to easily instantiate, compile, and run
+    CONV2D operations in CUTLASS via Python, without specifying many configuration parameters.
+    Under the hood, the interface will select sensible default parameters for the many template
+    parameters for CUTLASS CONVs.
+    Note: optimal performance is not to be expected from this interface. To achieve optimal
+    performance, one should specify and tune each configuration parameter.
+    The simplest example of using this interface is the following:
+    .. highlight:: python
+    .. code-block:: python
+        # A, B, C, and D are torch/numpy/cupy tensor objects
+        plan = cutlass_cppgen.op.Conv2d(A, B, C, D)
+        plan.run(stride=(1, 1), padding=(0, 0), dilation=(1, 1))
+    One can also use the interface by specifying data types of operands at construction
+    and using different tensor objects with these data types at runtime:
+    .. highlight:: python
+    .. code-block:: python
+        # The following is shorthand for:
+        #        cutlass_cppgen.op.Conv2d(kind="fprop",
+        #                          element_A=torch.float32, element_B=torch.float32,
+        #                          element_C=torch.float32, element_D=torch.float32,
+        #                          element_accumulator=torch.float32)
+        plan = cutlass_cppgen.op.Conv2d(kind="fprop", element=torch.float32)
+        A0 = torch.rand((128, 256), dtype=torch.float32, device='cuda')
+        B0 = torch.rand((256, 64), dtype=torch.float32, device='cuda')
+        C0 = torch.zeros((128, 64), dtype=torch.float32, device='cuda')
+        D0 = torch.zeros((128, 64), dtype=torch.float32, device='cuda')
+        plan.run(A0, B0, C0, D0, stride=(1, 1), padding=(0, 0), dilation=(1, 1))
+        A1 = torch.rand((32, 128), dtype=torch.float32, device='cuda')
+        B1 = torch.rand((128, 256), dtype=torch.float32, device='cuda')
+        C1 = torch.zeros((32, 256), dtype=torch.float32, device='cuda')
+        D1 = torch.zeros((32, 256), dtype=torch.float32, device='cuda')
+        plan.run(A1, B1, C1, D1, stride=(1, 1), padding=(0, 0), dilation=(1, 1))
+    The interface additionally enables one to decouple the compilation of the underlying CUTLASS
+    kernel from its execution:
+    .. highlight:: python
+    .. code-block:: python
+        plan = cutlass_cppgen.op.Conv2d(kind="fprop", element=np.float32)
+        # Do other work...
+        plan.run(A0, B0, C0, D0, stride=(1, 1), padding=(0, 0), dilation=(1, 1))
+        # Do other work...
+        plan.run(A1, B1, C1, D1, stride=(1, 1), padding=(0, 0), dilation=(1, 1))
+    Elementwise activation functions are easily fused to the convolution via the interface:
+    .. highlight:: python
+    .. code-block:: python
+        plan = cutlass_cppgen.op.Conv2d(kind="fprop", element=np.float32)
+        plan.activation = cutlass_cppgen.epilogue.relu
+    Operations can also be run asynchronously:
+    .. highlight:: python
+    .. code-block:: python
+        plan = cutlass_cppgen.op.Conv2d(kind="fprop", element=np.float32)
+        args = plan.run()
+        # Do other work...
+        args.sync()
+"""
+from __future__ import annotations
+from typing import Optional
+from cutlass_cppgen.utils.lazy_import import lazy_import
+cuda = lazy_import("cuda.cuda")
+cudart =  lazy_import("cuda.cudart")
+from cutlass_library import (
+    ConvKind,
+    ConvMode,
+    DataTypeSize,
+    IteratorAlgorithm,
+    OperationKind,
+    SplitKMode,
+    StrideSupport,
+)
+import cutlass_cppgen
+from cutlass_cppgen import epilogue
+from cutlass_cppgen.backend import compiler
+from cutlass_cppgen.backend.conv2d_operation import Conv2dArguments, Conv2dOperation
+from cutlass_cppgen.backend.reduction_operation import ReductionOperation, ReductionArguments
+from cutlass_cppgen.backend.library import TensorDescription, TileDescription
+from cutlass_cppgen.op.op import OperationBase
+from cutlass_cppgen.shape import Conv2DProblemSize, MatrixCoord
+from cutlass_cppgen.utils import check, datatypes
+class Conv2d(OperationBase):
+    """
+    Constructs a ``Conv2d`` object.
+    The convolution kind (fprop, wgrad, dgrad), the data types of operands A, B, and C,
+    along with the data type of output D and that used for accumulation, are bound to the ``Conv2d``
+    object throughout its lifetime -- these are not to be changed after a ``Conv2d`` has been constructed.
+    The constructor has optional parameters for flexibly setting these parameters. The following
+    constructors are equivalent:
+    .. highlight:: python
+    .. code-block:: python
+        # Use F32 for A, B, C, D, and accumulation in fprop
+        # Use the generic ``element`` parameter to concisely set all data types for operands to the same values.
+        Conv2d(kind="fprop", element=cutlass_cppgen.DataType.f32)
+        # Explicitly specify the data types to use for A, B, C, and D.
+        Conv2d(kind="fprop", element_A=cutlass_cppgen.DataType.f32, element_B=cutlass_cppgen.DataType.f32,
+            element_C=cutlass_cppgen.DataType.f32, element_D=cutlass_cppgen.DataType.f32)
+        # Set the data types and elements from existing tensors. Note that one can use different tensors when
+        # executing GEMM via the ``run()`` method than passed in here (though those passed in to ``run()`` must
+        # have the same data type as those passed in here).
+        # A, B, C, and D are torch.Tensor objects of type torch.float32 under the channel-last layout
+        Conv2d(kind="fprop", A=A, B=B, C=C, D=D)
+        # Explicitly specify the data type for only some of A, B, C, and D. Unspecified data types will inherit
+        # those passed in via the generic ``element``
+        Conv2d(kind="fprop", element_A=cutlass_cppgen.DataType.f32, element_accumulator=cutlass_cppgen.DataType.f32,
+            element=cutlass_cppgen.DataType.f32)
+    The order of precedence for the setting of the data type for a given operand/output is as follows:
+        1) If the tensor type is specified (e.g., ``A``), use the data type inferred from this tensor
+        2) Otherwise, if the data type (e.g., ``element_A``) is specified, use those
+        3) Otherwise, use the generic values (e.g., ``element``)
+    :param kind: the convolution kind (i.e. fprop, wgrad, and dgrad)
+    :type kind: str
+    :param A: tensor representing data type of operand A
+    :param B: tensor representing data type of operand B
+    :param C: tensor representing data type of operand C
+    :param D: tensor representing data type of operand D
+    :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
+    :param beta: scalar parameter beta from GEMM operation that scales operand C
+    :param element: generic data type to be used for operands A, B, C, D, as well as the accumulation data type
+    :type element: cutlass_cppgen.DataType
+    :param element_A: data type to be used for operand A
+    :type element_A: cutlass_cppgen.DataType
+    :param element_B: data type to be used for operand B
+    :type element_B: cutlass_cppgen.DataType
+    :param element_C: data type to be used for operand C
+    :type element_C: cutlass_cppgen.DataType
+    :param element_D: data type to be used for operand D
+    :type element_D: cutlass_cppgen.DataType
+    :param element_accumulator: data type to be used in accumulation of the product of operands A and B
+    :type element_accumulator: cutlass_cppgen.DataType
+    :param cc: compute capability of device for which kernels should be compiled. For example, if running on H100, this should be set to 90
+    :type cc: int
+    :param kernel_cc: compute capability of kernels to generate. For example, if running on SM90, but desiring to use a CUTLASS 2.x-style Ampere kernel, this should be set to 80
+    :type kernel_cc: int
+    """
+    def __init__(
+        self, kind="fprop",
+        A=None, B=None, C=None, D=None, alpha=1.0, beta=0.0,
+        element=None,
+        element_A=None, element_B=None, element_C=None, element_D=None,
+        element_accumulator=None,
+        cc: int = None, kernel_cc: int = None
+    ):
+        super().__init__(cc=cc, kernel_cc=kernel_cc, operation_kind=OperationKind.Conv2d)
+        # Verify the kernel cc
+        if self.current_cc in [90, 100, 101, 103]:
+            # The Conv2d kernel on Hopper (SM90) is currently unsupported
+            # Revert to use SM80-tagged kernels
+            cutlass_cppgen.logger.warning("Reverting to using SM80-tagged kernel. Opclass may change.")
+            self.specified_kernel_cc = 80
+            self._reset_options(80)
+        # The arch is used in testing
+        self.arch = self.current_cc
+        self.name = "conv2d" + kind
+        # The convolution kind. (concept: cutlass_library.library.ConvKind)
+        self.conv_kind = datatypes.getattr_enum(ConvKind, kind)
+        # The element types (concept: cutlass library types) of A, B, C, and D
+        elements = []
+        layouts = []
+        # Complete the data types based on user-provided arguments
+        for elt, tens, name in zip([element_A, element_B, element_C, element_D],
+                                   [A, B, C, D],
+                                   ["A", "B", "C", "D"]):
+            if elt is not None and tens is not None:
+                raise Exception(f'Must not specify both element_{name} and tensor {name}')
+            if elt is None and tens is None and element is None:
+                raise Exception(f'Must specify one of element_{name}, tensor {name}, or generic element.')
+            elt_to_set = None
+            lay_to_set = None
+            if tens is not None:
+                elt_to_set, _ = datatypes.get_datatype_and_layout(tens)
+            else:
+                elt_to_set = elt if elt is not None else element
+            assert elt_to_set is not None
+            # Currently we only support layout TensorNHWC
+            lay_to_set = cutlass_cppgen.LayoutType.TensorNHWC
+            elements.append(datatypes.library_type(elt_to_set))
+            layouts.append(lay_to_set)
+        self._element_a, self._element_b, self._element_c, self._element_d = elements
+        self._layout_a, self._layout_b, self._layout_c, self._layout_d = layouts
+        self.A, self.B, self.C, self.D, self.alpha, self.beta = A, B, C, D, alpha, beta
+        if element_accumulator is None:
+            self._element_accumulator = self._element_c
+        else:
+            self._element_accumulator = datatypes.library_type(element_accumulator)
+        # Default inputs if none is supplied in run()
+        self.A = A
+        self.B = B
+        self.C = C
+        self.D = D
+        self.alpha = alpha
+        self.beta = beta
+        # We only specify the stride of the swizzling functor here
+        # The actual swizzling functor is determined in run based on conv_kind and stride
+        self._swizzling_stride = 1
+        # Arguments that will be set to default value in _reset_operations
+        # The default tile_description and op_class are fetched from manifest of cutlass library
+        self._tile_description = None
+        self.op_class = None
+        # The default identity epilogue will be created
+        self.epilogue_functor = None
+        self._reset_operations()
+        # Arguments that will be determined online based on arguments of "run"
+        # based on stride, input/output channels, alignment, and conv_kind
+        self._iterator_algorithm = None
+        self._stride_support = None
+    def _reset_operations(self, reset_epilogue: bool = True):
+        # Set the default op class
+        datatype_comb = (self._element_a, self._element_b, self._element_accumulator)
+        layout_comb = (self._layout_a, self._layout_b)
+        self.possible_op_classes = self.options.supporting_opclasses(
+            self._element_a, self._element_b, self._element_accumulator,
+            self._layout_a, self._layout_b, self._math_operation
+        )
+        if cutlass_cppgen.OpcodeClass.TensorOp in self.possible_op_classes:
+            self.opclass = cutlass_cppgen.OpcodeClass.TensorOp
+        elif cutlass_cppgen.OpcodeClass.Simt in self.possible_op_classes:
+            self.opclass = cutlass_cppgen.OpcodeClass.Simt
+        else:
+            if self._math_operation is not None:
+                math_op_str = f' and math operation {self._math_operation}'
+            else:
+                math_op_str = ''
+            raise Exception(f'No kernel configuration found for supported data type and layout '
+                            f'combination {datatype_comb}x{layout_comb}{math_op_str}')
+        if reset_epilogue:
+            self._reset_epilogue_functor_activation(epilogue.identity)
+        self.alignment_pref_A = min(
+            128 // DataTypeSize[self._element_a], max(self.possible_operations.alignments("A")))
+        self.alignment_pref_B = min(
+            128 // DataTypeSize[self._element_b], max(self.possible_operations.alignments("B")))
+        self.alignment_pref_C = min(
+            128 // DataTypeSize[self._element_c], max(self.possible_operations.alignments("C")))
+    #
+    # Tile description Related
+    #
+    @property
+    def tile_description(self) -> TileDescription:
+        """
+        Returns the tile description
+        """
+        return self._tile_description
+    @tile_description.setter
+    def tile_description(
+        self, td=None):
+        """
+        Set the tile description
+        :param td: tile description
+        :type td: cutlass_cppgen.backend.TileDescription, or a dict with keys
+                  {
+                      "threadblock_shape": [int, int, int],
+                      "warp_count": [int, int, int],
+                      "stages": int,
+                      "instruction_shape": [int, int, int] (optional),
+                      "cluster_shape": [int, int, int] (optional)
+                  }
+        """
+        if td is None:
+            return
+        if isinstance(td, dict):
+            if self._tile_description is None:
+                op = self.possible_operations.default_operation(self._math_operation)
+                self._tile_description = datatypes.td_from_profiler_op(op)
+            if "cluster_shape" in td.keys():
+                if td["cluster_shape"] != [1, 1, 1]:
+                    cutlass_cppgen.logger.warning("Conv2d currently only support 'cluster_shape'=[1, 1, 1]'.")
+                    td["cluster_shape"] = [1, 1, 1]
+            td = self._tile_description.clone_and_update(td)
+        valid, msg = self._valid_tile_description(td)
+        if valid:
+            self._tile_description = td
+        else:
+            raise Exception(msg)
+    def _valid_tile_description(self, td: TileDescription) -> tuple:
+        """
+        Checks whether the provided tile description is valid for the given compute capability. At present,
+        this checks the following:
+        - Does the tile description use a number of stages supported by the compute capability in question?
+        - Does the tile size requested fit within shared memory?
+        - Are cluster dimensions outside the valid range requested for a given architecture (e.g.,
+          more non-unit cluster dimensions for pre-SM90 architectures)?
+        - Is the kernel schedule being used supported on the architecture in question?
+        :param td: tile description to validate
+        :type td: cutlass_cppgen.backend.TileDescription
+        :return: tuple in which the first element is a bool indicating that the tile description is valid
+                 and the second element is a string providing an optional error message.
+        :rtype: tuple
+        """
+        valid, msg = check.valid_stage_count(self.cc, self.current_cc, td)
+        if not valid:
+            return (valid, msg)
+        valid, msg = check.valid_cluster_shape(self.current_cc, td.cluster_shape)
+        if not valid:
+            return (valid, msg)
+        return valid, msg
+    def tile_descriptions(self) -> list:
+        """
+        Returns a list of valid tile descriptions for the operations
+        :returns: list of valid tile descriptions for the operations
+        :rtype: list
+        """
+        descriptions = []
+        description_str = []
+        for op in self.possible_operations.all_operations:
+            td = datatypes.td_from_profiler_op(op)
+            if self._math_operation is not None:
+                if td.math_instruction.math_operation != self._math_operation:
+                    continue
+            if str(td) not in description_str:
+                description_str.append(str(td))
+                descriptions.append(td)
+        return descriptions
+    #
+    # Swizzling functor Related
+    #
+    @property
+    def swizzling_stride(self):
+        """
+        Returns the stride of swizzling currently being used by the Conv2d
+        :return: swizzing stride
+        """
+        return self._swizzling_stride
+    @swizzling_stride.setter
+    def swizzling_stride(self, stride: int):
+        """
+        Sets the swizzling functor to the type specified by `swizzling_functor`
+        """
+        if not isinstance(stride, int):
+            raise Exception(f"Expect integer (1, 2, 4, 8), got {stride}")
+        self._swizzling_stride = stride
+    def _propose_swizzling_functor(self, stride):
+        """
+        Automatically propose the swizzling functor based on the stride
+        """
+        if self.conv_kind == ConvKind.Dgrad:
+            if stride[0] != 1 or stride[1] != 1:
+                return getattr(cutlass_cppgen.swizzle, f"StridedDgradIdentitySwizzle{self._swizzling_stride}")
+        return getattr(cutlass_cppgen.swizzle, f"IdentitySwizzle{self._swizzling_stride}")
+    #
+    # Iterator Algorithm Related
+    #
+    @property
+    def iterator_algorithm(self) -> IteratorAlgorithm:
+        """
+        Returns the iterator algorithm
+        """
+        return self._iterator_algorithm
+    @iterator_algorithm.setter
+    def iterator_algorithm(self, alg: str):
+        """
+        Sets the iterator algorithm
+        :param alg: The iterator algorithm
+        :type td: string, options: "analytic", "optimized", "few_channels", and "fixed_channels"
+        """
+        iterator_alg = datatypes.getattr_enum(IteratorAlgorithm, alg)
+        # Check if the iterator algorithm is valid
+        if iterator_alg in [IteratorAlgorithm.FewChannels, IteratorAlgorithm.FixedChannels] and self.conv_kind != ConvKind.Fprop:
+            raise Exception(f"{self.conv_kind} does not support iterator algorithm {alg}.")
+        self._iterator_algorithm = iterator_alg
+    def _propose_iterator_algorithm(self, problem_size, alignment_a, alignment_b) -> IteratorAlgorithm:
+        """
+        Propose a valid iterator algorithm based on problem size and alignment
+        """
+        if self.conv_kind == ConvKind.Fprop:
+            # Check whether the fixed channel is applicable
+            if problem_size.C == alignment_a:
+                return IteratorAlgorithm.FixedChannels
+            elif (problem_size.C % alignment_a == 0 and
+                  problem_size.R <= 32 and problem_size.S <= 32):
+                return IteratorAlgorithm.Optimized
+            else:
+                return IteratorAlgorithm.Analytic
+        elif self.conv_kind == ConvKind.Dgrad:
+            if (problem_size.K % alignment_a == 0 and
+                problem_size.R <= 32 and problem_size.S <= 32 and
+                problem_size.C % alignment_b == 0):
+                return IteratorAlgorithm.Optimized
+            else:
+                return IteratorAlgorithm.Analytic
+        elif self.conv_kind == ConvKind.Wgrad:
+            if (problem_size.K % alignment_a == 0 and
+                problem_size.C % alignment_b == 0):
+                return IteratorAlgorithm.Optimized
+            else:
+                return IteratorAlgorithm.Analytic
+    def _validate_iterator_algorithm(self, iterator_algorithm, problem_size, alignment_a, alignment_b) -> bool:
+        """
+        Validate whether the user provide iterator algorithm works for the given problem size
+        """
+        if self.conv_kind == ConvKind.Fprop:
+            if iterator_algorithm == IteratorAlgorithm.FixedChannels:
+                return problem_size.C == alignment_a
+            elif iterator_algorithm == IteratorAlgorithm.Optimized:
+                return (problem_size.C % alignment_a == 0 and
+                  problem_size.R <= 32 and problem_size.S <= 32)
+            elif iterator_algorithm == IteratorAlgorithm.FewChannels:
+                return problem_size.C % alignment_a == 0
+        elif self.conv_kind == ConvKind.Dgrad:
+            if iterator_algorithm == IteratorAlgorithm.Optimized:
+                return (problem_size.K % alignment_a == 0 and
+                        problem_size.R <= 32 and problem_size.S <= 32 and
+                        problem_size.C % alignment_b == 0)
+        elif self.conv_kind == ConvKind.Wgrad:
+            if iterator_algorithm == IteratorAlgorithm.Optimized:
+                return (problem_size.K % alignment_a == 0 and
+                problem_size.C % alignment_b == 0)
+        return True
+    #
+    # Stride Support Related
+    #
+    def _propose_stride_support(self, stride):
+        if self.conv_kind == ConvKind.Dgrad:
+            if stride[0] == 1 and stride[1] == 1:
+                return StrideSupport.Unity
+        return StrideSupport.Strided
+    #
+    # Construct and Compilation
+    #
+    def construct(
+        self, tile_description: TileDescription = None,
+        alignment_A: int = None, alignment_B: int = None, alignment_C: int = None,
+        iterator_algorithm: IteratorAlgorithm = None,
+        stride_support = None, swizzling_functor: cutlass_cppgen.swizzle = None,
+        epilogue_functor=None) -> cutlass_cppgen.backend.Conv2dOperation:
+        """
+        Constructs a ``cutlass_cppgen.backend.Conv2dOperation`` based on the input parameters and current
+        kernel specification of the ``Conv2d`` object.
+        :param tile_description: tile description specifying shapes and operand types to use in the kernel
+        :type tile_description: cutlass_cppgen.backend.TileDescription
+        :param alignment_A: alignment of operand A
+        :type alignment_A: int
+        :param alignment_B: alignment of operand B
+        :type alignment_B: int
+        :param alignment_C: alignment of operand C
+        :type alignment_C: int
+        :param iterator_algorithm: the iterator algorithm used
+        :type iterator_algorithm: cutlass_library.library.IteratorAlgorithm
+        :param stride_support: the stride support of dgrad
+        :type stride_support: cutlass_library.library.StrideSupport
+        :param swizzling_functor: the swizzling functor
+        :type swizzling_functor: cutlass_cppgen.swizzle
+        :param epilogue_functor: the epilogue functor
+        :return: operation that was constructed
+        :rtype: cutlass_cppgen.backend.Conv2dOperation
+        """
+        # Get alignment
+        alignment_A = check.alignment_or_default(alignment_A, self.alignment_pref_A)
+        alignment_B = check.alignment_or_default(alignment_B, self.alignment_pref_B)
+        alignment_C = check.alignment_or_default(alignment_C, self.alignment_pref_C)
+        tensor_A = TensorDescription(self._element_a, self._layout_b, alignment_A)
+        tensor_B = TensorDescription(self._element_b, self._layout_b, alignment_B)
+        tensor_C = TensorDescription(self._element_c, self._layout_c, alignment_C)
+        if tile_description is None:
+            if self.tile_description is not None:
+                tile_description = self.tile_description
+            else:
+                op = self.possible_operations.operations(alignment_A, alignment_B, alignment_C, self._math_operation)[0]
+                tile_description = datatypes.td_from_profiler_op(op)
+        else:
+            valid, err_str = self._valid_tile_description(tile_description)
+            if not valid:
+                raise Exception(f"Invalid tile description. {err_str}")
+            self.tile_description = tile_description
+        if iterator_algorithm is None:
+            # If the iterator algorithm is already set
+            if self.iterator_algorithm is not None:
+                iterator_algorithm = self.iterator_algorithm
+            else:
+                # Otherwise, we conservatively use the analytic iterator for correctness
+                iterator_algorithm = IteratorAlgorithm.Analytic
+        if stride_support is None:
+            # If the stride support is already set
+            if self._stride_support is not None:
+                stride_support = self._stride_support
+            else:
+                # Otherwise, we assume strided
+                stride_support = StrideSupport.Strided
+        if swizzling_functor is None:
+            # If the swizzling functor is already set
+            swizzling_functor = self._propose_swizzling_functor(stride=(2, 2))
+        if epilogue_functor is None:
+            if self.epilogue_functor is not None:
+                epilogue_functor = self.epilogue_functor
+            else:
+                epilogue_functor = self._create_epilogue_functor_activation(self._activation)
+        # Reset the alignment of the epilogue functor
+        epilogue_functor = self._reset_epilogue_functor_alignment(alignment_C, epilogue_functor)
+        operation = Conv2dOperation(
+            conv_kind=self.conv_kind,
+            iterator_algorithm=iterator_algorithm,
+            arch=self.current_cc,
+            tile_description=tile_description,
+            A=tensor_A, B=tensor_B, C=tensor_C,
+            stride_support=stride_support,
+            epilogue_functor=epilogue_functor,
+            swizzling_functor=swizzling_functor,
+        )
+        return operation
+    def compile(self, tile_description: TileDescription = None,
+                alignment_A: int = None, alignment_B: int = None, alignment_C: int = None,
+                iterator_algorithm: IteratorAlgorithm = None,
+                stride_support = None, swizzling_functor: cutlass_cppgen.swizzle = None,
+                epilogue_functor = None, print_module: bool = False) -> cutlass_cppgen.backend.Conv2dOperation:
+        """
+        Emits and compiles the kernel currently specified. If ``tile_description`` and any
+        of the ``alignment`` parameters are set, the kernel will be chosen using this
+        tile description and alignments. Otherwise, a default tile description and alignment
+        will be used.
+        ::param tile_description: tile description specifying shapes and operand types to use in the kernel
+        :type tile_description: cutlass_cppgen.backend.TileDescription
+        :param alignment_A: alignment of operand A
+        :type alignment_A: int
+        :param alignment_B: alignment of operand B
+        :type alignment_B: int
+        :param alignment_C: alignment of operand C
+        :type alignment_C: int
+        :param iterator_algorithm: the iterator algorithm used
+        :type iterator_algorithm: cutlass_library.library.IteratorAlgorithm
+        :param stride_support: the stride support of dgrad
+        :type stride_support: cutlass_library.library.StrideSupport
+        :param swizzling_functor: the swizzling functor
+        :type swizzling_functor: cutlass_cppgen.swizzle
+        :param epilogue_functor: the epilogue functor
+        :return: operation that was compiled
+        :rtype: cutlass_cppgen.backend.Conv2dOperation
+        """
+        self.operation = self.construct(
+            tile_description, alignment_A, alignment_B, alignment_C,
+            iterator_algorithm, stride_support, swizzling_functor, epilogue_functor)
+        if print_module:
+            print(self.operation.rt_module.emit())
+        compiler.add_module([self.operation,])
+        return self.operation
+    #
+    # Run Related
+    #
+    def _verify_type_and_layout(self, tensor, ref_type, ref_layout, name):
+        """
+        Verifies that ``tensor`` has data type ``ref_type`` and layout ``ref_layout``. An exception
+        is raised if it does not.
+        :param tensor: object representing a tensor passed in to verify, or ``None`` if no tensor was passed in
+        :type tensor: numpy/cupy/torch array/tensor object
+        :param ref_dtype: data type for the tensor that this object was initialized to
+        :param name: identifier of the tensor to verify. Used in raising exceptions
+        :type name: str
+        """
+        dtype, _ = datatypes.get_datatype_and_layout(tensor)
+        if dtype != ref_type:
+            raise Exception(f'Tensor {name} with type and layout {dtype} '
+                            f'does not match the expected type of {ref_type}.')
+    def _get_and_verify_conv_problem_size(self, A, B, C, stride, padding, dilation):
+        if self.conv_kind == ConvKind.Fprop:
+            input = A
+            weight = B
+            output = C
+            output_tensor = "C"
+        elif self.conv_kind == ConvKind.Dgrad:
+            output = A
+            weight = B
+            input = C
+            output_tensor = "A"
+        elif self.conv_kind == ConvKind.Wgrad:
+            output = A
+            input = B
+            weight = C
+            output_tensor = "A"
+        else:
+            raise Exception(f"Convolution kind {self.conv_kind} is not supported")
+        N_, H_, W_, C_ = datatypes.get_tensor_shape(input, op="CONV")
+        K_, R_, S_, _ = datatypes.get_tensor_shape(weight, op="CONV")
+        _, P_, Q_, _ = datatypes.get_tensor_shape(output, op="CONV")
+        problem_size = Conv2DProblemSize(
+            N_, H_, W_, C_,
+            K_, R_, S_, C_,
+            padding[0], padding[1],
+            stride[0], stride[1],
+            dilation[0], dilation[1],
+            ConvMode.CrossCorrelation,
+            1, 1
+        )
+        if P_ != problem_size.P or Q_ != problem_size.Q:
+            raise Exception(
+                f"Tensor {output_tensor} size should be ({N_}, {problem_size.P}, {problem_size.Q}, {K_}), got ({N_}, {P_}, {Q_}, {K_})")
+        return problem_size
+    def run(self, A=None, B=None, C=None, D=None,
+            stride=(1, 1), padding=(0, 0), dilation=(1, 1),
+            alpha=None, beta=None,
+            split_k=("serial", 1), sync: bool = True,
+            print_module: bool = False,
+            stream: Optional[cuda.CUstream] = None) -> Conv2dArguments:
+        """
+        Runs the kernel currently specified. If it has not already been, the kernel is emitted and
+        compiled. Tensors holding operands and outputs of the kernel are sourced either from the
+        ``A``, ``B``, ``C``, ``D``, ``alpha``, and ``beta``
+        parameters provided in the call, or from those
+        passed in on the construction of this object -- one of the two must be specified.
+        By default, this call returns only once the kernel has completed. To launch the kernel
+        and immediately return, set ``sync=False``. In this case, it is the responsibility of the
+        caller to synchronize the results of the kernel before attempting to access outputs
+        by calling ``sync()`` on the arguments returned from this call.
+        Note that ``stride``, ``padding``, and ``dilation`` precede ``alpha`` and ``beta`` in the
+        positional order of this signature.
+        :param A: tensor representing data type and layout of operand A
+        :param B: tensor representing data type and layout of operand B
+        :param C: tensor representing data type and layout of operand C
+        :param D: tensor representing data type and layout of operand D
+        :param stride: (stride_h, stride_w) describing the convolution stride. Default: (1, 1)
+        :param padding: (pad_h, pad_w) describing the convolution padding. Default: (0, 0)
+        :param dilation: (dilation_h, dilation_w) describing the dilation of convolution. Default: (1, 1)
+        :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
+        :param beta: scalar parameter beta from GEMM operation that scales operand C
+        :param split_k: a tuple (split_k_mode, split_k_slices)
+        :param sync: whether the call should wait for the kernel to complete before returning
+        :type sync: bool
+        :param print_module: whether to print the emitted C++ code
+        :type print_module: bool
+        :param stream: cuda stream, defaults to cuda.cuda.CUstream(0)
+        :type stream: :class:`cuda.cuda.CUstream`
+        :return: arguments passed in to the kernel
+        :rtype: cutlass_cppgen.backend.Conv2dArguments
+        :raises Exception: if C is omitted while beta is non-zero, or if the configured iterator
+            algorithm fails validation for the current problem
+        """
+        # Fall back to stream 0 when the caller did not supply one
+        if not stream:
+            stream = cuda.CUstream(0)
+        super().run_setup()
+        # ``_verify_tensor`` / ``_verify_scalar`` are defined outside this view; per the docstring
+        # above they presumably fall back to the values stored at construction -- TODO confirm
+        A = self._verify_tensor(A, self.A, self._element_a, self._layout_a, "A")
+        B = self._verify_tensor(B, self.B, self._element_b, self._layout_b, "B")
+        C = self._verify_tensor(C, self.C, self._element_c, self._layout_c, "C")
+        D = self._verify_tensor(D, self.D, self._element_d, self._layout_d, "D")
+        alpha = self._verify_scalar(alpha, self.alpha, self._element_c, "alpha")
+        beta = self._verify_scalar(beta, self.beta, self._element_c, "beta")
+        # handle the case when there is no C
+        if C is None:
+            if beta != 0:
+                raise Exception(f"With beta {beta} != 0, C has to be provided.")
+            else:
+                # beta is 0 here, so D stands in as the source operand
+                C = D
+        # Construct problem size based on input
+        # It also verifies whether the A, B, C, D, stride, padding, and dilation are matching
+        problem_size = self._get_and_verify_conv_problem_size(A, B, C, stride, padding, dilation)
+        # Propose stride support based on input
+        stride_support = self._propose_stride_support(stride)
+        # Propose swizzling functor
+        swizzling_functor = self._propose_swizzling_functor(stride)
+        shape_a = datatypes.get_tensor_shape(A, op="CONV")
+        shape_b = datatypes.get_tensor_shape(B, op="CONV")
+        shape_c = datatypes.get_tensor_shape(C, op="CONV")
+        # Get the alignment
+        alignment_a = self.possible_operations.find_alignment(shape_a, self._layout_a, operand="A")
+        alignment_b = self.possible_operations.find_alignment(shape_b, self._layout_b, operand="B")
+        alignment_c = self.possible_operations.find_alignment(shape_c, self._layout_c, operand="C")
+        # Combine the alignment found for each shape with the stored alignment preference
+        alignment_a = check.update_alignment(alignment_a, self.alignment_pref_A)
+        alignment_b = check.update_alignment(alignment_b, self.alignment_pref_B)
+        alignment_c = check.update_alignment(alignment_c, self.alignment_pref_C)
+        # Propose iterator algorithm based on input
+        if self._iterator_algorithm is None:
+            # Propose a default iterator algorithm based on the problem size
+            iterator_algorithm = self._propose_iterator_algorithm(problem_size, alignment_a, alignment_b)
+        else:
+            # An iterator algorithm was configured: use it only if it validates for this problem
+            if (self._validate_iterator_algorithm(self._iterator_algorithm, problem_size, alignment_a, alignment_b)):
+                iterator_algorithm = self._iterator_algorithm
+            else:
+                raise Exception(f"Iterator algorithm {self._iterator_algorithm} is invalid for current problem.")
+        # Epilogue arguments are alpha and beta followed by any activation arguments
+        epilogue_args = [alpha, beta]
+        if hasattr(self, "_activation_args"):
+            if isinstance(self._activation_args, list):
+                epilogue_args += self._activation_args
+            else:
+                epilogue_args.append(self._activation_args)
+        # With parallel split-K the convolution kernel is built with an identity epilogue;
+        # the configured epilogue functor is given to the reduction operation further below
+        if split_k[0] == "parallel" and split_k[1] > 1:
+            epilogue_functor = self._create_epilogue_functor_activation(epilogue.identity)
+        else:
+            epilogue_functor = self.epilogue_functor
+        # The alignment is determined by the iterator function (I believe)
+        self.compile(tile_description=self.tile_description, alignment_A=alignment_a, alignment_B=alignment_b,
+                     alignment_C=alignment_c, iterator_algorithm=iterator_algorithm, stride_support=stride_support,
+                     swizzling_functor=swizzling_functor, epilogue_functor=epilogue_functor, print_module=print_module)
+        # Create reduction operation for parallel split-k
+        if split_k[0] == "parallel" and split_k[1] > 1:
+            epilogue_functor_reduction = self._reset_epilogue_functor_alignment(alignment_c, self.epilogue_functor)
+            self.reduction_operation = ReductionOperation(
+                shape=MatrixCoord(4, 32 * alignment_c), C=self.operation.C,
+                element_accumulator=self._element_accumulator,
+                element_compute=self._element_accumulator,
+                epilogue_functor=epilogue_functor_reduction,
+                count=alignment_c
+            )
+            if print_module:
+                print(self.reduction_operation.rt_module.emit())
+            compiler.add_module([self.reduction_operation,])
+        arguments = Conv2dArguments(
+            operation=self.operation, problem_size=problem_size,
+            A=A, B=B, C=C, D=D,
+            output_op=self.operation.epilogue_type(*epilogue_args),
+            split_k_mode=datatypes.getattr_enum(SplitKMode, split_k[0]),
+            split_k_slices=split_k[1],
+            stream=stream
+        )
+        self.operation.run(arguments)
+        if split_k[0] == "parallel" and split_k[1] > 1:
+            implicit_gemm_size = arguments.problem_size.implicit_gemm_size(self.conv_kind)
+            # The reduction takes ``arguments.ptr_D`` as its workspace and writes to ``D``
+            reduction_arguments = ReductionArguments(
+                self.reduction_operation,
+                problem_size=[implicit_gemm_size.m, implicit_gemm_size.n],
+                partitions=split_k[1],
+                workspace=arguments.ptr_D,
+                destination=D,
+                source=C,
+                output_op=self.reduction_operation.epilogue_type(*epilogue_args),
+                stream=stream
+            )
+            self.reduction_operation.run(reduction_arguments)
+        # NOTE(review): with sync=False and parallel split-K, neither ``reduction_arguments.sync()``
+        # nor ``arguments.free()`` is called here, and only ``arguments`` is returned -- confirm
+        # how callers are expected to synchronize and release memory in that case
+        if sync:
+            if split_k[0] == "parallel" and split_k[1] > 1:
+                reduction_arguments.sync()
+                # Free memory allocated by args because we are not
+                # calling `arguments.sync()` in this case (which will free memory)
+                arguments.free()
+            else:
+                arguments.sync()
+        return arguments
+    #
+    # Helper functions
+    #
+    @staticmethod
+    def output_size(input_size, weight_size, padding, stride, dilation):
+        problem_size = Conv2DProblemSize(
+            *input_size,
+            *weight_size,
+            padding[0], padding[1],
+            stride[0], stride[1],
+            dilation[0], dilation[1],
+            ConvMode.CrossCorrelation,
+            1, 1
+        )
+        return (problem_size.N, problem_size.P, problem_size.Q, problem_size.K)
+#
+# Easy to use interfaces for fprop, wgrad, and dgrad
+#
+class Conv2dFprop(Conv2d):
+    def __init__(
+        self,
+        input=None, weight=None, C=None, output=None, alpha=1, beta=0,
+        element=None,
+        element_input=None, element_weight=None, element_C=None, element_output=None,
+        element_accumulator=None,
+        cc: int = None, kernel_cc: int = None):
+        A, B, D = input, weight, output
+        element_A, element_B, element_D = element_input, element_weight, element_output
+        super().__init__(
+            "fprop", A, B, C, D, alpha, beta, element,
+            element_A, element_B, element_C, element_D,
+            element_accumulator, cc, kernel_cc)
+    def run(
+        self, input=None, weight=None, C=None, output=None, alpha=None, beta=None,
+        stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1),
+        sync: bool = True, print_module: bool = False,
+        stream: Optional[cuda.CUstream] = None) -> Conv2dArguments:
+        if not stream:
+            stream = cuda.CUstream(0)
+        A, B, D = input, weight, output
+        return super().run(
+            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream)
+class Conv2dDgrad(Conv2d):
+    def __init__(
+        self,
+        grad_output=None, weight=None, C=None, grad_input=None, alpha=1, beta=0,
+        element=None,
+        element_grad_output=None, element_weight=None, element_C=None, element_grad_input=None,
+        element_accumulator=None,
+        cc: int = None, kernel_cc: int = None):
+        A, B, D = grad_output, weight, grad_input
+        element_A, element_B, element_D = element_grad_output, element_weight, element_grad_input
+        super().__init__(
+            "dgrad", A, B, C, D, alpha, beta, element,
+            element_A, element_B, element_C, element_D,
+            element_accumulator, cc, kernel_cc)
+    def run(self, grad_output=None, weight=None, C=None, grad_input=None, alpha=None, beta=None,
+        stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1),
+        sync: bool = True, print_module: bool = False,
+        stream: Optional[cuda.CUstream] = None) -> Conv2dArguments:
+        #
+        if not stream:
+            stream = cuda.CUstream(0)
+        A, B, D = grad_output, weight, grad_input
+        return super().run(
+            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream)
+class Conv2dWgrad(Conv2d):
+    def __init__(
+        self,
+        grad_output=None, input=None, C=None, grad_weight=None, alpha=1, beta=0,
+        element=None,
+        element_grad_output=None, element_input=None, element_C=None, element_grad_weight=None,
+        element_accumulator=None,
+        cc: int = None, kernel_cc: int = None):
+        A, B, D = grad_output, input, grad_weight
+        element_A, element_B, element_D = element_grad_output, element_input, element_grad_weight
+        super().__init__(
+            "wgrad", A, B, C, D, alpha, beta, element,
+            element_A, element_B, element_C, element_D,
+            element_accumulator, cc, kernel_cc)
+    def run(self, grad_output=None, input=None, C=None, grad_weight=None, alpha=None, beta=None,
+        stride=(1, 1), padding=(0, 0), dilation=(1, 1), split_k=("serial", 1),
+        sync: bool = True, print_module: bool = False,
+        stream: Optional[cuda.CUstream] = None) -> Conv2dArguments:
+        if not stream:
+            stream = cuda.CUstream(0)
+        A, B, D = grad_output, input, grad_weight
+        return super().run(
+            A, B, C, D, alpha, beta, stride, padding, dilation, split_k, sync, print_module, stream)

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/op/gemm.py ADDED Viewed

	@@ -0,0 +1,725 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+    Ease-of-use interface for constructing, compiling, and running GEMMs.
+    The ``Gemm`` interface is meant to allow one to easily instantiate, compile, and run
+    GEMM operations in CUTLASS via Python, without specifying many configuration parameters.
+    Under the hood, the interface will select sensible default parameters for the many template
+    parameters for CUTLASS GEMMs.
+    Note: optimal performance is not to be expected from this interface. To achieve optimal
+    performance, one should specify and tune each configuration parameter.
+    The simplest example of using this interface is the following:
+    .. highlight:: python
+    .. code-block:: python
+        # A, B, C, and D are torch/numpy/cupy tensor objects
+        plan = cutlass_cppgen.op.Gemm(A, B, C, D)
+        plan.run()
+    One can also use the interface by specifying data types of operands at construction
+    and using different tensor objects with these data types at runtime:
+    .. highlight:: python
+    .. code-block:: python
+        # The following is shorthand for:
+        #        cutlass_cppgen.op.Gemm(element_A=torch.float32, element_B=torch.float32,
+        #                        element_C=torch.float32, element_D=torch.float32,
+        #                        element_accumulator=torch.float32,
+        #                        layout=cutlass_cppgen.LayoutType.RowMajor)
+        plan = cutlass_cppgen.op.Gemm(element=torch.float32, layout=cutlass_cppgen.LayoutType.RowMajor)
+        A0 = torch.rand((128, 256), device='cuda')
+        B0 = torch.rand((256, 64), device='cuda')
+        C0 = torch.zeros((128, 64), device='cuda')
+        D0 = torch.zeros((128, 64), device='cuda')
+        plan.run(A0, B0, C0, D0)
+        A1 = torch.rand((32, 128), device='cuda')
+        B1 = torch.rand((128, 256), device='cuda')
+        C1 = torch.zeros((32, 256), device='cuda')
+        D1 = torch.zeros((32, 256), device='cuda')
+        plan.run(A1, B1, C1, D1)
+    The interface additionally enables one to decouple the compilation of the underlying CUTLASS
+    kernel from its execution:
+    .. highlight:: python
+    .. code-block:: python
+        plan = cutlass_cppgen.op.Gemm(element=np.float32, layout=cutlass_cppgen.LayoutType.RowMajor)
+        plan.compile()
+        # Do other work...
+        plan.run(A0, B0, C0, D0)
+        # Do other work...
+        plan.run(A1, B1, C1, D1)
+    Elementwise activation functions are easily fused to the GEMM via the interface:
+    .. highlight:: python
+    .. code-block:: python
+        plan = cutlass_cppgen.op.Gemm(element=np.float32, layout=cutlass_cppgen.LayoutType.RowMajor)
+        plan.activation = cutlass_cppgen.epilogue.relu
+    Operations can also be run asynchronously:
+    .. highlight:: python
+    .. code-block:: python
+        plan = cutlass_cppgen.op.Gemm(element=np.float32, layout=cutlass_cppgen.LayoutType.RowMajor)
+        args = plan.run()
+        # Do other work...
+        args.sync()
+"""
+from __future__ import annotations
+from typing import Optional
+from math import prod
+from cutlass_cppgen.utils.lazy_import import lazy_import
+cuda = lazy_import("cuda.cuda")
+from cutlass_library import (
+    DataType,
+    DataTypeSize,
+    GemmUniversalMode,
+    KernelScheduleSuffixes,
+)
+import cutlass_cppgen
+from cutlass_cppgen import epilogue, swizzle
+from cutlass_cppgen.backend import compiler
+from cutlass_cppgen.backend.evt import EpilogueFunctorVisitor
+from cutlass_cppgen.backend.gemm_operation import GemmArguments, GemmOperationUniversal
+from cutlass_cppgen.backend.library import TensorDescription, TileDescription
+from cutlass_cppgen.op.op import OperationBase
+from cutlass_cppgen.shape import GemmCoord
+from cutlass_cppgen.utils import check, datatypes
+class Gemm(OperationBase):
+    """
+    Constructs a ``Gemm`` object.
+    The data types and layouts of operands A, B, and C, along with the data type of output D
+    and that used for accumulation, are bound to the ``Gemm`` object throughout its lifetime --
+    these are not to be changed after a ``Gemm`` has been constructed.
+    The constructor has optional parameters for flexibly setting these parameters. The following
+    constructors are equivalent:
+    .. highlight:: python
+    .. code-block:: python
+        # Use F32 for A, B, C, D, and accumulation. All operands are row major.
+        # Use the generic ``element`` and ``layout`` parameters to concisely set all data types and layouts
+        # for operands to the same values.
+        Gemm(element=cutlass_cppgen.DataType.f32, layout=cutlass_cppgen.LayoutType.RowMajor)
+        # Explicitly specify the data types to use for A, B, C, and D. Use the generic ``layout``.
+        Gemm(element_A=cutlass_cppgen.DataType.f32, element_B=cutlass_cppgen.DataType.f32, element_C=cutlass_cppgen.DataType.f32,
+            element_D=cutlass_cppgen.DataType.f32, layout=cutlass_cppgen.LayoutType.RowMajor)
+        # Set the data types and elements from existing tensors. Note that one can use different tensors when
+        # executing GEMM via the ``run()`` method than passed in here (though those passed in to ``run()`` must
+        # have the same data type and layout as those passed in here).
+        # A, B, C, and D are row-major torch.Tensor objects of type torch.float32
+        Gemm(A=A, B=B, C=C, D=D)
+        # Use the generic ``element`` and explicitly specify the layouts to use for A, B, and C (layout of D is
+        # the same as that for C, at present)
+        Gemm(element=cutlass_cppgen.DataType.f32, layout_A=cutlass_cppgen.LayoutType.RowMajor,
+            layout_B=cutlass_cppgen.LayoutType.RowMajor, layout_C=cutlass_cppgen.LayoutType.RowMajor)
+        # Explicitly specify the data type and layout for only some of A, B, C, and D. Unspecified data types
+        # and layouts will inherit those passed in via the generic ``element`` and ``layout``
+        Gemm(element_A=cutlass_cppgen.DataType.f32, layout_B=cutlass_cppgen.LayoutType.RowMajor,
+            element=cutlass_cppgen.DataType.f32, layout=cutlass_cppgen.LayoutType.RowMajor)
+    The order of precedence for the setting of the data type and layout for a given operand/output is as follows:
+        1) If the tensor type is specified (e.g., ``A``), use the data type and layout inferred from this tensor
+        2) Otherwise, if the data type/layout (e.g., ``element_A``, ``layout_A``) is specified, use those
+        3) Otherwise, use the generic values (e.g., ``element``, ``layout``)
+    :param cc: compute capability of device for which kernels should be compiled. For example, if running on H100, this should be set to 90
+    :type cc: int
+    :param kernel_cc: compute capability of kernels to generate. For example, if running on SM90, but desiring to use a CUTLASS 2.x-style Ampere kernel, this should be set to 80
+    :type kernel_cc: int
+    :param A: tensor representing data type and layout of operand A
+    :param B: tensor representing data type and layout of operand B
+    :param C: tensor representing data type and layout of operand C
+    :param D: tensor representing data type and layout of operand D
+    :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
+    :param beta: scalar parameter beta from GEMM operation that scales operand C
+    :param element_accumulator: data type to be used in accumulation of the product of operands A and B
+    :type element_accumulator: cutlass_cppgen.DataType
+    :param element: generic data type to be used for operands A, B, C, D, as well as the accumulation data type
+    :type element: cutlass_cppgen.DataType
+    :param layout: generic layout type to be used for operands A, B, C, and D
+    :type layout: cutlass_cppgen.LayoutType
+    :param element_A: data type to be used for operand A
+    :type element_A: cutlass_cppgen.DataType
+    :param element_B: data type to be used for operand B
+    :type element_B: cutlass_cppgen.DataType
+    :param element_C: data type to be used for operand C
+    :type element_C: cutlass_cppgen.DataType
+    :param element_D: data type to be used for operand D
+    :type element_D: cutlass_cppgen.DataType
+    :param layout_A: layout of operand A
+    :type layout_A: cutlass_cppgen.LayoutType
+    :param layout_B: layout of operand B
+    :type layout_B: cutlass_cppgen.LayoutType
+    :param layout_C: layout of operand C
+    :type layout_C: cutlass_cppgen.LayoutType
+    :param layout_D: layout of operand D
+    :type layout_D: cutlass_cppgen.LayoutType
+    """
+    def __init__(
+        self, A=None, B=None, C=None, D=None,
+        alpha=1.0, beta=0.0, element_accumulator=None,
+        element=None, layout=None,
+        element_A=None, element_B=None, element_C=None, element_D=None,
+        layout_A=None, layout_B=None, layout_C=None,
+        cc: int = None, kernel_cc: int = None
+    ):
+        super().__init__(cc=cc, kernel_cc=kernel_cc)
+        self.name = "gemm"
+        self.compiled = False
+        elements = []
+        layouts = []
+        # Check that at least one of the following is set for each tensor (illustrated assuming tensor A):
+        # ``A``, ``element_A``, ``element`` and ``A``, ``layout_A``, ``layout``
+        for elt, lay, tens, name in zip([element_A, element_B, element_C, element_D],
+                                        [layout_A, layout_B, layout_C, layout_C],
+                                        [A, B, C, D],
+                                        ["A", "B", "C", "D"]):
+            if elt is not None and tens is not None:
+                raise Exception(f'Must not specify both element_{name} and tensor {name}')
+            if lay is not None and tens is not None:
+                raise Exception(f'Must not specify both layout_{name} and tensor {name}')
+            if elt is None and tens is None and element is None:
+                raise Exception(f'Must specify one of element_{name}, tensor {name}, or generic element.')
+            if lay is None and tens is None and layout is None:
+                raise Exception(f'Must specify one of layout_{name}, tensor {name}, or generic layout.')
+            elt_to_set = None
+            lay_to_set = None
+            if tens is not None:
+                elt_to_set, lay_to_set = datatypes.get_datatype_and_layout(tens)
+            else:
+                elt_to_set = elt if elt is not None else element
+                lay_to_set = lay if lay is not None else layout
+            elements.append(datatypes.library_type(elt_to_set))
+            layouts.append(lay_to_set)
+        self._element_a, self._element_b, self._element_c, self._element_d = elements
+        self._layout_a, self._layout_b, self._layout_c, self._layout_d = layouts
+        if element_accumulator is None:
+            self._element_accumulator = self._element_c
+        else:
+            self._element_accumulator = datatypes.library_type(element_accumulator)
+        self.A = A
+        self.B = B
+        self.C = C
+        self.D = D
+        self.alpha = alpha
+        self.beta = beta
+        self.epilogue_functor = None
+        self.op_class = None
+        self._tile_description = None
+        self._reset_operations()
+        self._swizzling_functor = cutlass_cppgen.swizzle.IdentitySwizzle1
+    def _reset_operations(self, reset_epilogue: bool = True):
+        # Set the default op class
+        datatype_comb = (self._element_a, self._element_b, self._element_accumulator)
+        layout_comb = (self._layout_a, self._layout_b)
+        self.possible_op_classes = self.options.supporting_opclasses(
+            self._element_a, self._element_b, self._element_accumulator,
+            self._layout_a, self._layout_b, self._math_operation)
+        if cutlass_cppgen.OpcodeClass.TensorOp in self.possible_op_classes:
+            self.opclass = cutlass_cppgen.OpcodeClass.TensorOp
+        elif cutlass_cppgen.OpcodeClass.Simt in self.possible_op_classes:
+            self.opclass = cutlass_cppgen.OpcodeClass.Simt
+        else:
+            if self._math_operation is not None:
+                math_op_str = f' and math operation {self._math_operation}'
+            else:
+                math_op_str = ''
+            raise Exception(f'No kernel configuration found for supported data type and layout '
+                            f'combination {datatype_comb}x{layout_comb}{math_op_str}')
+        if reset_epilogue:
+            self._reset_epilogue_functor_activation(cutlass_cppgen.epilogue.identity)
+    @property
+    def swizzling_functor(self):
+        """
+        Returns the type of the swizzling functor currently being used by the GEMM
+        :return: swizzing functor type
+        """
+        return self._swizzling_functor
+    @swizzling_functor.setter
+    def swizzling_functor(self, swizzling_functor):
+        """
+        Sets the swizzling functor to the type specified by `swizzling_functor`
+        """
+        if swizzling_functor == cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK:
+            if self.op_class == cutlass_cppgen.OpcodeClass.Simt:
+                raise Exception('ThreadblockSwizzleStreamK is currently only supported with opcode class TensorOp')
+            if self.current_cc in [90, 100, 101, 103]:
+                raise Exception('ThreadblockSwizzleStreamK is currently unsupported on SM90+')
+        self._swizzling_functor = swizzling_functor
+    #
+    # Tile description Related
+    #
+    @property
+    def tile_description(self) -> TileDescription:
+        """
+        Returns the tile description
+        """
+        return self._tile_description
+    @tile_description.setter
+    def tile_description(
+        self, td=None):
+        """
+        Set the tile description
+        :param td: tile description
+        :type td: cutlass_cppgen.backend.TileDescription, or a dict with keys
+                  {
+                      "threadblock_shape": [int, int, int],
+                      "warp_count": [int, int, int],
+                      "stages": int,
+                      "instruction_shape": [int, int, int] (optional),
+                      "cluster_shape": [int, int, int] (optional)
+                  }
+        """
+        if td is None:
+            return
+        if isinstance(td, dict):
+            if self._tile_description is None:
+                op = self.possible_operations.default_operation(self._math_operation)
+                self._tile_description = datatypes.td_from_profiler_op(op)
+            td = self._tile_description.clone_and_update(td)
+        valid, msg = self._valid_tile_description(td)
+        if valid:
+            self._tile_description = td
+        else:
+            raise Exception(msg)
+    def _valid_tile_description(self, td: TileDescription) -> tuple:
+        """
+        Checks whether the provided tile description is valid for the given compute capability. At present,
+        this checks the following:
+        - Does the tile description use a number of stages supported by the compute capability in question?
+        - Does the tile size requested fit within shared memory?
+        - Are cluster dimensions outside the valid range requested for a given architecture (e.g.,
+          more non-unit cluster dimensions for pre-SM90 architectures)?
+        - Is the kernel schedule being used supported on the architecture in question?
+        :param td: tile description to validate
+        :type td: cutlass_cppgen.backend.TileDescription
+        :return: tuple in which the first element is a bool indicating that the tile description is valid
+                 and the second element is a string providing an optional error message.
+        :rtype: tuple
+        """
+        valid, msg = check.valid_stage_count(self.cc, self.current_cc, td, self._element_c, self._element_d)
+        if not valid:
+            return (valid, msg)
+        valid, msg = check.valid_cluster_shape(self.current_cc, td.cluster_shape)
+        if not valid:
+            return (valid, msg)
+        valid, msg = check.valid_schedule(self.current_cc, td.kernel_schedule, td.epilogue_schedule, td.tile_scheduler)
+        if self.cc in [100, 101, 103] and td.kernel_schedule is not None and td.is_2sm and td.cluster_shape[0] % 2 != 0:
+            valid = False
+            msg = "Cluster shape must be divisible by 2 for 2SM kernels on SM100, SM101, and SM103"
+        return valid, msg
+    def tile_descriptions(self) -> list:
+        """
+        Returns a list of valid tile descriptions for the operations
+        :returns: list of valid tile descriptions for the operations
+        :rtype: list
+        """
+        tds = [datatypes.td_from_profiler_op(op) for op in self.possible_operations.all_operations]
+        if self._math_operation is not None:
+            tds = [td for td in tds if td.math_instruction.math_operation == self._math_operation]
+        return tds
+    def construct(
+        self, tile_description: TileDescription = None,
+        alignment_A: int = None, alignment_B: int = None, alignment_C: int = None) -> GemmOperationUniversal:
+        """
+        Constructs a ``cutlass_cppgen.backend.GemmUniversalOperation`` based on the input parameters and current
+        kernel specification of the ``Gemm`` object.
+        If ``tile_description`` is given, it is validated and stored on this object. Otherwise the
+        previously stored tile description is used, or, if there is none, the tile description of the
+        first operation matching the chosen alignments and math operation.
+        :param tile_description: tile description specifying shapes and operand types to use in the kernel
+        :type tile_description: cutlass_cppgen.backend.TileDescription
+        :param alignment_A: alignment of operand A
+        :type alignment_A: int
+        :param alignment_B: alignment of operand B
+        :type alignment_B: int
+        :param alignment_C: alignment of operand C
+        :type alignment_C: int
+        :return: operation that was constructed
+        :rtype: cutlass_cppgen.backend.GemmOperationUniversal
+        :raises Exception: if ``tile_description`` is given and fails ``_valid_tile_description``
+        """
+        # Preferred A/B alignment: the largest alignment among the possible operations, capped at
+        # 128 // DataTypeSize[element] (presumably a 128-bit access; assumes DataTypeSize is in bits -- confirm)
+        alignment_pref_A = min(128 // DataTypeSize[self._element_a], max(self.possible_operations.alignments("A")))
+        alignment_pref_B = min(128 // DataTypeSize[self._element_b], max(self.possible_operations.alignments("B")))
+        alignment_A = check.alignment_or_default(alignment_A, alignment_pref_A)
+        alignment_B = check.alignment_or_default(alignment_B, alignment_pref_B)
+        tensor_A = TensorDescription(self._element_a, self._layout_a, alignment_A)
+        tensor_B = TensorDescription(self._element_b, self._layout_b, alignment_B)
+        if alignment_C is None:
+            alignment_C = max(self.possible_operations.alignments("C"))
+            # The element-size cap is skipped for void C
+            if self._element_c != DataType.void:
+                alignment_C = min(128 // DataTypeSize[self._element_c], alignment_C)
+        if tile_description is None:
+            if self._tile_description is None:
+                # No tile description anywhere: take the first operation matching the alignments
+                op = self.possible_operations.operations(alignment_A, alignment_B, alignment_C, self._math_operation)[0]
+                tile_description = datatypes.td_from_profiler_op(op)
+                # The selected op may have lower alignment than that determined above, so we must
+                # reset alignment here.
+                alignment_C = op.C.alignment
+            else:
+                tile_description = self._tile_description
+        else:
+            valid, err_str = self._valid_tile_description(tile_description)
+            if not valid:
+                raise Exception(f"Invalid tile description. {err_str}")
+            self._tile_description = tile_description
+        # tensor_C and the epilogue functor are built last because alignment_C may have been reset above
+        tensor_C = TensorDescription(self._element_c, self._layout_c, alignment_C)
+        self.epilogue_functor = self._reset_epilogue_functor_alignment(alignment_C, self.epilogue_functor)
+        operation = GemmOperationUniversal(
+            arch=self.current_cc,
+            tile_description=tile_description,
+            A=tensor_A, B=tensor_B, C=tensor_C,
+            epilogue_functor=self.epilogue_functor,
+            swizzling_functor=self._swizzling_functor,
+        )
+        return operation
+    def compile(self, tile_description: TileDescription = None,
+                alignment_A: int = None, alignment_B: int = None, alignment_C: int = None,
+                print_module: bool = False) -> cutlass_cppgen.backend.GemmOperationUniversal:
+        """
+        Emits and compiles the kernel currently specified. If ``tile_description`` and any
+        of the ``alignment`` parameters are set, the kernel will be chosen using this
+        tile description and alignments. Otherwise, a default tile description and alignment
+        will be used.
+        :param tile_description: tile description specifying shapes and operand types to use in the kernel
+        :type tile_description: cutlass_cppgen.backend.TileDescription
+        :param alignment_A: alignment of operand A
+        :type alignment_A: int
+        :param alignment_B: alignment of operand B
+        :type alignment_B: int
+        :param alignment_C: alignment of operand C
+        :type alignment_C: int
+        :param print_module: whether to print the emitted C++ code
+        :type print_module: bool
+        :return: operation that was compiled
+        :rtype: cutlass_cppgen.backend.GemmOperationUniversal
+        """
+        self.operation = self.construct(tile_description, alignment_A, alignment_B, alignment_C)
+        if print_module:
+            print(self.operation.rt_module.emit())
+        compiler.add_module([self.operation,])
+        return self.operation
+    def _verify_rank(self, tensor):
+        """
+        Verifies that ``tensor`` has rank greater than 1
+        :param tensor: object representing a tensor passed in to verify, or ``None`` if no tensor was passed in
+        :type tensor: numpy/cupy/torch array/tensor object
+        """
+        if len(tensor.shape) < 2:
+            raise Exception(f"Tensors must be of rank greater than 1. Received tensor of shape: {tensor.shape}")
+    def _get_batch_count(self, A, B, C, D) -> int:
+        """
+        Returns the batch count specified by the tensors A, B, C, and D and verifies that these
+        tensors match in batch size. Presence of a batch dimension is detected by one of the
+        tensors being rank 3. If a batch dimension is present, it must be present in one of
+        operands A, B, or C (but need not be in all), and must be present in D.
+        :param A: tensor A
+        :type A: numpy/cupy/torch array/tensor object
+        :param B: tensor B
+        :type B: numpy/cupy/torch array/tensor object
+        :param C: tensor C
+        :type C: numpy/cupy/torch array/tensor object
+        :param D: tensor D
+        :type D: numpy/cupy/torch array/tensor object
+        :return: tuple of batch count dimensions
+        :rtype: tuple
+        """
+        A_batch = prod(A.shape[:-2]) if len(A.shape) > 2 else 1
+        B_batch = prod(B.shape[:-2]) if len(B.shape) > 2 else 1
+        if 1 not in [A_batch, B_batch]:
+            if A_batch != B_batch:
+                raise Exception(f"Get invalid batch counts: A={A_batch}, B={B_batch}")
+        return max(A_batch, B_batch)
+    def _get_batch_stride(self, tensor) -> int:
+        """
+        Returns the batch stride of ``tensor``. If ``tensor`` is only rank-2, batch stride is 0.
+        :param tensor: tensor object to process
+        :type tensor: numpy/cupy/torch array/tensor object
+        :return: stride between each matrix in the batch
+        :rtype: int
+        """
+        if tensor is not None and len(tensor.shape) > 2:
+            return tensor.shape[-2] * tensor.shape[-1]
+        else:
+            return 0
+    def _get_problem_args(self, A, B, C, D) -> tuple:
+        """
+        Returns the problem size and GEMM universal mode to use for the
+        given operands.
+        M and K are taken from the last two dimensions of ``A`` and N from the last dimension of ``B``.
+        When the batch count is greater than 1, the batch dimension is either folded into M or N
+        (the mode stays ``Gemm`` and the returned batch count is 1) or the mode becomes ``Batched``.
+        :param A: tensor A
+        :type A: numpy/cupy/torch array/tensor object
+        :param B: tensor B
+        :type B: numpy/cupy/torch array/tensor object
+        :param C: tensor C
+        :type C: numpy/cupy/torch array/tensor object
+        :param D: tensor D
+        :type D: numpy/cupy/torch array/tensor object
+        :return: tuple containing the problem size (cutlass_cppgen.shape.GemmCoord), the GEMM mode (cutlass_cppgen.GemmUniversalMode), and the batch count (int)
+        :rtype: tuple
+        """
+        M, K = A.shape[-2:]
+        N = B.shape[-1]
+        mode = GemmUniversalMode.Gemm
+        batch_count = self._get_batch_count(A, B, C, D)
+        returned_batch_count = batch_count
+        # If we are running a batched GEMM in which there is a nonzero batch stride
+        # only for A, then we can fold the batched dimension of A into the M dimension
+        # (i.e., (b, m, k) x (k, n) -> (m*b, k) x (k, n)). This works only if both A
+        # and C are row major. A similar operation can be performed if only B has a nonzero
+        # batch dimension
+        if batch_count > 1:
+            A_row = self._layout_a == cutlass_cppgen.LayoutType.RowMajor
+            B_row = self._layout_b == cutlass_cppgen.LayoutType.RowMajor
+            C_row = self._layout_c == cutlass_cppgen.LayoutType.RowMajor
+            # Consider a Tensor to be batched if its rank is > 2 and
+            # the product of the modes beyond rank 2 equals our pre-determined batch size.
+            # Note that ``None`` is also treated as batched by this predicate.
+            batched = lambda x : x is None or (len(x.shape) > 2 and prod(x.shape[:-2]) == batch_count)
+            # Only A is batched, with A and C row major: fold the batch into M
+            if batched(A) and not batched(B) and (C is None or batched(C)) and A_row and C_row:
+                M *= batch_count
+                returned_batch_count = 1
+            # Only B is batched, with neither B nor C row major: fold the batch into N
+            elif not batched(A) and batched(B) and (C is None or batched(C)) and not B_row and not C_row:
+                N *= batch_count
+                returned_batch_count = 1
+            else:
+                mode = GemmUniversalMode.Batched
+        return GemmCoord(M, N, K), mode, returned_batch_count
+    def _verify_type_and_layout(self, tensor, ref_type, ref_layout, name):
+        """
+        Verifies that ``tensor`` has data type ``ref_type`` and layout ``ref_layout``. An exception
+        is raised if it does not.
+        :param tensor: object representing a tensor passed in to verify, or ``None`` if no tensor was passed in
+        :type tensor: numpy/cupy/torch array/tensor object
+        :param ref_dtype: data type for the tensor that this object was initialized to
+        :param ref_layout: layout for the tensor that this object was initialized to
+        :param name: identifier of the tensor to verify. Used in raising exceptions
+        :type name: str
+        """
+        dtype, layout = datatypes.get_datatype_and_layout(tensor)
+        if dtype != ref_type or layout != ref_layout:
+            try:
+                # Attempt to transpose the tensor to fit the desired layout
+                tensor = tensor.transpose(-1, -2)
+            except:
+                raise Exception(f'Tensor {name} with type and layout ({dtype}, {layout}) '
+                                f'does not match the expected type and '
+                                f'layout of ({ref_type}, {ref_layout}) and transpose failed.')
+    def run(self, A=None, B=None, C=None, D=None,
+            alpha=None, beta=None, sync: bool = True, print_module: bool = False, visitor_args: dict = None,
+            stream: Optional[cuda.CUstream] = None) -> GemmArguments:
+        """
+        Runs the kernel currently specified. The kernel is emitted and compiled as part of this call,
+        using alignments derived from the shapes of the operands. Tensors holding operands and outputs
+        of the kernel are sourced either from the
+        ``A``, ``B``, ``C``, ``D``, ``alpha``, and ``beta``
+        parameters provided in this call, or from those
+        passed in on the construction of this object -- one of the two must be specified.
+        By default, this call returns only once the kernel has completed. To launch the kernel
+        and immediately return, set ``sync=False``. In this case, it is the responsibility of the
+        caller to synchronize the results of the kernel before attempting to access outputs
+        by calling ``sync()`` on the arguments returned from this call.
+        :param A: tensor representing data type and layout of operand A
+        :param B: tensor representing data type and layout of operand B
+        :param C: tensor representing data type and layout of operand C
+        :param D: tensor representing data type and layout of operand D
+        :param alpha: scalar parameter alpha from GEMM computation that scales the product of operands A and B
+        :param beta: scalar parameter beta from GEMM operation that scales operand C
+        :param sync: whether the call should wait for the kernel to complete before returning
+        :type sync: bool
+        :param print_module: whether to print the emitted C++ code
+        :type print_module: bool
+        :param visitor_args: arguments used to build the epilogue when the epilogue functor is an
+                             ``EpilogueFunctorVisitor``; otherwise ``alpha`` and ``beta`` are used
+        :type visitor_args: dict
+        :param stream: cuda stream, defaults to cuda.cuda.CUstream(0)
+        :type stream: :class:`cuda.cuda.CUstream`
+        :return: arguments passed in to the kernel
+        :rtype: cutlass_cppgen.backend.GemmArguments
+        """
+        if not stream:
+            stream = cuda.CUstream(0)
+        super().run_setup()
+        # Resolve each operand and scalar against the values bound at construction time
+        A = self._verify_tensor(A, self.A, self._element_a, self._layout_a, "A")
+        B = self._verify_tensor(B, self.B, self._element_b, self._layout_b, "B")
+        C = self._verify_tensor(C, self.C, self._element_c, self._layout_c, "C")
+        D = self._verify_tensor(D, self.D, self._element_d, self._layout_d, "D")
+        alpha = self._verify_scalar(alpha, self.alpha, self._element_c, "alpha")
+        beta = self._verify_scalar(beta, self.beta, self._element_c, "beta")
+        is_void_c = self._element_c == DataType.void
+        self._verify_rank(A)
+        self._verify_rank(B)
+        # The rank of C is not checked for void-C kernels
+        if not is_void_c:
+            self._verify_rank(C)
+        self._verify_rank(D)
+        alignment_a = self.possible_operations.find_alignment(A.shape, self._layout_a, operand="A")
+        alignment_b = self.possible_operations.find_alignment(B.shape, self._layout_b, operand="B")
+        # Set C alignment based on D.shape so as to correctly get an alignment with void-C
+        # kernels, for which `C` is None.
+        alignment_c = self.possible_operations.find_alignment(D.shape, self._layout_c, operand="C")
+        self.compile(self._tile_description, alignment_A=alignment_a, alignment_B=alignment_b,
+                     alignment_C=alignment_c, print_module=print_module)
+        problem_size, mode, batch_count = self._get_problem_args(A, B, C, D)
+        # Batch strides are only passed for a batched GEMM whose batch count is greater than 1
+        if mode == GemmUniversalMode.Gemm or batch_count == 1:
+            kwargs = {'split_k_slices': 1}
+        else:
+            kwargs = {
+                'batch': batch_count,
+                'batch_strides': {
+                    'A': self._get_batch_stride(A),
+                    'B': self._get_batch_stride(B),
+                    'C': self._get_batch_stride(C),
+                    'D': self._get_batch_stride(D)
+                }
+            }
+        kwargs['stream'] = stream
+        # Visitor epilogues are built from visitor_args; all others from alpha and beta
+        if isinstance(self.epilogue_functor, EpilogueFunctorVisitor):
+            output_op = self.operation.epilogue_type(visitor_args)
+        else:
+            output_op = self.operation.epilogue_type(alpha, beta)
+        arguments = GemmArguments(
+            operation=self.operation, problem_size=problem_size,
+            A=A, B=B, C=C, D=D,
+            output_op=output_op,
+            gemm_mode=mode,
+            **kwargs
+        )
+        self.operation.run(arguments)
+        if sync:
+            arguments.sync()
+        return arguments

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/op/gemm_grouped.py ADDED Viewed

	@@ -0,0 +1,269 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+    Ease-of-use interface for constructing, compiling, and running GEMMs.
+    The ``GroupedGemm`` interface is meant to allow one to easily instantiate, compile, and run
+    grouped GEMM operations in CUTLASS via Python, without specifying many configuration parameters.
+    Under the hood, the interface will select sensible default parameters for the many template
+    parameters for CUTLASS grouped GEMMs.
+    Note: optimal performance is not to be expected from this interface. To achieve optimal
+    performance, one should specify and tune each configuration parameter.
+    The simplest example of using this interface is the following:
+    .. highlight:: python
+    .. code-block:: python
+        # As, Bs, Cs, and Ds are torch/numpy/cupy tensor objects
+        plan = cutlass_cppgen.op.GroupedGemm(element=cutlass_cppgen.DataType.f16, layout=cutlass_cppgen.LayoutType.RowMajor)
+        plan.run([A0, A1], [B0, B1], [C0, C1], [D0, D1])
+"""
+from __future__ import annotations
+from typing import Optional
+from cutlass_library import DataTypeSize
+from cutlass_cppgen.utils.lazy_import import lazy_import
+cuda = lazy_import("cuda.cuda")
+from cutlass_cppgen.backend.gemm_operation import (
+    GemmGroupedArguments,
+    GemmOperationGrouped,
+)
+from cutlass_cppgen.backend.library import (
+    SchedulerMode,
+    TensorDescription,
+    TileDescription,
+)
+from cutlass_cppgen.op.gemm import Gemm
+from cutlass_cppgen.shape import GemmCoord
+from cutlass_cppgen.utils import check, datatypes
+class GroupedGemm(Gemm):
+    """
+    Constructs a ``GroupedGemm`` object.
+    The data types and layouts of operands A, B, and C, along with the data type of output D
+    and that used for accumulation, are bound to the ``GroupedGemm`` object throughout its lifetime --
+    these are not to be changed after a ``GroupedGemm`` has been constructed.
+    The constructor has optional parameters for flexibly setting these parameters. Please see the constructor
+    for ``Gemm`` for examples of these.
+    :param cc: compute capability of device to generate kernels for
+    :type cc: int
+    :param A: tensor representing data type and layout of operands A
+    :param B: tensor representing data type and layout of operands B
+    :param C: tensor representing data type and layout of operands C
+    :param D: tensor representing data type and layout of operands D
+    :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
+    :param beta: scalar parameter beta from GEMM operation that scales operand C
+    :param element_accumulator: data type to be used in accumulation of the product of operands A and B
+    :type element_accumulator: cutlass_cppgen.DataType
+    :param element: generic data type to be used for operands A, B, C, D, as well as the accumulation data type
+    :type element: cutlass_cppgen.DataType
+    :param layout: generic layout type to be used for operands A, B, C, and D
+    :type layout: cutlass_cppgen.LayoutType
+    :param element_A: data type to be used for operand A
+    :type element_A: cutlass_cppgen.DataType
+    :param element_B: data type to be used for operand B
+    :type element_B: cutlass_cppgen.DataType
+    :param element_C: data type to be used for operand C
+    :type element_C: cutlass_cppgen.DataType
+    :param element_D: data type to be used for operand D
+    :type element_D: cutlass_cppgen.DataType
+    :type layout_A: layout of operand A
+    :param layout_A: cutlass_cppgen.LayoutType
+    :type layout_B: layout of operand B
+    :param layout_B: cutlass_cppgen.LayoutType
+    :type layout_C: layout of operand C
+    :param layout_C: cutlass_cppgen.LayoutType
+    :type layout_D: layout of operand D
+    :param layout_D: cutlass_cppgen.LayoutType
+    """
+    def __init__(
+        self, A=None, B=None, C=None, D=None,
+        alpha=1.0, beta=0.0, element_accumulator=None,
+        element=None, layout=None,
+        element_A=None, element_B=None, element_C=None, element_D=None,
+        layout_A=None, layout_B=None, layout_C=None,
+        cc: int = None,
+    ):
+        super().__init__(
+            A=A, B=B, C=C, D=D,
+            alpha=alpha, beta=beta,
+            element_accumulator=element_accumulator,
+            element=element, layout=layout,
+            element_A=element_A, element_B=element_B,
+            element_C=element_C, element_D=element_D,
+            layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
+            cc=cc
+        )
+        # Grouped GEMM specializations for SM90 are currently unavailable. Revert to using SM80
+        if self.current_cc in [90, 100, 101, 103]:
+            self._reset_options(80)
+            self._reset_operations(reset_epilogue=False)
+        self.name = "grouped_gemm"
+    @Gemm.swizzling_functor.setter
+    def swizzling_functor(self, swizzling_functor):
+        """
+        Sets the swizzling functor to the type specified by `swizzling_functor`
+        """
+        raise Exception('Grouped GEMM does not currently support different swizzling functors')
+    def construct(self, tile_description: TileDescription = None,
+                  alignment_A: int = None,
+                  alignment_B: int = None,
+                  alignment_C: int = None) -> GemmOperationGrouped:
+        """
+        Constructs a ``cutlass_cppgen.backend.GemmOperationGrouped`` based on the input parameters and current
+        kernel specification of the ``Gemm`` object.
+        :param tile_description: tile description specifying shapes and operand types to use in the kernel
+        :type tile_description: cutlass_cppgen.backend.TileDescription
+        :param alignment_A: alignment of operand A
+        :type alignment_A: int
+        :param alignment_B: alignment of operand B
+        :type alignment_B: int
+        :param alignment_C: alignment of operand C
+        :type alignment_C: int
+        :return: operation that was constructed
+        :rtype: cutlass_cppgen.backend.GemmOperationGrouped
+        """
+        alignment_A = check.alignment_or_default(alignment_A, max(self.possible_operations.alignments("A")))
+        alignment_B = check.alignment_or_default(alignment_B, max(self.possible_operations.alignments("B")))
+        alignment_C = check.alignment_or_default(alignment_C, max(self.possible_operations.alignments("C")))
+        self.epilogue_functor = self._reset_epilogue_functor_alignment(alignment_C, self.epilogue_functor)
+        tensor_A = TensorDescription(self._element_a, self._layout_b, alignment_A)
+        tensor_B = TensorDescription(self._element_b, self._layout_b, alignment_B)
+        tensor_C = TensorDescription(self._element_c, self._layout_c, alignment_C)
+        if tile_description is None:
+            op = self.possible_operations.operations(alignment_A, alignment_B, alignment_C, self._math_operation)[0]
+            tile_description = datatypes.td_from_profiler_op(op)
+        else:
+            valid, err_str = self._valid_tile_description(tile_description)
+            if not valid:
+                raise Exception(f"Invalid tile description. {err_str}")
+            self.tile_description = tile_description
+        operation = GemmOperationGrouped(
+            arch=self.current_cc,
+            tile_description=tile_description,
+            A=tensor_A, B=tensor_B, C=tensor_C,
+            epilogue_functor=self.epilogue_functor,
+            swizzling_functor=self._swizzling_functor,
+            precompute_mode=SchedulerMode.Device)
+        return operation
+    def run(self, A, B, C, D,
+            alpha=None, beta=None, sync: bool = True,
+            print_module: bool = False,
+            stream: Optional[cuda.CUstream] = None) -> GemmGroupedArguments:
+        """
+        Runs the kernel currently specified.
+        By default, this call returns only once the kernel has completed. To launch the kernel
+        and immediately return, set ``sync=False``. In this case, it is the responsibility of the
+        caller to syncrhonize the results of the kernel before attempting to access outputs
+        by calling ``sync()`` on the arguments returned from this call.
+        :param A: list of tensors representing data type and layout of operand A
+        :type A: list
+        :param B: list of tensors representing data type and layout of operand B
+        :type B: list
+        :param C: list of tensors representing data type and layout of operand C
+        :type C: list
+        :param D: list of tensors representing data type and layout of operand D
+        :type D: list
+        :param alpha: scalar paramter alpha from GEMM computation that scales the product of operands A and B
+        :param beta: scalar parameter beta from GEMM operation that scales operand C
+        :param sync: whether the call should wait for the kernel to complete before returning
+        :type sync: bool
+        :param print_module: whether to print the emitted C++ code
+        :type print_module: bool
+        :param stream: cuda stream, defaults to cuda.cuda.CUstream(0)
+        :type stream: :class:`cuda.cuda.CUstream`
+        :return: arguments passed in to the kernel
+        :rtype: cutlass_cppgen.backend.GemmGroupedArguments
+        """
+        if not stream:
+            stream = cuda.CUstream(0)
+        super().run_setup()
+        if len(A) != len(B) or len(A) != len(C) or len(A) != len(D):
+            raise Exception("Lengths of A, B, C, and D lists must be equal")
+        problem_sizes = []
+        As, Bs, Cs, Ds = ([None] * len(A) for _ in range(4))
+        for i in range(len(A)):
+            As[i] = self._verify_tensor(A[i], self.A, self._element_a, self._layout_a, "A")
+            Bs[i] = self._verify_tensor(B[i], self.B, self._element_b, self._layout_b, "B")
+            Cs[i] = self._verify_tensor(C[i], self.C, self._element_c, self._layout_c, "C")
+            Ds[i] = self._verify_tensor(D[i], self.D, self._element_d, self._layout_d, "D")
+            problem_sizes.append(GemmCoord(A[i].shape[0], B[i].shape[1], A[i].shape[1]))
+        alpha = self._verify_scalar(alpha, self.alpha, self._element_c, "alpha")
+        beta = self._verify_scalar(beta, self.beta, self._element_c, "beta")
+        alignment_a = min((self.possible_operations.find_alignment(A.shape, self._layout_a, operand="A") for A in As))
+        alignment_b = min((self.possible_operations.find_alignment(B.shape, self._layout_b, operand="B") for B in Bs))
+        alignment_c = min((self.possible_operations.find_alignment(C.shape, self._layout_c, operand="C") for C in Cs))
+        self.compile(self.tile_description, alignment_A=alignment_a, alignment_B=alignment_b,
+                     alignment_C=alignment_c, print_module=print_module)
+        arguments = GemmGroupedArguments(
+            operation=self.operation,
+            problem_sizes=problem_sizes,
+            A=As, B=Bs, C=Cs, D=Ds,
+            output_op=self.operation.epilogue_type(alpha, beta),
+            stream=stream
+        )
+        self.operation.run(arguments)
+        if sync:
+            arguments.sync()
+        return arguments

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/op/op.py ADDED Viewed

	@@ -0,0 +1,431 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Base operation used for defining high-level CUTLASS operations (e.g., GEMM, Conv2d)
+"""
+from bisect import bisect_left
+from cutlass_library import (
+    DataType,
+    DataTypeSize,
+    MathOperation,
+    OperationKind,
+    SharedMemPerCC
+)
+import cutlass_cppgen
+from cutlass_cppgen import get_option_registry
+from cutlass_cppgen.backend.evt import EpilogueFunctorVisitor
+from cutlass_cppgen.backend.evt.passes.util import cc_map
+from cutlass_cppgen.backend.utils.device import device_cc
+from cutlass_cppgen.epilogue import get_activations, get_activation_epilogue, identity
+from cutlass_cppgen.library_defaults import KernelsForDataType, _generator_ccs
+from cutlass_cppgen.swizzle import get_swizzling_functors
+from cutlass_cppgen.utils import datatypes, check
+class OperationBase:
+    """
+    Base operation used for defining high-level CUTLASS operations (e.g., GEMM, Conv2d)
+    """
+    def __init__(self, cc: int = None, kernel_cc: int = None, operation_kind = OperationKind.Gemm):
+        """
+        :param cc: compute capability of device for which kernels should be compiled. For example, if running on H100, this should be set to 90
+        :type cc: int
+        :param kernel_cc: compute capability of kernels to generate. For example, if running on SM90, but desiring to use a CUTLASS 2.x-style Ampere kernel, this should be set to 80
+        :type kernel_cc: int
+        :param operation_kind: class of operation that will be performed (e.g., GEMM, Conv)
+        :type operation_kind: cutlass_library.OperationKind
+        """
+        self.operation_kind = operation_kind
+        self.cc = cc if cc is not None else device_cc()
+        self.specified_kernel_cc = kernel_cc is not None
+        self.current_cc = kernel_cc if kernel_cc is not None else self._find_closest_cc(self.cc)
+        self.tile_description = None
+        self._math_operation = None
+        self.options = get_option_registry().options_for_cc(self.current_cc, operation_kind)
+        if self.options is None:
+            raise Exception(f"Invalid or unsupported compute capability: {self.current_cc}")
+        # Default activation function: identity
+        self._activation = identity
+    def _find_closest_cc(self, cc: int) -> int:
+        """
+        Returns the closest CC in _generator_ccs less than or equal to `cc`
+        :param cc: compute capability to query
+        :type cc: int
+        :returns: closest CC in _generator_ccs less than or equal to `cc`
+        :rtype: int
+        """
+        if cc in _generator_ccs:
+            return cc
+        # Find closest CC lower than this CC
+        idx = bisect_left(_generator_ccs, cc)
+        if idx == 0:
+            raise Exception(f'No valid CC to fall back to for {cc}')
+        return _generator_ccs[idx-1]
+    def activations(self) -> list:
+        """
+        Returns possible activation functions that can be used
+        :return: list of activation functions that can be used
+        :rtype: list
+        """
+        return get_activations()
+    def swizzling_functors(self) -> list:
+        """
+        Returns possible swizzling functions that can be used
+        :return: list of swizzling functions that can be used
+        :rtype: list
+        """
+        return get_swizzling_functors()
+    def _reset_options(self, cc: int):
+        """
+        Resets the kernel options based on cc
+        :param cc: compute capability to reset to
+        :type cc: int
+        """
+        if cc != self.current_cc:
+            if cc not in _generator_ccs:
+                raise Exception(f'Invalid CC for CUTLASS kernels: {cc}.')
+            self.current_cc = cc
+            self.options = get_option_registry().options_for_cc(self.current_cc, self.operation_kind)
+    def _verify_scalar(self, scalar, ref_scalar, ref_dtype, name):
+        """
+        Verifies the following properties:
+            1) Either ``scalar`` or ``ref_scakar`` must be set (i.e., not ``None``)
+            2) If ``scalar`` is not ``None``, its datatype must match matches the current version
+               set by the plan (i.e., those in ``ref_dtype``)
+        If either of these properties does not hold, an exception is raised. If these properties hold and
+        ``scalar`` is not ``None``, ``scalar`` is returned. Otherwise, ``ref_scalar`` is returned.
+        :param scalar: object representing a tensor passed in to verify, or ``None`` if no tensor was passed in
+        :type scalar: numpy/cupy/torch scalar
+        :param ref_scalar: object representing a tensor passed in on construction of this object, or ``None`` if no tensor was passed in
+        :type ref_scalar: numpy/cupy/torch scalar
+        :param ref_dtype: data type for the scalar that this object was initialized to
+        :param name: identifier of the scalar to verify. Used in raising exceptions
+        :type name: str
+        :return: valid scalar to use
+        :rtype: numpy/cupy/torch scalar
+        """
+        if scalar is None:
+            if ref_scalar is None:
+                raise Exception(f"Scalar {name} must be set.")
+            return ref_scalar
+        if hasattr(scalar, "dtype"):
+            dtype = datatypes.library_type(scalar.dtype)
+            if dtype != ref_dtype:
+                raise Exception(
+                    f"Tensor {name} with type {dtype} does not match expected type {ref_dtype}."
+                )
+        return scalar
+    def _verify_tensor(self, tensor, ref_tensor, ref_dtype, ref_layout, name):
+        """
+        Verifies the following properties:
+            If ref_dtype is not void:
+                1) Either ``tensor`` or ``ref_tensor`` must be set (i.e., not ``None``)
+                2) If ``tensor`` is not ``None``, its datatype and layout must match matches the current versions
+                set by the plan (i.e., those in ``ref_dtype`` and ``ref_layout``)
+            If ref_dtype is void:
+                Neither ``tensor`` nor ``ref_tensor`` are set
+        If either of these properties does not hold, an exception is raised. If these properties hold and
+        ``tensor`` is not ``None``, ``tensor`` is returned. Otherwise, ``ref_tensor`` is returned.
+        :param tensor: object representing a tensor passed in to verify, or ``None`` if no tensor was passed in
+        :type tensor: numpy/cupy/torch array/tensor object
+        :param ref_tensor: object representing a tensor passed in on construction of this object, or ``None`` if no tensor was passed in
+        :type ref_tensor: numpy/cupy/torch array/tensor object
+        :param ref_dtype: data type for the tensor that this object was initialized to
+        :param ref_layout: layout for the tensor that this object was initialized to
+        :param name: identifier of the tensor to verify. Used in raising exceptions
+        :type name: str
+        :return: valid tensor object to use
+        :rtype: numpy/cupy/torch array/tensor object
+        """
+        if ref_dtype == DataType.void:
+            if tensor is not None or ref_tensor is not None:
+                raise Exception("Operands with element DataType.void must not be provided a tensor")
+            return None
+        if tensor is None:
+            if ref_tensor is None:
+                raise Exception(f"Tensor {name} must be set.")
+            return ref_tensor
+        self._verify_type_and_layout(tensor, ref_dtype, ref_layout, name)
+        return tensor
+    @property
+    def opclass(self) -> cutlass_cppgen.OpcodeClass:
+        """
+        Returns the opcode class currently in use
+        :return: opcode class currently in use
+        :rtype: cutlass_cppgen.OpcodeClass
+        """
+        return self.op_class
+    @opclass.setter
+    def opclass(self, oc: cutlass_cppgen.OpcodeClass):
+        if isinstance(oc, str):
+            oc = datatypes.getattr_enum(cutlass_cppgen.OpcodeClass, oc)
+        if oc in self.possible_op_classes:
+            self.op_class = oc
+        else:
+            raise Exception(
+                f'Unsupported operation class {oc} for CC {self.cc} and data type combination '
+                f'({self._element_a}, {self._element_b}, {self._element_accumulator}) and '
+                f'layout combination ({self._layout_a}, {self._layout_b}).')
+        # Changing the op class also changes the possible operations available. Reset these.
+        self.possible_operations = self.options.operations(
+            self.op_class, self._element_a, self._element_b,
+            self._element_accumulator, self._layout_a, self._layout_b, self._math_operation)
+        # Changing the op class changes the elements per access in the epilogue. Reset this.
+        if self.epilogue_functor is not None:
+            self.epilogue_functor = self._reset_epilogue_functor_alignment(self._elements_per_access(), self.epilogue_functor)
+    @property
+    def math_operation(self) -> cutlass_cppgen.MathOperation:
+        """
+        Returns the math operation currently in use
+        :return: math operation currently in use
+        :rtype: cutlass_cppgen.MathOperation
+        """
+        return self._math_operation
+    @math_operation.setter
+    def math_operation(self, mo: cutlass_cppgen.MathOperation):
+        if isinstance(mo, str):
+            mo = datatypes.getattr_enum(cutlass_cppgen.MathOperation, mo)
+        if not self.specified_kernel_cc:
+            if self.current_cc in [90, 100, 101, 103]:
+                # CUTLASS 3.0 kernels do not use different math operations. If one is specified, we
+                # revert to using a CUTLASS 2.x kernel by using SM80-tagged kernels.
+                cutlass_cppgen.logger.warning("Reverting to using SM80-tagged kernel. Opclass may change.")
+                self._reset_options(80)
+                self._reset_operations(reset_epilogue=False)
+        elif self.current_cc in [90, 100, 101, 103]:
+            raise Exception("CUTLASS 3.0 kernels do not use different math operations. "
+                "To use 2.x kernels with a specific math operation, do not set the `kernel_cc`"
+                "parameter when constructing the plan.")
+        self._math_operation = mo
+        self._reset_operations()
+    def _elements_per_access(self):
+        if self.op_class == cutlass_cppgen.OpcodeClass.Simt:
+            return 1
+        elif self._element_c != DataType.void:
+            return 128 // DataTypeSize[self._element_c]
+        else:
+            return 128 // max(self.possible_operations.alignments("C"))
+    def _create_epilogue_functor_activation(self, activation):
+        """
+        Returns the epilogue functor with given activation function
+        """
+        if self.epilogue_functor is None:
+            elements_per_access = self._elements_per_access()
+        else:
+            elements_per_access = self.epilogue_functor.epilogue_vector_length
+        if not self.specified_kernel_cc:
+            if self.current_cc in [90, 100, 101, 103] and activation != identity:
+                # CUTLASS 3.0 kernels in Python currently only support identity activation. If one requests a non-identity activation,
+                # revert to using a CUTLASS 2.x kernel by using SM80-tagged kernels.
+                cutlass_cppgen.logger.warning("Reverting to using SM80-tagged kernel. Opclass may change.")
+                if self._element_c != self._element_d:
+                    raise Exception("CUTLASS 2.x kernels require element C to be the same as element D")
+                self._reset_options(80)
+                self._reset_operations(reset_epilogue=False)
+            elif (self.cc in [90, 100, 101, 103] and self.current_cc not in [90, 100, 101, 103] and activation == identity and self._math_operation is None):
+                # SM80 fallback kernels are currently used. Since an identity activation is requested,
+                # we can switch back to using SM90 kernels.
+                self._reset_options(self.cc)
+                self._reset_operations(reset_epilogue=False)
+        else:
+            if self.current_cc in [90, 100, 101, 103] and activation != identity:
+                raise Exception("Epilogues with elementwise fusion are not currently supported "
+                                "in the Python interface for 3.x kernels. To use 2.x kernels "
+                                "with fused elementwise epilogues, do not set the `kernel_cc` "
+                                "parameter when constructing the plan.")
+        return get_activation_epilogue(
+            activation,
+            self._element_d,
+            elements_per_access,
+            self._element_accumulator,
+            self._element_accumulator,
+        )
+    def _reset_epilogue_functor_activation(self, activation):
+        """
+        Set the epilogue functor based on the provided activation function
+        """
+        self.epilogue_functor = self._create_epilogue_functor_activation(activation)
+    def _reset_epilogue_functor_alignment(self, alignment, epilogue_functor):
+        """
+        Reset the alignment of the current epilogue functor based on alignment C
+        """
+        if isinstance(epilogue_functor, EpilogueFunctorVisitor):
+            return epilogue_functor
+        if epilogue_functor is None or not hasattr(epilogue_functor, 'activation_functor'):
+            # Identity epilogue does not have 'activation_functor'
+            activation = identity
+        else:
+            activation = epilogue_functor.activation_functor
+        epilogue_functor = get_activation_epilogue(
+            activation,
+            self._element_d,
+            alignment,
+            self._element_accumulator,
+            self._element_accumulator,
+        )
+        return epilogue_functor
+    @property
+    def activation(self):
+        """
+        Returns the type of the current activation function used
+        """
+        if hasattr(self.epilogue_functor, "activation_functor"):
+            return self.epilogue_functor.activation_functor
+        else:
+            return identity
+    @activation.setter
+    def activation(self, act):
+        """
+        Sets the type of the activation function to use
+        Activation can come with a set of arguments
+        :param act: type of activation function to use
+        :type act: str or tuple. e.g. "relu", ("leaky_relu", 0.01)
+        """
+        if isinstance(act, tuple):
+            if isinstance(act[0], str):
+                act_fn = getattr(cutlass_cppgen.backend.epilogue, act[0])
+            else:
+                act_fn = act[0]
+            self._reset_epilogue_functor_activation(act_fn)
+            self._activation_args = act[1]
+            self._activation = act[0]
+        else:
+            if isinstance(act, str):
+                act = getattr(cutlass_cppgen.backend.epilogue, act)
+            self._reset_epilogue_functor_activation(act)
+            self._activation = act
+    @property
+    def epilogue_visitor(self):
+        """
+        Return the epilogue functor
+        """
+        return self.epilogue_functor
+    @epilogue_visitor.setter
+    def epilogue_visitor(self, visitor):
+        """
+        Create the epilogue visitor and, for CCs 90/100/101/103, re-filter the possible operations.
+        ``self.epilogue_functor`` is replaced by an ``EpilogueFunctorVisitor`` built from ``visitor``.
+        For the CCs listed above, ``self.possible_operations`` is then rebuilt to contain only the
+        operations that still allow at least two mainloop stages once the epilogue's shared memory
+        has been subtracted from the capacity.
+        :param visitor: passed through to ``EpilogueFunctorVisitor`` along with ``cc_map[self.cc]``
+        :raises RuntimeError: if no operation survives the filtering. Note that
+            ``self.epilogue_functor`` has already been replaced at that point.
+        """
+        self.epilogue_functor = EpilogueFunctorVisitor(cc_map[self.cc], visitor)
+        # The epilogue_functor may consume too much shared memory
+        # Reset the possible operations
+        if self.cc not in [90, 100, 101, 103]:
+            # The shared memory is only a concern for sm90+ epilogue
+            # In sm80, the epilogue and mainloop share the shared memory
+            return
+        # Start an empty container with the same data type / layout combination
+        datatype_comb = self.possible_operations.datatype_comb
+        layout_comb = self.possible_operations.layout_comb
+        new_possible_operations = KernelsForDataType(datatype_comb, layout_comb)
+        for operation in self.possible_operations.all_operations:
+            td = datatypes.td_from_profiler_op(operation)
+            # Filter invalid epilogue schedules
+            if cc_map[self.cc] == 90 and td.epilogue_schedule not in [
+                cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecialized,
+                cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative]:
+                continue
+            epilogue_smem_bytes = self.epilogue_functor.get_smem_size(td)
+            # Verify the maximum number of mainloop stages
+            # NOTE(review): OperationKind.Gemm is passed here regardless of self.operation_kind - confirm intended
+            mainloop_smem_per_stage = check.calculate_smem_usage_per_stage(td, OperationKind.Gemm)
+            # `<< 10` multiplies by 1024; presumably SharedMemPerCC is in KiB - TODO confirm
+            smem_capacity_bytes = SharedMemPerCC[self.cc] << 10
+            # Stages that fit in whatever the epilogue leaves over
+            mainloop_stages = (smem_capacity_bytes - epilogue_smem_bytes) // mainloop_smem_per_stage
+            if mainloop_stages < 2:
+                # Mainloop stages must >= 2
+                continue
+            new_possible_operations.add(operation)
+        if len(new_possible_operations.all_operations) == 0:
+            raise RuntimeError(
+                "The epilogue consumes too much shared memory. "
+                "No valid tile description is found in the generator.")
+        # Only commit the filtered set once it is known to be non-empty
+        self.possible_operations = new_possible_operations
+    def run_setup(self):
+        """
+        Steps that must be taken before caling `plan.run()`
+        """
+        # Initialize the memory pool if, if not already done
+        cutlass_cppgen.get_memory_pool()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/shape.py ADDED Viewed

	@@ -0,0 +1,184 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for expressing shapes
+"""
+from cutlass_library import (
+    ConvMode,
+    ConvKind,
+    LayoutType
+)
+from cutlass_cppgen.backend.c_types import (
+    Conv2DProblemSize_,
+    GemmCoord_,
+    GemmCoordBatched_
+)
+class MatrixCoord:
+    def __init__(self, row, col):
+        self._row = row
+        self._col = col
+    @property
+    def row(self):
+        return self._row
+    @property
+    def column(self):
+        return self._col
+    def leading_dimension(self, layout: LayoutType) -> int:
+        """
+        Returns the leading dimension for a matrix with layout ``layout`` and shape provided by the MatrixCoord.
+        :param layout: layout of matrix
+        :type layout: cutlass_library.LayoutType
+        :returns: leading dimension
+        :rtype: int
+        """
+        if layout == LayoutType.RowMajor:
+            return self._col
+        elif layout == LayoutType.ColumnMajor:
+            return self._row
+        else:
+            raise Exception(f'Unsupported layout for leading dimension calculation: {layout}')
+class GemmCoord:
+    def __init__(self, m: int, n: int, k: int):
+        self._m = m
+        self._n = n
+        self._k = k
+    @property
+    def m(self) -> int:
+        return self._m
+    @property
+    def n(self) -> int:
+        return self._n
+    @property
+    def k(self) -> int:
+        return self._k
+    @property
+    def mk(self) -> MatrixCoord:
+        return MatrixCoord(self._m, self._k)
+    @property
+    def mn(self) -> MatrixCoord:
+        return MatrixCoord(self._m, self._n)
+    @property
+    def kn(self) -> MatrixCoord:
+        return MatrixCoord(self._k, self._n)
+    @property
+    def ctype(self) -> GemmCoord_:
+        return GemmCoord_(self._m, self._n, self._k)
+    def batched_ctype(self, batch_count: int) -> GemmCoordBatched_:
+        return GemmCoordBatched_(self._m, self._n, self._k, batch_count)
+class Conv2DProblemSize:
+    def __init__(
+        self, n: int, h: int, w: int, c: int,
+        k: int, r: int, s: int, c_: int,
+        pad_h: int, pad_w: int, stride_h: int, stride_w: int,
+        dilation_h: int, dilation_w: int, mode: ConvMode=ConvMode.CrossCorrelation,
+        split_k_slices: int=1, groups: int=1):
+        self.N = n
+        self.H = h
+        self.W = w
+        self.C = c
+        self.K = k
+        self.R = r
+        self.S = s
+        self.pad_h = pad_h
+        self.pad_w = pad_w
+        self.stride_h = stride_h
+        self.stride_w = stride_w
+        self.dilation_h = dilation_h
+        self.dilation_w = dilation_w
+        self.mode = int(mode)
+        self.split_k_slices = split_k_slices
+        self.groups = groups
+        self.P = ((h + pad_h * 2 - r * dilation_h) // stride_h) + 1
+        self.Q = ((w + pad_w * 2 - s * dilation_w) // stride_w) + 1
+    @property
+    def ctype(self) -> Conv2DProblemSize_:
+        return Conv2DProblemSize_(self)
+    def implicit_gemm_size(self, kind: ConvKind):
+        if kind == ConvKind.Fprop:
+            return GemmCoord(
+                self.N * self.P * self.Q,
+                self.K,
+                self.R * self.S * self.C // self.groups
+            )
+        elif kind == ConvKind.Dgrad:
+            return GemmCoord(
+                self.N * self.H * self.W,
+                self.C,
+                self.R * self.S * self.K
+            )
+        elif kind == ConvKind.Wgrad:
+            return GemmCoord(
+                self.K,
+                self.R * self.S * self.C,
+                self.N * self.P * self.Q
+            )
+    @staticmethod
+    def from_sizes(input_size, weight_size):
+        K, R, S, _ = weight_size
+        pad_h = R // 2
+        pad_w = S // 2
+        stride_h = 1
+        stride_w = 1
+        dilation_h = 1
+        dilation_w = 1
+        return Conv2DProblemSize(
+            *input_size,
+            *weight_size,
+            pad_h, pad_w,
+            stride_h, stride_w,
+            dilation_h, dilation_w
+        )

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/swizzle.py ADDED Viewed

	@@ -0,0 +1,65 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Registry of swizzling functions
+"""
+from cutlass_library import SwizzlingFunctor
+# Module-level aliases for members of cutlass_library.SwizzlingFunctor
+IdentitySwizzle1 = SwizzlingFunctor.Identity1
+IdentitySwizzle2 = SwizzlingFunctor.Identity2
+IdentitySwizzle4 = SwizzlingFunctor.Identity4
+IdentitySwizzle8 = SwizzlingFunctor.Identity8
+HorizontalSwizzle = SwizzlingFunctor.Horizontal
+ThreadblockSwizzleStreamK = SwizzlingFunctor.StreamK
+StridedDgradIdentitySwizzle1 = SwizzlingFunctor.StridedDgradIdentity1
+StridedDgradIdentitySwizzle4 = SwizzlingFunctor.StridedDgradIdentity4
+StridedDgradHorizontalSwizzle = SwizzlingFunctor.StridedDgradHorizontal
+# Registry of every alias defined above, in declaration order
+_swizzling_functors = [
+    IdentitySwizzle1,
+    IdentitySwizzle2,
+    IdentitySwizzle4,
+    IdentitySwizzle8,
+    HorizontalSwizzle,
+    ThreadblockSwizzleStreamK,
+    StridedDgradIdentitySwizzle1,
+    StridedDgradIdentitySwizzle4,
+    StridedDgradHorizontalSwizzle,
+]
+def get_swizzling_functors():
+    return _swizzling_functors

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,41 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+from cutlass_cppgen.utils.check import (
+    alignment_or_default,
+    calculate_smem_usage,
+    calculate_smem_usage_per_stage,
+    valid_cluster_shape,
+    valid_schedule,
+    valid_stage_count,
+    update_alignment,
+)

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/check.py ADDED Viewed

	@@ -0,0 +1,262 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utility functions for checking constraints on kernels and calculating kernel attributes
+"""
+import ctypes
+from cutlass_library import DataTypeSize, KernelScheduleSuffixes, OperationKind, SharedMemPerCC
+import cutlass_cppgen
+from cutlass_cppgen.backend.library import TileDescription
+def calculate_smem_usage_per_stage(td: TileDescription, operation_kind: OperationKind) -> int:
+    """
+    Returns the amount of shared memory in bytes consumed in a single stage of a kernel.
+    Only GEMM operations are supported. The result is the size of the A tile (m x k) plus the
+    size of the B tile (k x n) plus a fixed 32 bytes reserved for the per-stage barrier.
+    :param td: tile description to compute shared memory of
+    :type td: TileDescription
+    :param operation_kind: identifier for the type of operation being performed
+    :type operation_kind: cutlass_library.OperationKind
+    :return: number of bytes of shared memory consumed by a single stage
+    :rtype: int
+    :raises Exception: if ``operation_kind`` is not ``OperationKind.Gemm``
+    """
+    if operation_kind != OperationKind.Gemm:
+        # Format the argument that was actually passed in. Referring to any other name here
+        # would turn this diagnostic into a NameError.
+        raise Exception(f"No available shared memory calculation for operation kind {operation_kind}")
+    m, n, k = td.blackwell_threadblock_shape
+    if td.is_2sm:
+        # M is halved for 2-SM tiles (presumably each SM holds half of the tile -- TODO confirm)
+        m //= 2
+    stage_barrier_bytes = 32
+    # DataTypeSize entries are divided by 8 to obtain bytes (i.e. they are treated as bit widths)
+    a_tile_bytes = DataTypeSize[td.math_instruction.element_a] * m * k // 8
+    b_tile_bytes = DataTypeSize[td.math_instruction.element_b] * k * n // 8
+    return a_tile_bytes + b_tile_bytes + stage_barrier_bytes
+def calculate_smem_usage(operation) -> int:
+    """
+    Returns the amount of shared memory in bytes consumed by all stages of a kernel.
+    :param operation: operation exposing ``tile_description`` and ``operation_kind``
+    :return: per-stage shared memory usage multiplied by the number of stages
+    :rtype: int
+    """
+    tile_description = operation.tile_description
+    bytes_per_stage = calculate_smem_usage_per_stage(tile_description, operation.operation_kind)
+    return bytes_per_stage * operation.tile_description.stages
+def valid_stage_count(
+    cc: int,
+    kernel_cc: int,
+    td: TileDescription,
+    element_C: cutlass_cppgen.DataType = None,
+    element_D: cutlass_cppgen.DataType = None,
+    verbose: bool = True) -> tuple:
+    """
+    Checks whether a device with `cc` supports the number of stages within `tile_description`, both
+    based on raw limits on the number of stages and based on shared memory capacity
+    :param cc: compute capability of device in question
+    :type cc: int
+    :param kernel_cc: compute capability that the kernel targets (corresponding to the arch::SMxy tag in CUTLASS)
+    :type kernel_cc: int
+    :param td: tile description to check
+    :type td: TileDescription
+    :param element_C: data type of operand C (not used by the checks below)
+    :type element_C: cutlass_cppgen.DataType
+    :param element_D: data type of operand D (not used by the checks below)
+    :type element_D: cutlass_cppgen.DataType
+    :param verbose: whether to log warnings
+    :type verbose: bool
+    :return: tuple with the first element indicating whether the provided tile description is
+             valid for the provided device and the second element being an error message
+    :rtype: tuple
+    """
+    if kernel_cc in [90, 100, 101, 103]:
+        if (td.stages is None or td.stages == 0):
+            # Stage count of None or 0 for SM90 indicates that the CollectiveBuilder automatically
+            # determines the stage count to use. Thus, all settings are valid in these scenarios.
+            return (True, "")
+        elif verbose:
+            cutlass_cppgen.logger.warning(
+                "Setting an explicit stage count for SM90 kernels currently may "
+                "result in compilation errors if the combination of tile shape, "
+                "stage count, and shared memory requirement of the epilogue exceeds "
+                "the available shared memory per SM.")
+    # A stage count of None is only accepted for the kernel targets handled above. Reject it
+    # explicitly instead of letting the integer comparison raise a TypeError.
+    if td.stages is None or td.stages <= 0:
+        return (False, f"Stage counts must be positive integers. Tile description has stage count of {td.stages}.")
+    if cc < 80 and td.stages != 2:
+        return (False, f"Tile description has stage count of {td.stages}, "
+                       f"but only 2 stages are supported on SM{cc}.")
+    # The calculation below does not consider shared memory used by the epilogue and, thus,
+    # only catches cases in which the mainloop exceeds the device's shared memory capacity.
+    # This is not a concern for CUTLASS 2.x kernels, for which the shared memory of the
+    # mainloop and epilogue is shared.
+    smem_per_stage = calculate_smem_usage_per_stage(td, OperationKind.Gemm)
+    smem_usage_mainloop = (smem_per_stage * td.stages)
+    # The table entry is shifted left by 10 (multiplied by 1024) before being compared with
+    # the byte count computed above.
+    smem_arch = SharedMemPerCC[cc] << 10
+    if smem_usage_mainloop > smem_arch:
+        return ( False,
+            "Configuration uses too much shared memory. Consider reducing stage count or tile shape.\n"
+            f"Details:\n"
+            f"Mainloop uses {smem_per_stage} bytes of shared memory per stage, and "
+            f"{td.stages} stages for a total of {smem_usage_mainloop} bytes.\n"
+            f"The maxmium amount of shared memory that can be used per block on CC {cc} is {smem_arch}.")
+    return (True, "")
+def valid_cluster_shape(cc: int, cluster_shape: list) -> tuple:
+    """
+    Checks whether a device with `cc` supports a thread block cluster of shape `cluster_shape`.
+    Devices with ``cc`` below 90, as well as 120 and 121, only accept ``[1, 1, 1]``. All other
+    devices require a rank-3 shape whose third dimension is 1.
+    :param cc: compute capability of device in question
+    :type cc: int
+    :param cluster_shape: dimensions of thread block cluster shape to check
+    :type cluster_shape: list
+    :return: tuple with the first element indicating whether the provided cluster shape is
+             valid for the provided device and the second element being an error message
+    :rtype: tuple
+    """
+    if cc < 90 or cc in [120, 121]:
+        if cluster_shape != [1, 1, 1]:
+            return (False,
+                    f"Cluster shape for pre-SM90 architectures and SM 120 and 121 must be [1, 1, 1]. Received cluster shape of "
+                    f"{cluster_shape} for SM{cc}.")
+        return (True, "")
+    if len(cluster_shape) != 3:
+        # The parenthesis opened before "rank" is closed at the end of the message
+        return (False,
+                f"Cluster shapes must be rank-3. Received {cluster_shape} (rank {len(cluster_shape)})")
+    if cluster_shape[2] != 1:
+        return (False,
+                "CUTLASS kernels currently require the third dimension of cluster shape to be 1. "
+                f"Received cluster shape of {cluster_shape}.")
+    return (True, "")
+def valid_schedule(
+    cc: int,
+    kernel_schedule: cutlass_cppgen.KernelScheduleType,
+    epilogue_schedule: cutlass_cppgen.EpilogueScheduleType,
+    tile_scheduler: cutlass_cppgen.TileSchedulerType) -> tuple:
+    """
+    Validates the combination of kernel schedule, epilogue schedule, and tile scheduler for
+    a device of compute capability ``cc``.
+    :param cc: compute capability of device in question
+    :type cc: int
+    :param kernel_schedule: kernel schedule type
+    :type kernel_schedule: cutlass_cppgen.KernelScheduleType
+    :param epilogue_schedule: epilogue schedule type
+    :type epilogue_schedule: cutlass_cppgen.EpilogueScheduleType
+    :param tile_scheduler: tile scheduler type
+    :type tile_scheduler: cutlass_cppgen.TileSchedulerType
+    :return: ``(True, "")`` when the combination is accepted, otherwise ``(False, reason)``
+    :rtype: tuple
+    """
+    uses_auto_kernel = kernel_schedule == cutlass_cppgen.KernelScheduleType.ScheduleAuto
+    uses_auto_epilogue = epilogue_schedule == cutlass_cppgen.EpilogueScheduleType.ScheduleAuto
+    uses_default_scheduler = tile_scheduler == cutlass_cppgen.TileSchedulerType.Default
+    everything_default = uses_auto_kernel and uses_auto_epilogue and uses_default_scheduler
+    if (cc < 90 or cc in [120, 121]) and not everything_default:
+        return (False, "Non-default schedules are only supported on SM90 and beyond (excluding SM120 and SM121)")
+    # On SM90, exactly one of the two schedules being auto is rejected
+    if cc == 90 and uses_auto_kernel != uses_auto_epilogue:
+        return (False, "Kernel and epilogue schedules must either both be auto or neither be auto")
+    if uses_default_scheduler:
+        return (True, "")
+    cooperative_schedules = (
+        cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative,
+        cutlass_cppgen.KernelScheduleType.CpAsyncWarpSpecializedCooperative,
+    )
+    wants_stream_k = cc == 90 and (tile_scheduler == cutlass_cppgen.TileSchedulerType.StreamK)
+    if wants_stream_k and kernel_schedule not in cooperative_schedules:
+        return (False, "Stream-K tile scheduler is currently only supported with the cooperative kernel schedule")
+    return (True, "")
+def alignment_or_default(alignment_provided: int, default_alignment: int) -> int:
+    """
+    Selects the alignment to use, preferring an explicitly provided value.
+    :param alignment_provided: alignment preference specified. Can be None.
+    :type alignment_provided: int
+    :param default_alignment: alignment to use if `alignment_provided` is None; also the
+                              largest value accepted for `alignment_provided`
+    :type default_alignment: int
+    :return: `alignment_provided` when it is set, otherwise `default_alignment`
+    :rtype: int
+    :raises Exception: if `alignment_provided` is greater than `default_alignment`
+    """
+    if alignment_provided is None:
+        return default_alignment
+    if alignment_provided > default_alignment:
+        raise Exception(f"Alignment {alignment_provided} exceeds the maximum supported of {default_alignment}.")
+    return alignment_provided
+def update_alignment(alignment_provided:int, default_alignment: int) -> int:
+    """
+    Selects the alignment to use, clamping an oversized request to `default_alignment` when
+    the request is an exact multiple of it.
+    :param alignment_provided: alignment preference specified. Can be None.
+    :type alignment_provided: int
+    :param default_alignment: alignment to use if `alignment_provided` is None
+    :type default_alignment: int
+    :return: `default_alignment` if `alignment_provided` is None or is a larger multiple of
+             `default_alignment`; otherwise `alignment_provided`
+    :rtype: int
+    :raises Exception: if `alignment_provided` exceeds `default_alignment` without being a
+                       multiple of it
+    """
+    if alignment_provided is None:
+        return default_alignment
+    if alignment_provided <= default_alignment:
+        return alignment_provided
+    # The request is larger than the default: only exact multiples can be clamped down
+    if alignment_provided % default_alignment != 0:
+        raise Exception(f"Alignment {alignment_provided} exceeds the maximum supported of {default_alignment}.")
+    return default_alignment

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/datatypes.py ADDED Viewed

	@@ -0,0 +1,362 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utility functions for converting between frontend datatypes and CUTLASS datatypes
+"""
+import cutlass_cppgen
+from cutlass_library import (
+    DataTypeSize,
+    MathOperation,
+    MathInstruction
+)
+from cutlass_cppgen.backend.library import (
+    TileDescription,
+)
+# Availability flags for the optional frontend packages. None means the package has not
+# been probed yet; the matching is_*_available() function stores True or False on first call.
+bfloat16_available = None
+cupy_available = None
+numpy_available = None
+torch_available = None
+# Lookup tables between CUTLASS library data types and frontend data types. They start out
+# as None and are meant to be filled in by the is_*_available() probes below.
+_library_to_cupy_dict = None
+_library_to_numpy_dict = None
+_library_to_torch_dict = None
+_torch_to_library_dict = None
+def is_numpy_available():
+    """
+    Reports whether NumPy can be imported, probing only on the first call.
+    The first call also fills in the table mapping library data types to NumPy types
+    (left empty when NumPy is missing).
+    :return: True if NumPy is importable, False otherwise
+    :rtype: bool
+    """
+    global numpy_available, _library_to_numpy_dict
+    if numpy_available is not None:
+        # Already probed: reuse the cached answer
+        return numpy_available
+    try:
+        import numpy as np
+        numpy_available = True
+        library_types = cutlass_cppgen.DataType
+        _library_to_numpy_dict = {
+            library_types.f16: np.float16,
+            library_types.f32: np.float32,
+            library_types.f64: np.float64,
+            library_types.s8: np.int8,
+            library_types.s32: np.int32,
+        }
+    except ImportError:
+        numpy_available = False
+        _library_to_numpy_dict = {}
+    return numpy_available
+def is_numpy_tensor(inp) -> bool:
+    """
+    Returns True if NumPy is available and ``inp`` is a ``numpy.ndarray``.
+    """
+    if not is_numpy_available():
+        return False
+    import numpy as np
+    return isinstance(inp, np.ndarray)
+def numpy_library_type(inp) -> cutlass_cppgen.DataType:
+    """
+    Converts a NumPy data type into the matching library data type.
+    :param inp: NumPy data type to convert
+    :return: matching library data type, or None if NumPy is unavailable or ``inp`` does not
+             compare equal to one of the handled NumPy types
+    :rtype: cutlass_cppgen.DataType
+    """
+    if not is_numpy_available():
+        return None
+    import numpy as np
+    conversions = (
+        (np.float16, cutlass_cppgen.DataType.f16),
+        (np.float32, cutlass_cppgen.DataType.f32),
+        (np.float64, cutlass_cppgen.DataType.f64),
+        (np.int8, cutlass_cppgen.DataType.s8),
+        (np.int32, cutlass_cppgen.DataType.s32),
+    )
+    # Candidates are tried in order and the first equal one wins
+    for numpy_dtype, library_dtype in conversions:
+        if inp == numpy_dtype:
+            return library_dtype
+    return None
+def numpy_type(inp):
+    """
+    Converts a library data type into the matching NumPy data type.
+    :param inp: library data type to convert
+    :return: matching NumPy type, or None if NumPy is unavailable or no conversion exists
+    """
+    # The lookup table is only built by is_numpy_available(); probe first so that this
+    # function also works when it is the first NumPy helper to be called.
+    if not is_numpy_available():
+        return None
+    return _library_to_numpy_dict.get(inp, None)
+def is_cupy_available():
+    """
+    Reports whether CuPy can be imported, probing only on the first call.
+    The first call also fills in the module-level table mapping library data types to CuPy
+    types (left empty when CuPy is missing).
+    :return: True if CuPy is importable, False otherwise
+    :rtype: bool
+    """
+    # Both names must be declared global: without `_library_to_cupy_dict` here the table
+    # would be bound as a local and the module-level table would stay None.
+    global cupy_available, _library_to_cupy_dict
+    if cupy_available is None:
+        try:
+            import cupy as cp
+            cupy_available = True
+            _library_to_cupy_dict = {
+                cutlass_cppgen.DataType.f16: cp.float16,
+                cutlass_cppgen.DataType.f32: cp.float32,
+                cutlass_cppgen.DataType.f64: cp.float64,
+                cutlass_cppgen.DataType.s8: cp.int8,
+                cutlass_cppgen.DataType.s32: cp.int32,
+            }
+        except ImportError:
+            cupy_available = False
+            _library_to_cupy_dict = {}
+    return cupy_available
+def is_cupy_tensor(inp) -> bool:
+    """
+    Returns True if CuPy is available and ``inp`` is a ``cupy.ndarray``.
+    """
+    if not is_cupy_available():
+        return False
+    import cupy as cp
+    return isinstance(inp, cp.ndarray)
+def cupy_library_type(inp) -> cutlass_cppgen.DataType:
+    """
+    Converts a CuPy data type into the matching library data type.
+    The handled types mirror the library-to-CuPy table built in ``is_cupy_available``, so the
+    integer types are converted in this direction as well.
+    :param inp: CuPy data type to convert
+    :return: matching library data type, or None if CuPy is unavailable or ``inp`` does not
+             compare equal to one of the handled CuPy types
+    :rtype: cutlass_cppgen.DataType
+    """
+    if is_cupy_available():
+        import cupy as cp
+        if inp == cp.float16:
+            return cutlass_cppgen.DataType.f16
+        elif inp == cp.float32:
+            return cutlass_cppgen.DataType.f32
+        elif inp == cp.float64:
+            return cutlass_cppgen.DataType.f64
+        elif inp == cp.int8:
+            return cutlass_cppgen.DataType.s8
+        elif inp == cp.int32:
+            return cutlass_cppgen.DataType.s32
+    return None
+def cupy_type(inp):
+    """
+    Converts a library data type into the matching CuPy data type.
+    :param inp: library data type to convert
+    :return: matching CuPy type, or None if CuPy is unavailable, the lookup table has not
+             been populated, or no conversion exists
+    """
+    # Probe first so the table gets a chance to be built, and tolerate a table that is
+    # still None rather than failing with an AttributeError on `.get`.
+    if not is_cupy_available() or _library_to_cupy_dict is None:
+        return None
+    return _library_to_cupy_dict.get(inp, None)
+def is_torch_available():
+    """
+    Reports whether PyTorch can be imported, probing only on the first call.
+    The first call also fills in the two tables converting between torch data types and
+    library data types (both left empty when PyTorch is missing).
+    :return: True if PyTorch is importable, False otherwise
+    :rtype: bool
+    """
+    global torch_available, _library_to_torch_dict, _torch_to_library_dict
+    if torch_available is not None:
+        # Already probed: reuse the cached answer
+        return torch_available
+    try:
+        import torch
+        torch_available = True
+        library_types = cutlass_cppgen.DataType
+        _torch_to_library_dict = {
+            torch.half: library_types.f16,
+            torch.float16: library_types.f16,
+            torch.bfloat16: library_types.bf16,
+            torch.float: library_types.f32,
+            torch.float32: library_types.f32,
+            torch.double: library_types.f64,
+            torch.float64: library_types.f64,
+            torch.int8: library_types.s8,
+            torch.int32: library_types.s32,
+            torch.uint8: library_types.u8,
+        }
+        # One entry per library type, using the sized torch names
+        _library_to_torch_dict = {
+            library_types.f16: torch.float16,
+            library_types.bf16: torch.bfloat16,
+            library_types.f32: torch.float32,
+            library_types.f64: torch.float64,
+            library_types.s8: torch.int8,
+            library_types.s32: torch.int32,
+            library_types.u8: torch.uint8,
+        }
+        # Only register these types if the version of torch being used supports them
+        optional_types = (
+            ("float8_e4m3fn", library_types.e4m3),
+            ("float8_e5m2", library_types.e5m2),
+        )
+        for torch_type_name, cutlass_type in optional_types:
+            if hasattr(torch, torch_type_name):
+                optional_torch_type = getattr(torch, torch_type_name)
+                _torch_to_library_dict[optional_torch_type] = cutlass_type
+                _library_to_torch_dict[cutlass_type] = optional_torch_type
+    except ImportError:
+        torch_available = False
+        _torch_to_library_dict = {}
+        _library_to_torch_dict = {}
+    return torch_available
+def is_torch_tensor(inp) -> bool:
+    """
+    Returns True if PyTorch is available and ``inp`` is a ``torch.Tensor``.
+    """
+    if not is_torch_available():
+        return False
+    import torch
+    return isinstance(inp, torch.Tensor)
+def torch_library_type(inp) -> cutlass_cppgen.DataType:
+    """
+    Converts a torch data type into the matching library data type.
+    :param inp: torch data type to convert
+    :return: matching library data type, or None if PyTorch is unavailable or no conversion exists
+    :rtype: cutlass_cppgen.DataType
+    """
+    # The lookup table is only built by is_torch_available(); probe first so that this
+    # function also works when it is the first torch helper to be called.
+    if not is_torch_available():
+        return None
+    return _torch_to_library_dict.get(inp, None)
+def torch_type(inp):
+    """
+    Converts a library data type into the matching torch data type.
+    :param inp: library data type to convert
+    :return: matching torch data type, or None if PyTorch is unavailable or no conversion exists
+    """
+    # The lookup table is only built by is_torch_available(); probe first so that this
+    # function also works when it is the first torch helper to be called.
+    if not is_torch_available():
+        return None
+    return _library_to_torch_dict.get(inp, None)
+def is_bfloat16_available():
+    """
+    Reports whether the ``bfloat16`` package can be imported, probing only on the first call.
+    :return: True if ``bfloat16`` is importable, False otherwise
+    :rtype: bool
+    """
+    global bfloat16_available
+    if bfloat16_available is not None:
+        return bfloat16_available
+    try:
+        import bfloat16
+    except ImportError:
+        bfloat16_available = False
+    else:
+        bfloat16_available = True
+    return bfloat16_available
+def bfloat16_library_type(inp) -> cutlass_cppgen.DataType:
+    """
+    Converts the ``bfloat16.bfloat16`` type into the library's bf16 data type.
+    :param inp: data type to convert
+    :return: ``cutlass_cppgen.DataType.bf16`` if ``inp`` equals ``bfloat16.bfloat16``; None if it
+             does not or if the ``bfloat16`` package is unavailable
+    :rtype: cutlass_cppgen.DataType
+    """
+    if not is_bfloat16_available():
+        return None
+    import bfloat16
+    return cutlass_cppgen.DataType.bf16 if inp == bfloat16.bfloat16 else None
+def bfloat16_type(inp):
+    """
+    Converts the library's bf16 data type into ``bfloat16.bfloat16``.
+    :param inp: library data type to convert
+    :return: ``bfloat16.bfloat16`` if ``inp`` equals ``cutlass_cppgen.DataType.bf16``; None if it
+             does not or if the ``bfloat16`` package is unavailable
+    """
+    if not is_bfloat16_available():
+        return None
+    import bfloat16
+    return bfloat16.bfloat16 if inp == cutlass_cppgen.DataType.bf16 else None
+def library_type(inp):
+    """
+    Converts a frontend data type into a library data type.
+    Values that are already keys of ``DataTypeSize`` are returned unchanged. Otherwise the
+    bfloat16, CuPy, NumPy, and torch converters are tried in that order.
+    :param inp: data type to convert
+    :return: the matching library data type
+    :raises Exception: if none of the converters recognizes ``inp``
+    """
+    if inp in DataTypeSize:
+        return inp
+    converters = (
+        bfloat16_library_type,
+        cupy_library_type,
+        numpy_library_type,
+        torch_library_type,
+    )
+    for convert in converters:
+        converted = convert(inp)
+        if converted is not None:
+            return converted
+    raise Exception(f"No available conversion from type {inp} to a library type.")
+def _tensor_from_numpy(np_tensor):
+    """
+    Returns the library data type and layout describing an array that exposes ``dtype`` and
+    ``flags`` (the caller passes both NumPy and CuPy arrays).
+    :param np_tensor: array to describe
+    :return: tuple of (library data type, layout); C-contiguous arrays are reported as
+             row-major and Fortran-contiguous arrays as column-major
+    :rtype: tuple
+    :raises Exception: if the array is neither C-contiguous nor Fortran-contiguous
+    """
+    dtype = library_type(np_tensor.dtype)
+    if np_tensor.flags.c_contiguous:
+        layout = cutlass_cppgen.LayoutType.RowMajor
+    elif np_tensor.flags.f_contiguous:
+        layout = cutlass_cppgen.LayoutType.ColumnMajor
+    else:
+        # Without this branch `layout` would be unbound and the return statement would fail
+        # with an UnboundLocalError that hides the real problem.
+        raise Exception(
+            "Unable to determine the layout of a tensor that is neither C-contiguous (row-major) "
+            "nor Fortran-contiguous (column-major).")
+    return (dtype, layout)
+def _tensor_from_torch(pt_tensor):
+    """
+    Returns the library data type of ``pt_tensor`` paired with the row-major layout.
+    The layout is always reported as row-major; the tensor's strides are not inspected.
+    """
+    layout = cutlass_cppgen.LayoutType.RowMajor
+    return (library_type(pt_tensor.dtype), layout)
+def get_datatype_and_layout(tensor):
+    """
+    Returns the library data type and layout of ``tensor``.
+    :param tensor: NumPy array, CuPy array, torch tensor, or Python float/int scalar
+    :return: tuple of (library data type, layout); Python scalars are reported as
+             (f32, row-major)
+    :rtype: tuple
+    :raises Exception: if ``tensor`` is of none of the supported kinds
+    """
+    if is_numpy_tensor(tensor) or is_cupy_tensor(tensor):
+        return _tensor_from_numpy(tensor)
+    if is_torch_tensor(tensor):
+        return _tensor_from_torch(tensor)
+    if isinstance(tensor, (float, int)):
+        return (cutlass_cppgen.DataType.f32, cutlass_cppgen.LayoutType.RowMajor)
+    raise Exception(f"Unable to convert tensor of type {type(tensor)} to Python-bound CUTLASS datatype and layout.")
+def get_tensor_shape(tensor, op="GEMM"):
+    """
+    Returns the shape of ``tensor``.
+    :param tensor: NumPy array, CuPy array, torch tensor, or Python float/int scalar
+    :param op: kind of operation the tensor is used in; ``"CONV"`` reorders the dimensions
+               of torch tensors
+    :type op: str
+    :return: shape of the tensor; ``(1,)`` for Python scalars
+    :raises Exception: if ``tensor`` is of none of the supported kinds
+    """
+    if is_numpy_tensor(tensor) or is_cupy_tensor(tensor):
+        return tensor.shape
+    if is_torch_tensor(tensor):
+        dims = tensor.size()
+        if op != "CONV":
+            return tuple(dims)
+        # PyTorch Tensors have shape NCHW; report the dimensions in the order 0, 2, 3, 1
+        return (dims[0], dims[2], dims[3], dims[1])
+    if isinstance(tensor, (float, int)):
+        return (1,)
+    raise Exception(f"Unable to convert tensor of type {type(tensor)} to Python-bound CUTLASS datatype and layout.")
+# MathOperation members indexed by their ``value``
+_math_operation_value_map = {member.value: member for member in MathOperation}
+def backend_math_operation(math_op: MathOperation):
+    """
+    Returns the ``MathOperation`` member whose value equals ``math_op.value``.
+    :param math_op: math operation to convert
+    :return: the matching ``MathOperation`` member
+    :raises Exception: if no member has the same value as ``math_op``
+    """
+    if math_op.value in _math_operation_value_map:
+        return _math_operation_value_map[math_op.value]
+    raise Exception(f"Unable to convert math operation of type {math_op} to backend math operation.")
+def construct_backend_td(td: cutlass_cppgen.TileDescription,
+                         kernel_schedule: cutlass_cppgen.KernelScheduleType,
+                         epilogue_schedule: cutlass_cppgen.EpilogueScheduleType,
+                         tile_scheduler: cutlass_cppgen.TileSchedulerType) -> TileDescription:
+    """
+    Builds a backend ``TileDescription`` from ``td`` and the provided schedules.
+    :param td: tile description to convert
+    :param kernel_schedule: kernel schedule to attach to the result
+    :param epilogue_schedule: epilogue schedule to attach to the result
+    :param tile_scheduler: tile scheduler to attach to the result
+    :return: backend tile description; the cluster shape falls back to ``[1, 1, 1]`` when
+             ``td`` has no ``cluster_shape`` attribute
+    :rtype: TileDescription
+    """
+    source_mi = td.math_instruction
+    instruction_args = (
+        source_mi.instruction_shape,
+        source_mi.element_a,
+        source_mi.element_b,
+        source_mi.element_accumulator,
+        source_mi.opcode_class,
+        backend_math_operation(source_mi.math_operation),
+    )
+    backend_mi = MathInstruction(*instruction_args)
+    cluster_shape = getattr(td, "cluster_shape", [1, 1, 1])
+    return TileDescription(
+        td.threadblock_shape,
+        td.stages,
+        td.warp_count,
+        backend_mi,
+        cluster_shape,
+        kernel_schedule,
+        epilogue_schedule,
+        tile_scheduler,
+    )
+def td_from_profiler_op(op) -> TileDescription:
+    """
+    Converts the profiler's TileDescription in ``op`` into the backend TileDescription.
+    Schedules missing from ``op`` are passed on as None.
+    :param op: profiler Operation
+    :returns: backend TileDescription
+    :rtype: cutlass_cppgen.backend.TileDescription
+    """
+    schedules = [
+        getattr(op, attribute, None)
+        for attribute in ('kernel_schedule', 'epilogue_schedule', 'tile_scheduler')
+    ]
+    return construct_backend_td(op.tile_description, *schedules)
+def td_from_profiler_td(td: TileDescription) -> TileDescription:
+    """
+    Converts the profiler's TileDescription into the backend TileDescription, leaving the
+    kernel schedule, epilogue schedule, and tile scheduler unset (None).
+    :param td: profiler TileDescription
+    :type td: cutlass_cppgen.TileDescription
+    :returns: backend TileDescription
+    :rtype: cutlass_cppgen.backend.TileDescription
+    """
+    unset = None
+    return construct_backend_td(td, unset, unset, unset)
+def to_camel_case(snake_str):
+    """
+    Converts a snake_case string into a capitalized camel-case string.
+    The input is lowercased, split on underscores, and each piece is capitalized before the
+    pieces are joined, e.g. ``"tma_warp"`` becomes ``"TmaWarp"``.
+    """
+    words = snake_str.lower().split("_")
+    return "".join(map(str.capitalize, words))
+def getattr_enum(obj, attr_name):
+    """
+    Looks up an attribute of ``obj`` by the camel-case form of a snake_case name.
+    :param obj: object (typically an enum class) to read the attribute from
+    :param attr_name: attribute name in snake_case
+    :return: the attribute of ``obj`` named ``to_camel_case(attr_name)``
+    :raises Exception: if ``obj`` has no such attribute
+    """
+    camel_name = to_camel_case(attr_name)
+    if not hasattr(obj, camel_name):
+        raise Exception(f"Invalid option: {attr_name}")
+    return getattr(obj, camel_name)

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/lazy_import.py ADDED Viewed

	@@ -0,0 +1,41 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+import importlib
+from typing import Any
+def lazy_import(mod_name: str) -> Any:
+    """
+    Returns a proxy object that imports ``mod_name`` only when an attribute is read from it.
+    :param mod_name: name of the module to import on attribute access
+    :type mod_name: str
+    :return: proxy whose attribute lookups are forwarded to the imported module
+    """
+    class Lazy:
+        def __getattr__(self, name:str) -> Any:
+            # Runs for every attribute not found on the proxy itself: import the module,
+            # then forward the lookup to it.
+            target = importlib.import_module(mod_name)
+            attribute = getattr(target, name)
+            return attribute
+    proxy = Lazy()
+    return proxy

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_cppgen/utils/profiler.py ADDED Viewed

	@@ -0,0 +1,196 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Profiler based on CUDA events
+"""
+import re
+import subprocess
+from cutlass_cppgen.utils.lazy_import import lazy_import
+cuda = lazy_import("cuda.cuda")
+cudart =  lazy_import("cuda.cudart")
+import numpy as np
+from cutlass_cppgen import CUTLASS_PATH
+from cutlass_cppgen.backend.library import DataTypeSize
+from cutlass_cppgen.op.op import OperationBase
+from cutlass_cppgen.shape import GemmCoord
+from cutlass_cppgen.utils.datatypes import is_numpy_tensor
+class GpuTimer:
+    """
+    Measures the time elapsed between two CUDA events recorded on a stream.
+    """
+    def __init__(self) -> None:
+        # events[0] is recorded by start(), events[1] by stop()
+        self.events = [
+            cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1]
+            for _ in range(2)
+        ]
+    def _record(self, index, stream):
+        """
+        Records ``self.events[index]`` on ``stream``, using ``cuda.CUstream(0)`` when
+        ``stream`` is falsy. Raises RuntimeError if the record call fails.
+        """
+        if not stream:
+            stream = cuda.CUstream(0)
+        (err,) = cuda.cuEventRecord(self.events[index], stream)
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError(f"CUDA Error {str(err)}")
+    def start(self, stream=None):
+        """Records the start event on ``stream``."""
+        self._record(0, stream)
+    def stop(self, stream=None):
+        """Records the stop event on ``stream``."""
+        self._record(1, stream)
+    def stop_and_wait(self, stream=None):
+        """
+        Records the stop event and then blocks until the stream (or, if the stream object is
+        falsy, the whole device) has finished. Raises RuntimeError on failure.
+        """
+        stream = stream if stream else cuda.CUstream(0)
+        self.stop(stream)
+        if stream:
+            (err,) = cuda.cuStreamSynchronize(stream)
+        else:
+            (err,) = cudart.cudaDeviceSynchronize()
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError(f"CUDA Error {str(err)}")
+    def duration(self, iterations=1):
+        """
+        Returns the time between the start and stop events divided by ``iterations``.
+        The value is in the unit reported by ``cuEventElapsedTime`` (presumably
+        milliseconds -- TODO confirm). Raises RuntimeError if the query fails.
+        """
+        err, elapsed = cuda.cuEventElapsedTime(self.events[0], self.events[1])
+        if err != cuda.CUresult.CUDA_SUCCESS:
+            raise RuntimeError(f"CUDA Error {str(err)}")
+        return elapsed / float(iterations)
+class CUDAEventProfiler:
+    def __init__(self, op: OperationBase, warmup_iterations: int=500, iterations: int=500, *args, **kwargs) -> None:
+        """
+        Runs ``op`` once with the given arguments and prepares a timer for profiling it.
+        :param op: operation to profile
+        :type op: OperationBase
+        :param warmup_iterations: number of untimed launches performed before timing starts
+        :type warmup_iterations: int
+        :param iterations: number of timed launches
+        :type iterations: int
+        :param args: positional arguments forwarded to ``op.run``
+        :param kwargs: keyword arguments forwarded to ``op.run``
+        """
+        # The value returned by this initial run is reused for every profiled launch
+        self.arguments = op.run(*args, **kwargs)
+        # Read after op.run(); presumably run() is what prepares op.operation -- TODO confirm
+        self.operation = op.operation
+        self.warmup_iterations = warmup_iterations
+        self.iterations = iterations
+        self.timer = GpuTimer()
+    #
+    # Cutlass Python Interface Profiler
+    #
+    def __call__(self):
+        """
+        Profiles the operation with CUDA events.
+        Launches the operation ``warmup_iterations`` times untimed, then ``iterations`` times
+        between the timer's start and stop events.
+        :return: the timer's duration divided by ``iterations``
+        """
+        def launch(times):
+            for _ in range(times):
+                self.operation.run(self.arguments)
+        launch(self.warmup_iterations)
+        self.timer.start()
+        launch(self.iterations)
+        self.timer.stop_and_wait()
+        return self.timer.duration(self.iterations)
+    #
+    # CUTLASS Profiler
+    #
+    def run_cutlass_profiler(self):
+        alpha = 1.0
+        beta = 1.0
+        profiler_path = CUTLASS_PATH + "/build/tools/profiler/cutlass_profiler"
+        kernel_name = self.operation.procedural_name()
+        verification_providers = "device"
+        provider = "cutlass"
+        problem_size = self.arguments.problem_size
+        if "cutlass3x" in kernel_name:
+            # cutlass3x generator only have column-major output
+            layout_name = self.operation.layout_name_3x()
+            if layout_name[-1] == "t":
+                new_layout_name = "".join(["n" for l in layout_name if l == "t" or "t"])
+                problem_size = GemmCoord(problem_size.n, problem_size.m, problem_size.k)
+                kernel_name = kernel_name.replace(layout_name, new_layout_name)
+        batch_count = self.arguments.batch_count
+        cmd = f"{profiler_path} --kernels={kernel_name} --verification-providers={verification_providers} " \
+              f"--providers={provider} --m={problem_size.m()} --n={problem_size.n()} --k={problem_size.k()} " \
+              f"--batch_count={batch_count} --alpha={alpha} --beta={beta} "\
+              f"--warmup-iterations={self.warmup_iterations} --profiling-iterations={self.iterations}"
+        result = subprocess.getoutput(cmd)
+        m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
+        runtime = float(m.group("runtime"))
+        m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
+        bytes = int(m.group("bytes"))
+        m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
+        flops = int(m.group("flops"))
+        # check if the problem size matches
+        assert bytes == self.bytes(problem_size, batch_count, beta)
+        assert flops == self.flops(problem_size, batch_count, beta)
+        return runtime
+    def bytes(self, problem_size, batch_count=1, beta=0.0):
+        m = problem_size.m()
+        n = problem_size.n()
+        k = problem_size.k()
+        bytes = (
+            (DataTypeSize[self.operation.A.element] * m // 8) * k
+            + (DataTypeSize[self.operation.B.element] * n // 8) * k
+            + (DataTypeSize[self.operation.C.element] * m // 8) * n
+        )
+        if beta != 0:
+            bytes += (DataTypeSize[self.operation.C.element] * m // 8) * n
+        bytes *= batch_count
+        return bytes
+    def flops(self, problem_size, batch_count=1, beta=0.0):
+        m = problem_size.m()
+        n = problem_size.n()
+        k = problem_size.k()
+        flops_ = (m * n * k) * 2 * batch_count
+        if beta != 0:
+            flops_ += m * n * batch_count * 2
+        return flops_

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/__init__.py ADDED Viewed

	@@ -0,0 +1,63 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+import os
+import sys
+from . import conv2d_operation
+from . import conv3d_operation
+from . import emit_kernel_listing
+from . import gemm_operation
+if '-m' not in sys.argv:
+    # Do not import generator when running python -m cutlass_library.generator to
+    # avoid double-import warnings
+    from . import generator
+from . import library
+from . import manifest
+from . import rank_2k_operation
+from . import rank_k_operation
+from . import symm_operation
+from . import trmm_operation
+# Make enum types from library.py accessible via cutlass_library.*
+from .library import *
+# Set up `source` to point to the path containing the CUTLASS source.
+# Check first if the path contains a `source` subdirectory -- this will
+# be the case when the package has been installed via pip. Otherwise,
+# default to the root of CUTLASS.
+install_source_path = os.path.join(__path__[0], 'source')
+if os.path.isdir(install_source_path):
+    source_path = install_source_path
+else:
+    source_path = os.path.join(__path__[0], '../..')

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/conv2d_operation.py ADDED Viewed

	@@ -0,0 +1,621 @@

+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for emitting Conv2d kernels
+"""
+import enum
+import logging
+import os.path
+import shutil
+from string import Template
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+  from cutlass_library.conv3x_emitter import EmitConv3xInstance, EmitConv3xIncludes
+except ImportError:
+  from library import *
+  from conv3x_emitter import EmitConv3xInstance, EmitConv3xIncludes
+_LOGGER = logging.getLogger(__name__)
+###################################################################################################
+#
+class Conv2dOperation:
+  #
+  def __init__(self, conv_kind, iterator_algorithm, arch, tile_description, A, B, C, element_epilogue, \
+    stride_support, epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity1, \
+    group_mode = GroupMode.NoneGroup):
+    self.operation_kind = OperationKind.Conv2d
+    self.arch = arch
+    self.tile_description = tile_description
+    self.conv_kind = conv_kind
+    self.A = A
+    self.B = B
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.iterator_algorithm = iterator_algorithm
+    self.stride_support = stride_support
+    self.swizzling_functor = swizzling_functor
+    self.group_mode = group_mode
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex,
+      MathOperation.multiply_add_complex_gaussian
+      ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+  #
+  def is_mixed_input(self):
+    return self.A.element != self.B.element
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+    if self.is_complex():
+      return get_complex_from_real(accum)
+    return accum
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    intermediate_type = ''
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp:
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.accumulator_type():
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+    else:
+      inst_shape = ''
+    return "%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
+      inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm])
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+      self.A.element != self.tile_description.math_instruction.element_accumulator:
+      extended_name = "${element_c}_${core_name}_${element_a}"
+    elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+      self.A.element != self.tile_description.math_instruction.element_accumulator:
+      extended_name = "${core_name}_${element_a}"
+    else:
+      extended_name = "${core_name}"
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+    return extended_name
+  #
+  def layout_name(self):
+    return "%s" % (ShortLayoutTypeNames[self.A.layout])
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+    threadblock = self.tile_description.procedural_name()
+    # grouped conv
+    if self.group_mode != GroupMode.NoneGroup:
+      group_conv_name = f"{GroupModeNames[self.group_mode]}_"
+    else:
+      group_conv_name = ""
+    if self.stride_support == StrideSupport.Unity and self.conv_kind == ConvKind.Dgrad:
+      configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride_${group_conv_name}align${alignment}"
+    else:
+      configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${group_conv_name}align${alignment}"
+    return SubstituteTemplate(
+      configuration_name,
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'alignment': "%d" % self.A.alignment,
+        'group_conv_name': group_conv_name
+      }
+    )
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.configuration_name()
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+class EmitConv2dInstance:
+  """Emits the C++ `using ..._base = ...::Kernel;` declaration for one Conv2d operation.
+
+  Three C++ templates are held: the default Conv2d kernel, the grouped
+  Conv2d kernel, and the depthwise direct-convolution kernel.  Operations
+  flagged as CUTLASS 3 (`is_3x`) are delegated to an `EmitConv3xInstance`.
+  """
+  def __init__(self):
+    # Emitter for CUTLASS 3 convolution operations
+    self.conv3x_emitter = EmitConv3xInstance()
+    # Template used when the operation's group_mode is GroupMode.NoneGroup.
+    self.template = """
+  // Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
+  using ${operation_name}_base =
+  typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}<
+    ${element_a},
+    ${layout_a},
+    ${element_b},
+    ${layout_b},
+    ${element_c},
+    ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
+    ${stages},
+    ${math_operator},
+    ${iterator_algorithm},
+    ${stride_support},
+    ${align_a},
+    ${align_b}
+  >::Kernel;
+"""
+    # Template used for every group_mode other than NoneGroup and Depthwise;
+    # it differs from the default one by the extra ${group_mode} argument.
+    self.template_group_conv = """
+  // Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
+  using ${operation_name}_base =
+  typename cutlass::conv::kernel::DefaultConv2dGroup${conv_kind_name}<
+    ${element_a},
+    ${layout_a},
+    ${element_b},
+    ${layout_b},
+    ${element_c},
+    ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
+    ${stages},
+    ${math_operator},
+    ${group_mode},
+    ${iterator_algorithm},
+    ${stride_support},
+    ${align_a},
+    ${align_b}
+  >::Kernel;
+"""
+    # Template used when the operation's group_mode is GroupMode.Depthwise.
+    self.template_depthwise_direct_conv = """
+  // Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
+  using ${operation_name}_base =
+  typename cutlass::conv::kernel::DefaultDepthwiseDirect2dConv${conv_kind_name}<
+    ${element_a},
+    ${layout_a},
+    ${element_b},
+    ${layout_b},
+    ${element_c},
+    ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::conv::TensorNHWCShape<${threadblock_output_shape_n}, ${threadblock_output_shape_p}, ${threadblock_output_shape_q}, ${groups_per_cta}>,
+    cutlass::MatrixShape<${filter_shape_r}, ${filter_shape_s}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue},
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::conv::threadblock::DepthwiseDirect2dConvIdentityThreadblockSwizzle<
+          1,
+          ${threadblock_output_shape_n},
+          ${threadblock_output_shape_p},
+          ${threadblock_output_shape_q}>,
+    ${stages},
+    ${math_operator},
+    ${iterator_algorithm},
+    ${stride_support},
+    cutlass::MatrixShape<${stride_r}, ${stride_s}>,
+    cutlass::MatrixShape<${dilation_r}, ${dilation_s}>
+  >::Kernel;
+"""
+  def arch_number_to_type(self, arch: int):
+    """Return the C++ architecture tag, e.g. 80 -> "cutlass::arch::Sm80"."""
+    return f"cutlass::arch::Sm{arch}"
+  def emit(self, operation):
+    """Return the C++ kernel declaration text for `operation`.
+
+    Operations with a truthy `is_3x` attribute are forwarded to the CUTLASS 3
+    emitter.  Otherwise a dictionary of template values is built from the
+    operation and substituted into the template chosen by
+    `operation.group_mode`.
+    """
+    _LOGGER.debug("*** EmitConv2dInstance::emit")
+    _LOGGER.debug("***   operation: procedural_name()=" + operation.procedural_name())
+    if hasattr(operation, 'is_3x') and operation.is_3x:
+      _LOGGER.debug("***   CUTLASS 3 operation")
+      return self.conv3x_emitter.emit(operation)
+    _LOGGER.debug("***   CUTLASS 2 operation")
+    # Per-dimension warp tile: threadblock extent divided by the warp count.
+    warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)]
+    # Elements per epilogue access: alignment * element size, capped at 128
+    # (presumably bits - confirm DataTypeSize units), converted back to elements.
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+    values = {
+      'operation_name': operation.procedural_name(),
+      'conv_kind': ConvKindTag[operation.conv_kind],
+      'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm],
+      'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(),
+      'stride_support': StrideSupportTag[operation.stride_support],
+      'math_operator': 'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex() else \
+      MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+    }
+    if operation.group_mode == GroupMode.NoneGroup:
+      _LOGGER.debug("***   group_mode=NoneGroup")
+      return SubstituteTemplate(self.template, values)
+    elif operation.group_mode == GroupMode.Depthwise:
+      _LOGGER.debug("***   group_mode=Depthwise")
+      # Note: the depthwise template contains no ${group_mode} placeholder.
+      values['group_mode'] = GroupModeTag[operation.group_mode]
+      # Setup other template params
+      values['threadblock_output_shape_n'] = str(operation.tile_description.threadblock_output_shape[0])
+      values['threadblock_output_shape_p'] = str(operation.tile_description.threadblock_output_shape[1])
+      values['threadblock_output_shape_q'] = str(operation.tile_description.threadblock_output_shape[2])
+      # The fourth entry of threadblock_output_shape is emitted as groups_per_cta.
+      values['groups_per_cta'] = str(operation.tile_description.threadblock_output_shape[3])
+      values['filter_shape_r'] = str(operation.tile_description.filter_shape[0])
+      values['filter_shape_s'] = str(operation.tile_description.filter_shape[1])
+      values['stride_r'] = str(operation.tile_description.stride[0])
+      values['stride_s'] = str(operation.tile_description.stride[1])
+      values['dilation_r'] = str(operation.tile_description.dilation[0])
+      values['dilation_s'] = str(operation.tile_description.dilation[1])
+      return SubstituteTemplate(self.template_depthwise_direct_conv, values)
+    else:
+      # Any remaining group mode uses the grouped-convolution template.
+      _LOGGER.debug("***   group_mode=" + GroupModeTag[operation.group_mode])
+      values['group_mode'] = GroupModeTag[operation.group_mode]
+      return SubstituteTemplate(self.template_group_conv, values)
+###################################################################################################
+#
+# Generator functions for all layouts
+#
+###################################################################################################
+#
+def GenerateConv2dTensorOp(manifest, tile_descriptions, min_cc, align = 128):
+  """Append Conv2d operations for each tile description to `manifest`.
+
+  For every tile, Fprop is always considered; Dgrad and Wgrad only when the
+  accumulator type is f16 or f32.  Operands use the TensorNHWC layout, with
+  alignments derived from `align` divided by the element's `DataTypeSize`.
+
+  NOTE(review): the `Conv2dOperation(...)` call below passes seven positional
+  arguments, while `Conv2dOperation.__init__` as defined in this file takes
+  nine required ones (`iterator_algorithm` and `stride_support` have no
+  counterpart here).  As written the call looks like it raises TypeError -
+  confirm whether this generator is still in use.
+  """
+  _LOGGER.debug("*** GenerateConv2dTensorOp")
+  for tile in tile_descriptions:
+    for conv_kind in [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]:
+      if conv_kind == ConvKind.Fprop or (tile.math_instruction.element_accumulator in [DataType.f16, DataType.f32]):
+        #
+        # With a 32-wide accumulator type, emit outputs in both the input
+        # element type and the accumulator type; otherwise only the latter.
+        output_types = [tile.math_instruction.element_a, tile.math_instruction.element_accumulator] \
+          if DataTypeSize[tile.math_instruction.element_accumulator] == 32 \
+          else [tile.math_instruction.element_accumulator,]
+        for output_type in output_types:
+          A = TensorDescription(tile.math_instruction.element_a, LayoutType.TensorNHWC, int(align / DataTypeSize[tile.math_instruction.element_a]))
+          B = TensorDescription(tile.math_instruction.element_b, LayoutType.TensorNHWC, int(align / DataTypeSize[tile.math_instruction.element_b]))
+          # The output alignment is clamped to at least one element.
+          C = TensorDescription(output_type,  LayoutType.TensorNHWC, max(1, int(align / DataTypeSize[output_type])))
+          manifest.append(Conv2dOperation(conv_kind, min_cc, tile, A, B, C, tile.math_instruction.element_accumulator))
+class EmitConv2dIncludes:
+  '''Emit includes that are specific to the operation.'''
+  def __init__(self):
+    self.includes = ['conv2d_operation.h']
+    self.emitter_3x = EmitConv3xIncludes()
+  def operation_is_3x(self, operation) -> bool:
+    """Whether operation is a CUTLASS 3 convolution (as opposed to CUTLASS 2)"""
+    return hasattr(operation, 'is_3x') and operation.is_3x
+  def emit(self, operation) -> str:
+    if self.operation_is_3x(operation):
+      return self.emitter_3x.emit(operation)
+    return '\n'.join(f"#include \"{incl}\"" for incl in self.includes) + \
+      "\n\n///////////////////////////////////////////////////////////////////////////////////////////////////"
+###################################################################################################
+#
+# Emitter functions for all targets
+#
+###################################################################################################
+class EmitConv2dConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name)
+    self.instance_emitter = EmitConv2dInstance()
+    self.includes_emitter = EmitConv2dIncludes()
+    self.header_template = """
+/*
+  Generated by conv2d_operation.py - Do not edit.
+*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+#include "library_internal.h"
+"""
+    self.instance_template = """
+${stub_begin}
+${operation_instance}
+// Derived class
+struct ${operation_name} :
+  public ${operation_name}_base { };
+${stub_end}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.configuration_header = """
+namespace cutlass {
+namespace library {
+// Initialize all instances
+void initialize_${configuration_name}(Manifest &manifest) {
+"""
+    self.configuration_instance = """${stub_begin}
+  using Operation_${operation_name} = cutlass::conv::device::${kernel_name}<
+    ${operation_name}>;
+  manifest.append(new cutlass::library::${operation_wrapper}<
+      Operation_${operation_name}
+    >(
+      "${operation_name}"
+    ));
+${stub_end}
+"""
+    self.configuration_epilogue = "}\n"
+    self.epilogue_template = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+  def operation_is_3x(self, operation):
+    """Whether operation is a CUTLASS 3 convolution (as opposed to CUTLASS 2)"""
+    return hasattr(operation, 'is_3x') and operation.is_3x
+  def __enter__(self):
+    """
+    Open the configuration_file, and write the "header" C++ code to it.
+    The "header" consists of a comment (that this is generated code,
+    so it should not be edited), and includes that are common
+    to all kinds of kernels.
+    """
+    _LOGGER.debug('*** EmitConv2dConfigurationLibrary::__enter__')
+    _LOGGER.debug('***   configuration_path (file to write): ' +
+                  str(self.configuration_path))
+    _LOGGER.debug('***   configuration_name: ' + self.configuration_name)
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(SubstituteTemplate(self.header_template, {
+      'configuration_name': self.configuration_name
+      }))
+    self.operations = []
+    return self
+  def emit(self, operation):
+    """
+    Write three pieces of C++ code to the configuration_file
+    (that was opened by the __enter__ method above):
+    1. the header includes that are specific to the operation
+       (CUTLASS 2 vs. CUTLASS 3);
+    2. the "operation instance" (a "using" declaration ending in "_base"); and
+    3. the "operation name" (declaration and definition of a derived class
+       of the above operation instance).
+    The "using" declaration turns a C++ class name, possibly namespace-qualified,
+    possibly also with angle brackets, into a C-style, easily demangled identifier.
+    """
+    _LOGGER.debug('*** EmitConv2dConfigurationLibrary::emit')
+    _LOGGER.debug('***   operation.procedural_name(): ' + operation.procedural_name())
+    self.operations.append(operation)
+    self.configuration_file.write(self.includes_emitter.emit(operation))
+    stub_begin = ''
+    stub_end = ''
+    # It can be useful to stub (comment) out instantiations for testing.
+    # In this case, one need only set is_stub to True.
+    is_stub = False
+    if is_stub:
+      stub_begin = "// STUB for now\n#if 0"
+      stub_end = '#endif // 0'
+    self.configuration_file.write(Template(self.instance_template).substitute({
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'operation_instance': self.instance_emitter.emit(operation),
+      'stub_begin': stub_begin,
+      'stub_end': stub_end
+      }))
+  def __exit__(self, exception_type, exception_value, traceback):
+    """
+    Write the rest of the C++ code to the configuration_file, and close the file.
+    The "rest of the C++ code" has the following components.
+    1. Configuration header: Open the namespace(s), and open the definition
+       of the "initialize_${configuration_name}" registration function
+       that registers the operation with the Manifest.
+       ("Registration" helps turn C++ compile-time polymorphism
+       (via template parameters) into a run-time choice of parameters.)
+    2. Configuration instance: In the body of the registration function,
+       make a "using" declaration Operation_${operation_name} for the
+       operation type (which uses operation_name as its template argument).
+       Then, tell the manifest about the operation via a "manifest.append" call.
+       The argument of the call is a new instance of
+       "SomethingOperation<Operation_${operation_name}>"
+       (replace Something with a specific name).
+    3. Configuration epilogue: Close the definition of the registration function.
+    4. Epilogue template: Close the namespace(s).
+    """
+    _LOGGER.debug('*** EmitConv2dConfigurationLibrary::__exit__')
+    _LOGGER.debug('***   configuration_path (file to write): ' +
+                  str(self.configuration_path))
+    _LOGGER.debug('***   configuration_name: ' + self.configuration_name)
+    self.configuration_file.write(SubstituteTemplate(self.configuration_header, {
+      'configuration_name': self.configuration_name
+      }))
+    for operation in self.operations:
+      stub_begin = ''
+      stub_end = ''
+      # It can be useful to stub (comment) out instantiations for testing.
+      # In this case, one need only set is_stub to True.
+      is_stub = False
+      if is_stub:
+        stub_begin = "// STUB for now\n#if 0"
+        stub_end = "#endif // 0"
+      if operation.group_mode == GroupMode.Depthwise:
+        kernel_name = 'DirectConvolution'
+        operation_wrapper = 'DirectConv2dOperation'
+      else:
+        kernel_name = 'ImplicitGemmConvolution'
+        operation_wrapper = 'Conv2dOperation'
+      if self.operation_is_3x(operation):
+        kernel_name = 'ConvUniversalAdapter'
+        operation_wrapper = 'ConvOperation3x'
+      self.configuration_file.write(SubstituteTemplate(self.configuration_instance, {
+        'configuration_name': self.configuration_name,
+        'operation_name': operation.procedural_name(),
+        'kernel_name': kernel_name,
+        'operation_wrapper': operation_wrapper,
+        'stub_begin': stub_begin,
+        'stub_end': stub_end
+      }))
+    self.configuration_file.write(self.configuration_epilogue)
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+###################################################################################################
+###################################################################################################

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/conv3d_operation.py ADDED Viewed

	@@ -0,0 +1,482 @@

+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for emitting Conv3d kernels
+"""
+import enum
+import logging
+import os.path
+import shutil
+from string import Template
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+  from cutlass_library.conv3x_emitter import EmitConv3xInstance, EmitConv3xIncludes
+except ImportError:
+  from library import *
+  from conv3x_emitter import EmitConv3xInstance, EmitConv3xIncludes
+_LOGGER = logging.getLogger(__name__)
+###################################################################################################
+#
+class Conv3dOperation:
+  #
+  def __init__(self, conv_kind, iterator_algorithm, arch, tile_description, A, B, C, element_epilogue, \
+    stride_support, epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):
+    self.operation_kind = OperationKind.Conv3d
+    self.arch = arch
+    self.tile_description = tile_description
+    self.conv_kind = conv_kind
+    self.A = A
+    self.B = B
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.iterator_algorithm = iterator_algorithm
+    self.stride_support = stride_support
+    self.swizzling_functor = swizzling_functor
+  #
+  def is_mixed_input(self):
+    return self.A.element != self.B.element
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    intermediate_type = ''
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp:
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+    else:
+      inst_shape = ''
+    return "%s%s%s%s3d_%s" % (ShortDataTypeNames[self.tile_description.math_instruction.element_accumulator], \
+      inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm])
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+      self.A.element != self.tile_description.math_instruction.element_accumulator:
+      extended_name = "${element_c}_${core_name}_${element_a}"
+    elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+      self.A.element != self.tile_description.math_instruction.element_accumulator:
+      extended_name = "${core_name}_${element_a}"
+    else:
+      extended_name = "${core_name}"
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+    return extended_name
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+    threadblock = "%dx%d_%dx%d" % (
+      self.tile_description.threadblock_shape[0],
+      self.tile_description.threadblock_shape[1],
+      self.tile_description.threadblock_shape[2],
+      self.tile_description.stages
+    )
+    if self.stride_support == StrideSupport.Unity:
+      configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_unity_stride"
+    else:
+      configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}"
+    return SubstituteTemplate(
+      configuration_name,
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+      }
+    )
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.configuration_name()
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+class EmitConv3dInstance:
+  def __init__(self):
+    # Emitter for CUTLASS 3 convolution operations
+    self.conv3x_emitter = EmitConv3xInstance()
+    self.template = """
+  // Conv3d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
+  using ${operation_name}_base =
+  typename cutlass::conv::kernel::DefaultConv3d${conv_kind_name}<
+    ${element_a},
+    cutlass::layout::TensorNDHWC,
+    ${element_b},
+    cutlass::layout::TensorNDHWC,
+    ${element_c},
+    cutlass::layout::TensorNDHWC,
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
+    ${stages},
+    cutlass::arch::OpMultiplyAdd,
+    ${iterator_algorithm},
+    ${stride_support}
+  >::Kernel;
+"""
+  def emit(self, operation):
+    _LOGGER.debug("*** EmitConv3dInstance::emit")
+    _LOGGER.debug("***   operation: procedural_name()=" + operation.procedural_name())
+    if hasattr(operation, 'is_3x') and operation.is_3x:
+      _LOGGER.debug("***   CUTLASS 3 operation")
+      return self.conv3x_emitter.emit(operation)
+    _LOGGER.debug("***   CUTLASS 2 operation")
+    warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)]
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+    values = {
+      'operation_name': operation.procedural_name(),
+      'conv_kind': ConvKindTag[operation.conv_kind],
+      'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.tile_description.math_instruction.element_accumulator],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm],
+      'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(),
+      'stride_support': StrideSupportTag[operation.stride_support]
+    }
+    return SubstituteTemplate(self.template, values)
+###################################################################################################
+#
+# Generator functions for all layouts
+#
+###################################################################################################
+#
+def GenerateConv3dTensorOp(manifest, tile_descriptions, min_cc, align = 128):
+  for tile in tile_descriptions:
+    for conv_kind in [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]:
+      if conv_kind == ConvKind.Fprop or (tile.math_instruction.element_accumulator in [DataType.f16, DataType.f32]):
+        #
+        output_types = [tile.math_instruction.element_a, tile.math_instruction.element_accumulator] \
+          if DataTypeSize[tile.math_instruction.element_accumulator] == 32 \
+          else [tile.math_instruction.element_accumulator,]
+        for output_type in output_types:
+          A = TensorDescription(tile.math_instruction.element_a, LayoutType.TensorNDHWC, int(align / DataTypeSize[tile.math_instruction.element_a]))
+          B = TensorDescription(tile.math_instruction.element_b, LayoutType.TensorNDHWC, int(align / DataTypeSize[tile.math_instruction.element_b]))
+          C = TensorDescription(output_type,  LayoutType.TensorNDHWC, max(1, int(align / DataTypeSize[output_type])))
+          manifest.append(Conv3dOperation(conv_kind, min_cc, tile, A, B, C, tile.math_instruction.element_accumulator))
+class EmitConv3dIncludes:
+  '''Emit includes that are specific to the operation.'''
+  def __init__(self):
+    self.includes = ['conv3d_operation.h']
+    self.emitter_3x = EmitConv3xIncludes()
+  def operation_is_3x(self, operation) -> bool:
+    """Whether operation is a CUTLASS 3 convolution (as opposed to CUTLASS 2)"""
+    return hasattr(operation, 'is_3x') and operation.is_3x
+  def emit(self, operation) -> str:
+    if self.operation_is_3x(operation):
+      return self.emitter_3x.emit(operation)
+    return '\n'.join(f"#include \"{incl}\"" for incl in self.includes) + \
+      "\n\n///////////////////////////////////////////////////////////////////////////////////////////////////"
+###################################################################################################
+#
+# Emitter functions for all targets
+#
+###################################################################################################
+class EmitConv3dConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name)
+    self.instance_emitter = EmitConv3dInstance()
+    self.includes_emitter = EmitConv3dIncludes()
+    self.header_template = """
+/*
+  Generated by conv3d_operation.py - Do not edit.
+*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+#include "library_internal.h"
+"""
+    self.instance_template = """
+${stub_begin}
+${operation_instance}
+// Derived class
+struct ${operation_name} :
+  public ${operation_name}_base { };
+${stub_end}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.configuration_header = """
+namespace cutlass {
+namespace library {
+// Initialize all instances
+void initialize_${configuration_name}(Manifest &manifest) {
+"""
+    self.configuration_instance = """${stub_begin}
+  using Operation_${operation_name} = cutlass::conv::device::${kernel_name}<
+    ${operation_name}>;
+  manifest.append(new cutlass::library::${operation_wrapper}<
+      Operation_${operation_name}
+    >(
+      "${operation_name}"
+    ));
+${stub_end}
+"""
+    self.configuration_epilogue = "}\n"
+    self.epilogue_template = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+  def operation_is_3x(self, operation):
+    """Whether operation is a CUTLASS 3 convolution (as opposed to CUTLASS 2)"""
+    return hasattr(operation, 'is_3x') and operation.is_3x
+  def __enter__(self):
+    """
+    Open the configuration_file, and write the "header" C++ code to it.
+    The "header" consists of a comment (that this is generated code,
+    so it should not be edited), and includes that are common
+    to both the CUTLASS 2 and the CUTLASS 3 cases.
+    """
+    _LOGGER.debug('*** EmitConv3dConfigurationLibrary::__enter__')
+    _LOGGER.debug('***   configuration_path (file to write): ' +
+                  str(self.configuration_path))
+    _LOGGER.debug('***   configuration_name: ' + self.configuration_name)
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(SubstituteTemplate(self.header_template, {
+      'configuration_name': self.configuration_name
+      }))
+    self.operations = []
+    return self
+  def emit(self, operation):
+    """
+    Write three pieces of C++ code to the configuration_file
+    (that was opened by the __enter__ method above):
+    1. the header includes that are specific to the operation
+       (CUTLASS 2 vs. CUTLASS 3);
+    2. the "operation instance" (a "using" declaration ending in "_base"); and
+    3. the "operation name" (declaration and definition of a derived class
+       of the above operation instance).
+    The "using" declaration turns a C++ class name, possibly namespace-qualified,
+    possibly also with angle brackets, into a C-style, easily demangled identifier.
+    """
+    _LOGGER.debug('*** EmitConv3dConfigurationLibrary::emit')
+    _LOGGER.debug('***   operation.procedural_name(): ' + operation.procedural_name())
+    self.operations.append(operation)
+    self.configuration_file.write(self.includes_emitter.emit(operation))
+    stub_begin = ''
+    stub_end = ''
+    # It can be useful to stub (comment) out instantiations for testing.
+    # In this case, one need only set is_stub to True.
+    is_stub = False
+    if is_stub:
+      stub_begin = "// STUB for now\n#if 0"
+      stub_end = '#endif // 0'
+    self.configuration_file.write(Template(self.instance_template).substitute({
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'operation_instance': self.instance_emitter.emit(operation),
+      'stub_begin': stub_begin,
+      'stub_end': stub_end
+      }))
+  def __exit__(self, exception_type, exception_value, traceback):
+    """
+    Write the rest of the C++ code to the configuration_file, and close the file.
+    The "rest of the C++ code" has the following components.
+    1. Configuration header: Open the namespace(s), and open the definition
+       of the "initialize_${configuration_name}" registration function
+       that registers the operation with the Manifest.
+       ("Registration" helps turn C++ compile-time polymorphism
+       (via template parameters) into a run-time choice of parameters.)
+    2. Configuration instance: In the body of the registration function,
+       make a "using" declaration Operation_${operation_name} for the
+       operation type (which uses operation_name as its template argument).
+       Then, tell the manifest about the operation via a "manifest.append" call.
+       The argument of the call is a new instance of
+       "SomethingOperation<Operation_${operation_name}>"
+       (replace Something with a specific name).
+    3. Configuration epilogue: Close the definition of the registration function.
+    4. Epilogue template: Close the namespace(s).
+    """
+    _LOGGER.debug('*** EmitConv3dConfigurationLibrary::__exit__')
+    _LOGGER.debug('***   configuration_path (file to write): ' +
+                  str(self.configuration_path))
+    _LOGGER.debug('***   configuration_name: ' + self.configuration_name)
+    self.configuration_file.write(SubstituteTemplate(self.configuration_header, {
+      'configuration_name': self.configuration_name
+      }))
+    for operation in self.operations:
+      stub_begin = ''
+      stub_end = ''
+      # It can be useful to stub (comment) out instantiations for testing.
+      # In this case, one need only set is_stub to True.
+      is_stub = False
+      if is_stub:
+        stub_begin = "// STUB for now\n#if 0"
+        stub_end = "#endif // 0"
+      kernel_name = 'ImplicitGemmConvolution'
+      operation_wrapper = 'Conv3dOperation'
+      if self.operation_is_3x(operation):
+        kernel_name = 'ConvUniversalAdapter'
+        operation_wrapper = 'ConvOperation3x'
+      self.configuration_file.write(SubstituteTemplate(self.configuration_instance, {
+        'configuration_name': self.configuration_name,
+        'operation_name': operation.procedural_name(),
+        'kernel_name': kernel_name,
+        'operation_wrapper': operation_wrapper,
+        'stub_begin': stub_begin,
+        'stub_end': stub_end
+      }))
+    self.configuration_file.write(self.configuration_epilogue)
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+###################################################################################################
+###################################################################################################

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/conv3x_emitter.py ADDED Viewed

	@@ -0,0 +1,250 @@

+#################################################################################################
+#
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for emitting CUTLASS >= 3 convolution kernels
+"""
+import enum
+import os.path
+import shutil
+import logging
+from string import Template
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+_LOGGER = logging.getLogger(__name__)
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+class EmitConv3xInstance:
+  def __init__(self):
+    _LOGGER.debug("*** EmitConv3xInstance::__init__")
+    # Define epilogue type first, so that the mainloop type
+    # can use it with StageCountAutoCarveout.
+    self.template = """
+// CUTLASS >= 3 convolution ${conv_kind_name} kernel instance "${operation_name}"
+using ${operation_name}_epilogue =
+  typename cutlass::epilogue::collective::CollectiveBuilder<
+    ${arch},
+    ${opcode_class_epi},
+    ${mma_tile_shape},               // mma tile shape
+    ${cluster_shape},                // cluster shape
+    ${epi_tile_mn},
+    ${element_accumulator},
+    ${element_compute},
+    ${element_c}, ${layout_c}, 128 / cute::sizeof_bits_v<${element_c}>,
+    ${element_d}, ${layout_d}, 128 / cute::sizeof_bits_v<${element_d}>,
+    ${epilogue_schedule}
+    // , class FusionOpOrCallbacks = cutlass::epilogue::fusion::LinearCombination<ElementD,ElementCompute>
+  >::CollectiveOp;
+using ${operation_name}_mainloop =
+  typename cutlass::conv::collective::CollectiveBuilder<
+    ${arch},
+    ${opcode_class_main},
+    ${conv_kind},         // kFprop, kDgrad, or kWgrad
+    ${element_a}, ${layout_a}, 128 / cute::sizeof_bits_v<${element_a}>,
+    ${element_b}, ${layout_b}, 128 / cute::sizeof_bits_v<${element_b}>,
+    ${element_accumulator},
+    ${mma_tile_shape},        // mma tile shape
+    ${cluster_shape},         // cluster shape
+    ${stages},
+    ${kernel_schedule}
+  >::CollectiveOp;
+using ${operation_name}_problem_shape = cutlass::conv::ConvProblemShape<${conv_kind}, ${operation_name}_mainloop::NumSpatialDimensions>;
+// Unit tests call this "ConvKernel".
+// Conv operator ${operation_name}
+using ${operation_name}_base = cutlass::conv::kernel::ConvUniversal<
+    ${operation_name}_problem_shape,
+    ${operation_name}_mainloop,
+    ${operation_name}_epilogue,
+    ${tile_scheduler}
+  >;
+"""
+  def arch_number_to_type(self, arch: int) -> str:
+    return f"cutlass::arch::Sm{arch}"
+  def mma_tile_shape(self, operation, cta_m, cta_n, cta_k) -> str:
+    mma_m = cta_m
+    mma_n = cta_n
+    mma_k = cta_k
+    if operation.arch >= 100:
+      # MmaTileShape (mma_m, mma_n, mma_k) is passed to kernel mainloop where
+      # mma_m = cta_m for 1sm version and mma_m = cta_m * 2 for 2sm version.
+      # If schedule is auto and cluster size is static and cta_m % 64 == 0 and cluster_m % 2 == 0, 2sm kernel version is allocated,
+      # otherwise 1sm kernel is allocated.
+      cta_m_per_mma_instruction = 1
+      if "2sm" in operation.procedural_name() :
+        cta_m_per_mma_instruction = 2
+      elif "1sm" in operation.procedural_name() :
+        cta_m_per_mma_instruction = 1
+      elif operation.tile_description.cluster_shape[0] > 0 and operation.tile_description.cluster_shape[0] % 2 == 0 and cta_m % 64 == 0 :
+        cta_m_per_mma_instruction = 2
+      mma_m = cta_m * cta_m_per_mma_instruction
+    # For all three kinds of convolutions, the tile shape's K mode
+    # differs from GEMM in that needs to be wrapped in a Shape.
+    # For Wgrad convolutions specifically,
+    # the N tile shape also needs to be wrapped in a Shape.
+    m_template = 'cute::_${mma_m}'
+    if operation.conv_kind == ConvKind.Wgrad:
+      n_template = 'cute::Shape<cute::_${mma_n}>'
+    else:
+      n_template = 'cute::_${mma_n}'
+    k_template = 'cute::Shape<cute::_${mma_k}>'
+    mma_tile_shape_template = f'cute::Shape<{m_template}, {n_template}, {k_template}>'
+    values = {
+      'mma_m': mma_m,
+      'mma_n': mma_n,
+      'mma_k': mma_k
+    }
+    return Template(mma_tile_shape_template).substitute(values)
+  def cluster_shape(self, operation) -> str:
+    m_template = 'cute::_${cluster_shape_m}' if operation.tile_description.cluster_shape[0] > 0 else 'int(0)'
+    n_template = 'cute::_${cluster_shape_n}' if operation.tile_description.cluster_shape[1] > 0 else 'int(0)'
+    k_template = 'cute::_${cluster_shape_k}' if operation.tile_description.cluster_shape[2] > 0 else 'int(0)'
+    cluster_shape_template = f'cute::Shape<{m_template}, {n_template}, {k_template}>'
+    values = {
+      'cluster_shape_m': operation.tile_description.cluster_shape[0],
+      'cluster_shape_n': operation.tile_description.cluster_shape[1],
+      'cluster_shape_k': operation.tile_description.cluster_shape[2],
+    }
+    return Template(cluster_shape_template).substitute(values)
+  def stage_count(self, operation) -> str:
+    # stages == 0 tells builder to pick the number of stages automatically
+    namespace_prefix = 'cutlass::conv::collective::'
+    if operation.tile_description.stages > 0:
+      return f"{namespace_prefix}StageCount<{str(operation.tile_description.stages)}>"
+    else:
+      return f"{namespace_prefix}StageCountAutoCarveout<sizeof(typename {operation.procedural_name()}_epilogue::SharedStorage)>"
+  def emit(self, operation) -> str:
+    _LOGGER.debug("*** EmitConv3xInstance::emit")
+    _LOGGER.debug("***   operation: procedural_name()=" + operation.procedural_name())
+    # Identify the operation as CUTLASS 3 by its is_3x field
+    if (not hasattr(operation, 'is_3x')) or (not operation.is_3x):
+      raise RuntimeError("operation must be a CUTLASS 3 operation")
+    epi_tile_mn = "cutlass::epilogue::collective::EpilogueTileAuto"
+    opcode_class_main = OpcodeClassTag[operation.tile_description.math_instruction.opcode_class]
+    opcode_class_epi = opcode_class_main
+    tile_shape = operation.tile_description.tile_shape
+    cluster_m = operation.tile_description.cluster_shape[0]
+    cluster_n = operation.tile_description.cluster_shape[1]
+    cta_m, cta_n, cta_k = tile_shape
+    # account for static/dynamic cluster shapes
+    if operation.arch >= 100:
+      cta_m = cta_m // cluster_m if cluster_m > 0 else cta_m
+      cta_n = cta_n // cluster_n if cluster_n > 0 else cta_n
+    warp_count = operation.tile_description.warp_count
+    epilogue_schedule = EpilogueScheduleTag[operation.epilogue_schedule]
+    # KernelScheduleTag and TileSchedulerTag both hard-code the
+    # namespace qualification of KernelScheduleAuto as
+    # "cutlass::gemm::collective::" (unless the tag is 'void').
+    #
+    # For TileSchedulerTag, this namespace is fine, since CUTLASS 3
+    # convolutions use the same tile schedulers (from the same
+    # cutlass::gemm::collective namespace) as GEMMs.
+    kernel_schedule = KernelScheduleTag[operation.kernel_schedule].replace('gemm::', 'conv::')
+    tile_scheduler = TileSchedulerTag[operation.tile_scheduler]
+    opcode_class = OpcodeClassTag[operation.tile_description.math_instruction.opcode_class]
+    values = {
+      'operation_name': operation.procedural_name(),
+      'conv_kind':      ConvKindTag[operation.conv_kind],
+      'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
+      'element_a':      DataTypeTag[operation.A.element],
+      'layout_a':       LayoutTag[operation.A.layout],
+      'align_a':        int(operation.A.alignment),
+      'element_b':      DataTypeTag[operation.B.element],
+      'layout_b':       LayoutTag[operation.B.layout],
+      'align_b':        int(operation.B.alignment),
+      'element_c':      DataTypeTag[operation.C.element],
+      'layout_c':       LayoutTag[operation.C.layout],
+      'align_c':        int(operation.C.alignment),
+      'element_d':      DataTypeTag[operation.D.element],
+      'layout_d':       LayoutTag[operation.D.layout],
+      'align_d':        int(operation.D.alignment),
+      'element_accumulator':   DataTypeTag[operation.accumulator_type()],
+      'opcode_class':          opcode_class,
+      'arch':                  self.arch_number_to_type(operation.arch),
+      'mma_tile_shape':        self.mma_tile_shape(operation, cta_m, cta_n, cta_k),
+      'cluster_shape':         self.cluster_shape(operation),
+      'opcode_class_epi':      opcode_class_epi,
+      'opcode_class_main':     opcode_class_main,
+      'epi_tile_mn':           epi_tile_mn,
+      'stages':                self.stage_count(operation),
+      'kernel_schedule':       kernel_schedule,
+      'epilogue_schedule':     epilogue_schedule,
+      'tile_scheduler':        tile_scheduler,
+      'element_compute':       DataTypeTag[operation.element_compute]
+    }
+    return Template(self.template).substitute(values)
+class EmitConv3xIncludes:
+  def __init__(self):
+    _LOGGER.debug("*** EmitConv3xIncludes::__init__")
+    self.includes = ['conv_operation_3x.hpp',
+                     'cutlass/conv/device/conv_universal_adapter.hpp',
+                     'cutlass/conv/kernel/conv_universal.hpp',
+                     'cutlass/conv/collective/collective_builder.hpp',
+                     'cutlass/epilogue/collective/collective_builder.hpp']
+  def emit(self, operation) -> str:
+    _LOGGER.debug("*** EmitConv3xIncludes::emit")
+    return '\n'.join(f"#include \"{incl}\"" for incl in self.includes) + \
+      "\n\n///////////////////////////////////////////////////////////////////////////////////////////////////"

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/emit_kernel_listing.py ADDED Viewed

	@@ -0,0 +1,868 @@

+#################################################################################################
+#
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+#
+#
+# \brief Generates the CUTLASS kernel listing with kernel filtering
+#
+#
+###############################################################################
+# Example usage:
+# generator.py --operations all --generator-target kernel_listing \
+# --architectures "70;75;80" --kernels "*" --disable-cutlass-package-imports
+###############################################################################
+import collections
+import csv
+import json
+import math
+import os
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+audit_csv_fields = [
+  "KernelType", "KernelName", "Type_A", "Type_B", "Type_C", "Type_Acc", "Type_EpilogueScale", "Type_D", "Type_SFA", "Type_SFD",
+  "Layout_A", "Layout_B", "Layout_C", "Layout_D",
+  "Alignment_A", "Alignment_B", "Alignment_C", "Alignment_D",
+  "1SM/2SM",
+  "StreamK Enabled", "Support Runtime_Cluster_Shape", "Support Runtime_Input_Types",
+  "Test Counts"
+]
+audit_csv_runtime_fields = [
+  "KerneIndex", "KernelName",
+  "Inst_M", "Inst_N", "Inst_K", "Tile_M", "Tile_N", "Tile_K",
+  "Cluster_M", "Cluster_N", "Cluster_K", "Preferred_Cluster_M", "Preferred_Cluster_N", "Preferred_Cluster_K", "Fallback_Cluster_M", "Fallback_Cluster_N", "Fallback_Cluster_K",
+  "M", "N", "K", "L", "Alpha_val", "Beta_val",
+  "Runtime_Input_Types Enabled", "Runtime_Cluster_Shape Enabled"
+]
+def hash_cutlass_string(input_string):
+  mma_cluster_shape_pattern = r"_\d+x\d+x\d+"         # Matches MMA and Cluster shapes (e.g., '_128x128x256', '_0x0x1')
+  # Remove MMA and Cluster shapes (e.g., '_128x128x256', '_0x0x1')
+  output = re.sub(mma_cluster_shape_pattern, "", input_string)
+  return output
+def transform_hashed_string(hashed_kernel_name, runtime_datatype_a, runtime_datatype_b):
+  # Define a dictionary mapping the detected types to runtime values
+  datatype_map = {
+    'f4_f4': runtime_datatype_a + '_' + runtime_datatype_b,
+    'f4_f6': runtime_datatype_a + '_' + runtime_datatype_b,
+    'f4_f8': runtime_datatype_a + '_' + runtime_datatype_b,
+    'f6_f4': runtime_datatype_a + '_' + runtime_datatype_b,
+    'f6_f6': runtime_datatype_a + '_' + runtime_datatype_b,
+    'f6_f8': runtime_datatype_a + '_' + runtime_datatype_b,
+    'f8_f4': runtime_datatype_a + '_' + runtime_datatype_b,
+    'f8_f6': runtime_datatype_a + '_' + runtime_datatype_b,
+    'f8_f8': runtime_datatype_a + '_' + runtime_datatype_b,
+    'ue8m0xf4_ue8m0xf4': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b,
+    'ue4m3xf4_ue4m3xf4': 'ue4m3x' + runtime_datatype_a + '_ue4m3x' + runtime_datatype_b,
+    'ue8m0xf4_ue8m0xf6': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b,
+    'ue8m0xf4_ue8m0xf8': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b,
+    'ue8m0xf6_ue8m0xf4': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b,
+    'ue8m0xf6_ue8m0xf6': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b,
+    'ue8m0xf8_ue8m0xf4': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b,
+    'ue8m0xf8_ue8m0xf6': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b,
+    'ue8m0xf8_ue8m0xf8': 'ue8m0x' + runtime_datatype_a + '_ue8m0x' + runtime_datatype_b,
+  }
+  # Regular expression to detect all the keys in datatype_map
+  pattern = re.compile(r'(' + '|'.join(map(re.escape, datatype_map.keys())) + r')')
+  # Replace detected patterns using the dictionary
+  updated_kernel_name = pattern.sub(lambda match: datatype_map[match.group(0)], hashed_kernel_name)
+  return updated_kernel_name
+# This helper function reports foundational kernel features: datatypes, layouts, alignment and stream-k.
+def get_kernel_features(operation, kernel_name,
+              dynamic_datatype, runtime_input_datatype):
+  numcta_inst = "2sm" if "2sm" in kernel_name else "1sm"
+  math_inst = operation.tile_description.math_instruction
+  if dynamic_datatype:
+      dtype_name_A = runtime_input_datatype[0]
+      dtype_name_B = runtime_input_datatype[1]
+  else:
+      dtype_name_A = DataTypeNames[operation.A.element]
+      dtype_name_B = DataTypeNames[operation.B.element]
+  layout_name_A = ShortLayoutTypeNames[operation.A.layout]
+  layout_name_B = ShortLayoutTypeNames[operation.B.layout]
+  layout_name_C = ShortLayoutTypeNames[operation.C.layout]
+  layout_name_D = ShortLayoutTypeNames[operation.D.layout]
+  scale_factor_D_type = operation.ScaleFactorD.element if hasattr(operation, "ScaleFactorD") else DataType.void
+  scale_factor_A_type = getattr(operation, "ScaleFactorA", DataType.void)
+  audit_vals = [
+          "BlockScaledGEMM" if math_inst.opcode_class == OpcodeClass.BlockScaledTensorOp else "GEMM",
+          kernel_name,
+          dtype_name_A,
+          dtype_name_B,
+          DataTypeNames[operation.C.element],
+          DataTypeNames[operation.tile_description.math_instruction.element_accumulator],
+          DataTypeNames[operation.element_epilogue],
+          DataTypeNames[operation.D.element],
+          DataTypeNames[scale_factor_D_type],
+          DataTypeNames[scale_factor_A_type],
+          layout_name_A,
+          layout_name_B,
+          layout_name_C,
+          layout_name_D,
+          str(operation.A.alignment),
+          str(operation.B.alignment),
+          str(operation.C.alignment),
+          str(operation.D.alignment),
+          numcta_inst,
+          "Y" if 'stream_k' in kernel_name else "N",
+  ]
+  return audit_vals
+# This helper function reports other performance-related kernel parameters and those can be specified at runtime: cluster_shape, instruction shap, m/n/k and alpha/beta.
+def get_kernel_params(operation, kernel_name, cluster_shape, fallback_cluster_shape, problem_shape, alpha, beta, dynamic_datatype, dynamic_cluster):
+  math_inst = operation.tile_description.math_instruction
+  audit_vals = [
+          str(math_inst.instruction_shape[0]),
+          str(math_inst.instruction_shape[1]),
+          str(math_inst.instruction_shape[2]),
+          str(operation.tile_description.threadblock_shape[0]),
+          str(operation.tile_description.threadblock_shape[1]),
+          str(operation.tile_description.threadblock_shape[2]),
+          str(operation.tile_description.cluster_shape[0]),
+          str(operation.tile_description.cluster_shape[1]),
+          str(operation.tile_description.cluster_shape[2]),
+          str(cluster_shape[0]),
+          str(cluster_shape[1]),
+          str(cluster_shape[2]),
+          str(fallback_cluster_shape[0]),
+          str(fallback_cluster_shape[1]),
+          str(fallback_cluster_shape[2]),
+          str(problem_shape[0]),
+          str(problem_shape[1]),
+          str(problem_shape[2]),
+          str(problem_shape[3]),
+          str(alpha),
+          str(beta),
+          "Y" if dynamic_datatype else "N",
+          "Y" if dynamic_cluster else "N",
+  ]
+  return audit_vals
+def _getSubOperationType(kernel):
+  if kernel.operation_kind == OperationKind.Gemm:
+      return GemmKindNames[kernel.gemm_kind]
+  elif kernel.operation_kind == OperationKind.Conv2d:
+    return "conv_" + ConvKindNames[kernel.conv_kind]
+  elif kernel.operation_kind == OperationKind.Syrk:
+    return "syrk_" + SyrkKindNames[kernel.syrk_kind]
+  elif kernel.operation_kind == OperationKind.Trmm:
+    return "trmm_" + TrmmKindNames[kernel.trmm_kind]
+  elif kernel.operation_kind == OperationKind.Symm:
+    return "symm_" + SymmKindNames[kernel.symm_kind]
+  else:
+    raise Exception("Unsupported kernel type")
+def _get_inst_shape(math_instruction):
+  return "".join(str(x) for x in math_instruction.instruction_shape)
+def _is_simt_inst(math_instruction):
+  return _get_inst_shape(math_instruction) in ["111","114"]
+def _getInstType(input_precision, accumulate_precision, math_instruction):
+  # inst_shape
+  inst_shape = _get_inst_shape(math_instruction)
+  # input precision
+  if input_precision == "fp32" and inst_shape != "111":
+    inp = "tf32"
+  else:
+    inp = input_precision
+  # Handle SIMT op types first
+  if _is_simt_inst(math_instruction):
+    simt_input_precision_to_inst = {
+      "fp32": "FFMA",
+      "fp64": "DFMA",
+      "fp16": "HFMA",
+      "int8": "IDP4A",
+    }
+    inst = simt_input_precision_to_inst[input_precision]
+  else: # Tensor op instructions
+    if accumulate_precision == "cf64":
+      fp64_acc_map = {
+        MathOperation.multiply_add_complex_gaussian : "gz",
+        MathOperation.multiply_add_complex          : "z",
+      }
+      acc = fp64_acc_map[math_instruction.math_operation]
+    else:
+      tensor_op_acc_map = {
+        "fp32" : "s",
+        "cf32" : "s",
+        "fp16" : "h",
+        "int32": "i",
+        "fp64" : "d",
+      }
+      acc = tensor_op_acc_map[accumulate_precision]
+    inst = "{}{}{}".format(acc, inst_shape, inp)
+  return inst
+# TODO: Computes FLOps/Bytes for GEMM - revisit for conv
+def _computeFlopsPerByte(operation, m, n, k, batch_count=1, beta=0.0, num_groups=1):
+  assert not (batch_count > 1 and num_groups > 1)
+  # TODO: adjust for sparsity
+  gmem_bytes = (
+    (DataTypeSize[operation.A.element] * m // 8) * k +
+    (DataTypeSize[operation.B.element] * n // 8) * k +
+    (DataTypeSize[operation.C.element] * m // 8) * n
+  )
+  # TODO: complex-valued support
+  flops = 2 * (m * n * k)
+  if bool(beta):
+    gmem_bytes += (DataTypeSize[operation.C.element] * m // 8) * n
+    flops += 2 * m * n
+  multiplier = max(batch_count, num_groups)
+  gmem_bytes *= multiplier
+  flops *= multiplier
+  return flops / gmem_bytes
+def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode
+                              ):
+  # For functional testing, we prefer to run reference computing on device if any
+  reference_device_archs = ["100a", "103a"]
+  run_reference_on_device = True if arch in reference_device_archs and mode in ["functional_L0", "functional_L1"] else False
+  profiler_flags_for_verification = "device" if run_reference_on_device else "host"
+  # beta values for L0 and L1
+  # TODO: randomize beta values for wider coverage
+  beta_values = [0.5]
+  is_supported_arch = (arch in ["100a", "100f", "101a", "101f", "103a", "110a", "110f", "120a", "120f", "121a", "121f"])
+  is_runtime_datatype_enabled = mode == "functional_L0" and is_supported_arch
+  if (mode == "functional_L0") and is_supported_arch:
+    problem_waves = [0.5, 1.25, 2.5]
+    #
+    # Dense Gemm
+    #
+    sm100_mma_data_type_general = [
+      'gemm_f16_f16_f16_f16_f16',
+      'gemm_f16_f16_f16_void_f16',
+      #'gemm_f16_f16_f32_f16_f16',
+      'tf32gemm_f32_f32_f32_f32_f32',
+      'bf16gemm_f32_f32_f32_f32_f32',
+    ]
+    exclude_archs = arch not in ("103a")
+    if exclude_archs:
+      sm100_mma_data_type_general.append('gemm_s8_s8_s32_s8_s8')
+    sm100_mma_data_type_runtime_dtype = [
+      'gemm.*f4_f4_f32_f32_f32',
+      'gemm.*f6_f6_f32_f32_f32',
+      'gemm.*f8_f8_f32_f32_f32',
+    ]
+    sm100_mma_cluster_size = [
+      '8x1x1',
+      '4x4x1', '2x1x1',
+      '0x0x1' # dynamic cluster
+    ]
+    # Restrict to two layouts to reduce L0 build and test time.
+    sm100_mma_layouts = [
+      'tnt',
+      'ntn'
+    ]
+    # regex list must be in kernel procedural name order
+    sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*"
+    sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*"
+    sm100_mma_filter_regex_1sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_runtime_dtype, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*"
+    sm100_mma_filter_regex_2sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_runtime_dtype, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*"
+    #
+    # Block Scale Gemm
+    #
+    block_scaled_data_type = [
+      # runtime datatypes
+      'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2',
+      'gemm.*ue4m3xf4_ue4m3xf4_f32_f16_e5m2',
+      'gemm.*ue8m0xf4_ue8m0xf6_f32_f16_e5m2',
+      #'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_ue8m0xe2m1',
+      'gemm.*ue8m0xf6_ue8m0xf6_f32_f16_ue8m0xe3m2',
+    ]
+    block_scaled_tile_k = ['x128_', 'x256_']
+    sm103_block_scaled_data_type = [
+      'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2',
+      'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_ue8m0xe2m1',
+    ]
+    sm103_block_scaled_tile_k = ['x768_']
+    block_scaled_cluster_size = [
+      '4x4x1', '2x1x1',
+      '0x0x1' # dynamic cluster
+    ]
+    block_scaled_layouts = ['tnt']
+    # regex list must be in kernel procedural name order
+    block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_tile_k, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*"
+    block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_tile_k, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*"
+    sm103_block_scaled_prefetch_policy = ['tmapf']
+    sm103_block_scaled_filter_regex_1sm = "cutlass3x_sm103_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [sm103_block_scaled_data_type, sm103_block_scaled_tile_k, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*(" + "|".join(sm103_block_scaled_prefetch_policy) + ").*"
+    sm103_block_scaled_filter_regex_2sm = "cutlass3x_sm103_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [sm103_block_scaled_data_type, sm103_block_scaled_tile_k, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*(" + "|".join(sm103_block_scaled_prefetch_policy) + ").*"
+    if arch in ["100a", "100f"]:
+      kernel_filter = f"({sm100_mma_filter_regex_1sm})|" \
+                      f"({sm100_mma_filter_regex_2sm})|" \
+                      f"({sm100_mma_filter_regex_1sm_runtime})|" \
+                      f"({sm100_mma_filter_regex_2sm_runtime})|" \
+                      f"({block_scaled_filter_regex_1sm})|" \
+                      f"({block_scaled_filter_regex_2sm})"
+    elif arch in ["101a", "101f", "110a", "110f"]:
+      kernel_filter = f"({sm100_mma_filter_regex_1sm})|" \
+                      f"({sm100_mma_filter_regex_2sm})|" \
+                      f"({sm100_mma_filter_regex_1sm_runtime})|" \
+                      f"({sm100_mma_filter_regex_2sm_runtime})|" \
+                      f"({block_scaled_filter_regex_1sm})|" \
+                      f"({block_scaled_filter_regex_2sm})"
+    elif arch in ["103a"]:
+      kernel_filter = f"({sm100_mma_filter_regex_1sm})|" \
+                      f"({sm100_mma_filter_regex_2sm})|" \
+                      f"({sm100_mma_filter_regex_1sm_runtime})|" \
+                      f"({sm100_mma_filter_regex_2sm_runtime})|" \
+                      f"({block_scaled_filter_regex_1sm})|" \
+                      f"({block_scaled_filter_regex_2sm})|" \
+                      f"({sm103_block_scaled_filter_regex_1sm})|" \
+                      f"({sm103_block_scaled_filter_regex_2sm})"
+    elif arch in ["120a", "120f", "121a", "121f"]:
+      # blockscaled sm120_mma kernels
+      blockscaled_sm120_mma_kernel_cta_tiles = [
+        [ '128x128' ]
+      ]
+      # Restrict to two layouts to reduce L0 build and test time.
+      blockscaled_sm120_mma_layouts = [ 'tn' ]
+      filter_regex_blockscaled_sm120_mma = "cutlass3x_sm120_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [blockscaled_sm120_mma_kernel_cta_tiles[0], blockscaled_sm120_mma_layouts]]) + ").*"
+      problem_waves = [0.5, 1.25, 2.5]
+      kernel_filter = f"({filter_regex_blockscaled_sm120_mma})"
+    else:
+      error_message = "unsupported arch, only support sm100a, sm100f, sm101a, sm101f, sm110a, sm110f, sm103a, sm120a, sm120f, sm121a, sm121f"
+      raise Exception(error_message)
+  elif mode == "functional_L1":
+    sm100_mma_cluster_size = [
+                    '0x0x1' # dynamic cluster
+                     ]
+    # Restrict to two layouts to reduce L1 build and test time.
+    sm100_mma_layouts = ['tnt', 'ntn']
+    sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*"
+    sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*"
+    block_scaled_data_type = [
+      'ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2',
+      'ue8m0xe2m1_ue8m0xe2m3_f32_f16_e5m2',
+      'ue8m0xmx8s26_ue8m0xmx8s26_f32_f16_e5m2',
+      'ue8m0xe2m1_ue8m0xe2m1_f32_f16_ue8m0xe2m1',
+      'ue8m0xe2m3_ue8m0xe2m3_f32_f16_ue8m0xe3m2',
+    ]
+    sm103_block_scaled_data_type = [
+      'ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2',
+      'ue8m0xe2m1_ue8m0xe2m1_f32_f16_ue8m0xe2m1',
+    ]
+    block_scaled_cluster_size = ['0x0x1']
+    block_scaled_layouts = ['tnt']
+    # regex list must be in kernel procedural name order
+    block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*"
+    block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*"
+    sm103_block_scaled_filter_regex_1sm = "cutlass3x_sm103_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [sm103_block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*"
+    sm103_block_scaled_filter_regex_2sm = "cutlass3x_sm103_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [sm103_block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*"
+    filter_regex_sm100_mma = f"({sm100_mma_filter_regex_1sm})|" \
+                          f"({sm100_mma_filter_regex_2sm})|" \
+                          f"({block_scaled_filter_regex_1sm})|" \
+                          f"({block_scaled_filter_regex_2sm})" \
+                          f"({sm103_block_scaled_filter_regex_1sm})|" \
+                          f"({sm103_block_scaled_filter_regex_2sm})"
+    # CTA tiles for sm120 MMA - only run one tile size to reduce build/test times
+    sm120_mma_kernel_cta_tiles = [
+      # h1688, s1688, i16832, i8816
+      [ '256x128' ],
+      # d884, c1688,
+      [ '128x128' ],
+      # c1688, z884
+      [ '128x64' ],
+      # gz884
+      [ '64x64' ]
+    ]
+    # sm120 MMA instruction shapes, planar complex type excluded as they are not required
+    sm120_mma_instruction_shapes = [
+      [ 'h1688gemm_(?!planar_complex)',
+        's1688gemm_f16',
+        's1688gemm_bf16',
+        's1688gemm_tf32',
+        'i16832gemm',
+        'i8816gemm' ],
+      [ 'd884gemm', 'c1688tf32gemm' ] ,
+      [ 'c1688gemm',
+        'z884gemm'  ],
+      [ 'gz884gemm']
+    ]
+    # It's not pretty, but not sure why different instructions support different tile sizes.
+    filter_regex_sm120_mma_0 = "cutlass_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm120_mma_instruction_shapes[0], sm120_mma_kernel_cta_tiles[0]]]) + ").*"
+    filter_regex_sm120_mma_1 = "cutlass_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm120_mma_instruction_shapes[1], sm120_mma_kernel_cta_tiles[1]]]) + ").*"
+    filter_regex_sm120_mma_2 = "cutlass_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm120_mma_instruction_shapes[2], sm120_mma_kernel_cta_tiles[2]]]) + ").*"
+    filter_regex_sm120_mma_3 = "cutlass_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm120_mma_instruction_shapes[3], sm120_mma_kernel_cta_tiles[3]]]) + ").*"
+    filter_regex_sm120_mma = f"({filter_regex_sm120_mma_0})|({filter_regex_sm120_mma_1})|({filter_regex_sm120_mma_2})|({filter_regex_sm120_mma_3})"
+    problem_waves = [0.5, 1.25, 2.5]
+    if arch in ["120a", "120f", "121a", "121f"]:
+      kernel_filter = f"({filter_regex_sm120_mma})"
+    else:
+      kernel_filter = f"({filter_regex_sm100_mma})"
+  else:
+    raise ValueError()
+  outfile_name    = os.path.join(curr_build_dir, f"FK_{mode}_testlist_SM{arch}_cutlass3x_gemm.csv")
+  audit_file_name = os.path.join(curr_build_dir, f"FK_{mode}_audit_SM{arch}_cutlass3x_gemm.csv")
+  audit_file_params_name = os.path.join(curr_build_dir, f"FK_{mode}_audit_params_SM{arch}_cutlass3x_gemm.csv")
+  kernel_filter_re = re.compile(kernel_filter)
+  testcase_counter = 0
+  kernels_emitted = 0
+  kernels_total = 0
+  perf_json_list = []
+  kernel_name_set = set()
+  testlist_csv_fields = ["testcase", "metadata"]
+  testlist_csv_rows = []
+  auditlist_csv_map = {}
+  auditlist_csv_params_map = {}
+  kernel_features = {}
+  for cc in manifest.operations[OperationKind.Gemm].keys():
+    for kernel_name, operation_l in manifest.operations[OperationKind.Gemm][cc].items():
+      assert(len(operation_l) == 1)
+      kernels_total += 1
+      if len(kernel_filter_re.findall(kernel_name)) == 0:
+          continue
+      # Only test f16 I/O void C kernels in void C kernel set
+      # Exception: Use void C kernels for more accurate perf testing
+      if '_void_' in kernel_name and  'perf_' not in mode:
+        if 'f16_f16_f16_void_f16' not in kernel_name :
+          continue
+      kernels_emitted += 1
+      kernel_name_set.add(kernel_name)
+      hashed_kernel_name = hash_cutlass_string(kernel_name)
+      operation = operation_l[0]
+      dynamic_cluster = (operation.tile_description.cluster_shape[0] == 0
+                          or operation.tile_description.cluster_shape[1] == 0)
+      dynamic_datatype = "f8" in kernel_name or "f6" in kernel_name or "f4" in kernel_name
+      runtime_input_datatypes = [None]
+      if dynamic_datatype:
+        if "f4_f4" in kernel_name:
+          runtime_input_datatypes = [['e2m1','e2m1']]
+        elif "f4_f6" in kernel_name:
+          runtime_input_datatypes = [['e2m1','e3m2']]
+        elif "f4_f8" in kernel_name:
+          runtime_input_datatypes = [['e2m1','e4m3']]
+        elif "f6_f4" in kernel_name:
+          runtime_input_datatypes = [['e3m2','e2m1']]
+        elif "f6_f6" in kernel_name:
+          runtime_input_datatypes = [['e3m2','e3m2']]
+        elif "f6_f8" in kernel_name:
+          runtime_input_datatypes = [['e3m2','e4m3']]
+        elif "f8_f4" in kernel_name:
+          runtime_input_datatypes = [['e4m3','e2m1']]
+        elif "f8_f6" in kernel_name:
+          runtime_input_datatypes = [['e4m3','e3m2']]
+        elif "f8_f8" in kernel_name:
+          runtime_input_datatypes = [
+                                    # mask out those not covered in statically encoded test cases
+                                    #  ['e5m2','e4m3'],
+                                    #  ['e4m3','e5m2'],
+                                      ['e4m3','e4m3']
+                                    ]
+        # block scaled kernels
+        elif "ue8m0xf4_ue8m0xf4" in kernel_name:
+          runtime_input_datatypes = [['e2m1','e2m1']]
+        elif "ue4m3xf4_ue4m3xf4" in kernel_name:
+          runtime_input_datatypes = [['e2m1','e2m1']]
+        elif "ue8m0xf4_ue8m0xf6" in kernel_name:
+          runtime_input_datatypes = [['e2m1','e2m3']]
+        elif "ue8m0xf4_ue8m0xf8" in kernel_name:
+          runtime_input_datatypes = [['e2m1','e4m3']]
+        elif "ue8m0xf6_ue8m0xf4" in kernel_name:
+          runtime_input_datatypes = [['e2m3','e2m1']]
+        elif "ue8m0xf6_ue8m0xf6" in kernel_name:
+          runtime_input_datatypes = [['e2m3','e2m3']]
+        elif "ue8m0xf8_ue8m0xf4" in kernel_name:
+          runtime_input_datatypes = [['e4m3','e2m1']]
+        elif "ue8m0xf8_ue8m0xf4" in kernel_name:
+          runtime_input_datatypes = [['e4m3','e2m1']]
+        elif "ue8m0xf8_ue8m0xf6" in kernel_name:
+          runtime_input_datatypes = [['e4m3','e2m3']]
+        elif "ue8m0xf8_ue8m0xf8" in kernel_name:
+          runtime_input_datatypes = [['e4m3','e4m3']]
+      if "bstensorop" in kernel_name or is_blockwise(manifest.operations_by_name[kernel_name].gemm_kind):
+        profiler_flags_for_verification = "host"
+      # reduce L1 test runtime if reference kernel is not running on device.
+      if mode == "functional_L1" and profiler_flags_for_verification == "host" :
+        problem_waves = [0.5, 2.5]
+      if dynamic_cluster:
+        if mode == "functional_L0":
+          runtime_cluster_shapes = [[1,1,1],                   [2,2,1]]
+        else:
+          runtime_cluster_shapes = [[1,1,1], [1,2,1], [2,1,1], [2,2,1], [1,4,1], [4,1,1], [2,4,1], [4,2,1], [4,4,1]]
+          # reduce L1 test runtime if reference kernel is not running on device.
+          if profiler_flags_for_verification == "host":
+            runtime_cluster_shapes = [[1,1,1], [1,2,1], [2,1,1], [2,2,1], [1,4,1], [4,1,1]]
+        cta_tile_shape_m, cta_tile_shape_n, cta_tile_shape_k = operation.tile_description.threadblock_shape
+      else:
+        runtime_cluster_shapes = [operation.tile_description.cluster_shape]
+        cta_tile_shape_m = int(operation.tile_description.threadblock_shape[0] / operation.tile_description.cluster_shape[0])
+        cta_tile_shape_n = int(operation.tile_description.threadblock_shape[1] / operation.tile_description.cluster_shape[1])
+        cta_tile_shape_k = int(operation.tile_description.threadblock_shape[2] / operation.tile_description.cluster_shape[2])
+      alignment_a = operation.A.alignment
+      alignment_b = operation.B.alignment
+      alignment_c = operation.C.alignment
+      alignment_ab_max = max(alignment_a, alignment_b)
+      layout3x = operation.layout_name_3x()
+      data_types = operation.datatype_name_3x()
+      ctas_per_mma_instruction = 1
+      if '_2sm' in kernel_name:
+        ctas_per_mma_instruction = 2
+        valid_cluster_shapes = []
+        # Remove any cluster shapes that have cluster_m that is not divisible by 2
+        for cs in runtime_cluster_shapes:
+          if cs[0] % 2 == 0:
+            valid_cluster_shapes.append(cs)
+        runtime_cluster_shapes = valid_cluster_shapes
+      kernel_problem_waves = problem_waves
+      if mode == "functional_L0" or mode == "functional_L1":
+        # for functional testing, we want to perturb just a little from even shapes
+        # large K = 8 is chosen such that some kernels will warp around their smem buffers, and some will not
+        # -16 ensures that we are TMA aligned even for FP8/Int8
+        min_k = alignment_ab_max if cta_tile_shape_k == alignment_ab_max else cta_tile_shape_k - alignment_ab_max
+        max_k = (cta_tile_shape_k*8) - alignment_ab_max
+        problem_shapes_k = [min_k, max_k]
+        sm_count = 16
+        swizzle_sizes = [0]
+        # Larger k and less than half wave trigger streamk +separate reduction case to be generated
+        if 'stream_k' in kernel_name:
+          problem_shapes_k = [max_k, cta_tile_shape_k*32]
+          kernel_problem_waves = [0.125, 1.25, 2.5]
+      else:
+        raise ValueError
+      if "void" in kernel_name:
+        beta_values = [0]
+      alignment_shift_m = max(alignment_c, alignment_a)
+      alignment_shift_n = max(alignment_c, alignment_b)
+      is_first_line = True
+      for index_waves, waves in enumerate(kernel_problem_waves):
+        for index_k, k in enumerate(problem_shapes_k):
+          for beta in beta_values:
+            for cluster_shape in runtime_cluster_shapes:
+              for runtime_input_datatype in runtime_input_datatypes:
+                for swizzle_size in swizzle_sizes:
+                  grid_size = waves * sm_count
+                  cluster_shape_m, cluster_shape_n, cluster_shape_k = tuple(cluster_shape)
+                  if cluster_shape_m >= cluster_shape_n:
+                    grid_m = cluster_shape_m
+                    grid_n = grid_size / grid_m
+                    grid_n = max( int((grid_n + cluster_shape_n - 1) / cluster_shape_n) * cluster_shape_n, 1)
+                  else:
+                    grid_n = cluster_shape_n
+                    grid_m = grid_size / grid_n
+                    grid_m = max( int((grid_m + cluster_shape_m - 1) / cluster_shape_m) * cluster_shape_m, 1)
+                  verification_required = False
+                  if mode == "functional_L0" or mode == "functional_L1":
+                    if '_void_' not in kernel_name:
+                      verification_required = True
+                    m = max(int(grid_m * cta_tile_shape_m), alignment_ab_max)
+                    n = max(int(grid_n * cta_tile_shape_n), alignment_ab_max)
+                    k = int(k)
+                    # For functional testing, we want to perturb just a little from even shapes.
+                    # Only do this if the perturbation does not cause one of the dimensions of the
+                    # problem size to go to zero. This can occur for blockscaling kernels for which
+                    # the alignment requirements for A and B can be quite large (e.g., 256).
+                    if m > alignment_shift_m:
+                      m -= alignment_shift_m
+                    if n > alignment_shift_n:
+                      n -= alignment_shift_n
+                    if '_n32t32_' in kernel_name:
+                      continue
+                  batch_count = 1
+                  if mode == "functional_L0" or mode == "functional_L1" :
+                    if index_waves == 0 and index_k == 0 :
+                      batch_count = 3 if mode == "functional_L0" else 5
+                  gemm_op = "gemm"
+                  grouped = is_grouped(manifest.operations_by_name[kernel_name].gemm_kind)
+                  num_groups = 1
+                  if grouped:
+                    gemm_op = "grouped_gemm"
+                    num_groups = 3 # small to limit test time in host block-scaled reference kernels
+                    batch_count = 1
+                  elif "bstensorop" in kernel_name:
+                    gemm_op = "block_scaled_gemm"
+                  elif is_blockwise(manifest.operations_by_name[kernel_name].gemm_kind):
+                    gemm_op = "blockwise_gemm"
+                  problem_size_category = ['smallK','largeK'][index_k] + '_' + ['beta==0','beta!=0'][bool(beta)]
+                  assert m > 0 and n > 0 and k > 0
+                  # Emit per-testcase metadata for perf testing usage, eventually in perf database
+                  metadata_dict = {
+                    "input_params": {
+                      'problem_size_category' : problem_size_category,
+                      'operation' : _getSubOperationType(operation),
+                      'datatype' : data_types,
+                      'layout' : layout3x,
+                      'm' : m,
+                      'n' : n,
+                      'k' : k,
+                      'beta' : beta,
+                      'flops_per_byte' : _computeFlopsPerByte(operation, m, n, k, batch_count, beta, num_groups)
+                    },
+                    "runtime_params": {
+                      'ctas_per_mma_instruction' : ctas_per_mma_instruction,
+                      'tilesize_m' : cta_tile_shape_m,
+                      'tilesize_n' : cta_tile_shape_n,
+                      'tilesize_k' : cta_tile_shape_k,
+                      'cluster_shape_m' : cluster_shape_m,
+                      'cluster_shape_n' : cluster_shape_n,
+                    }
+                  }
+                  cluster_m_fallback = ctas_per_mma_instruction if dynamic_cluster else cluster_shape_m
+                  cluster_n_fallback = 1 if dynamic_cluster else cluster_shape_n
+                  cluster_k_fallback = 1 if dynamic_cluster else cluster_shape_k
+                  if dynamic_datatype:
+                    runtime_datatype_a, runtime_datatype_b = tuple(runtime_input_datatype)
+                    metadata_dict["runtime_params"]["runtime_datatype_a"] = runtime_datatype_a
+                    metadata_dict["runtime_params"]["runtime_datatype_b"] = runtime_datatype_b
+                  testcase_metadata = [
+                    f"cutlass_profiler --operation={gemm_op}" +
+                    (f" --verification-providers=device --providers=cutlass" if profiler_flags_for_verification == "device" else " --mode=trace") +
+                    f" --error-on-no-match --error-if-nothing-is-profiled" +
+                    f" --kernels={kernel_name}" +
+                    f" --m={str(m)}" +
+                    f" --n={str(n)}" +
+                    f" --k={str(k)}" +
+                    (f" --num_groups={str(num_groups)}" if grouped else "") +
+                    f" --cluster_m={str(cluster_shape_m)}" +
+                    f" --cluster_n={str(cluster_shape_n)}" +
+                    f" --cluster_k={str(cluster_shape_k)}" +
+                    f" --cluster_m_fallback={str(cluster_m_fallback)}" +
+                    f" --cluster_n_fallback={str(cluster_n_fallback)}" +
+                    f" --cluster_k_fallback={str(cluster_k_fallback)}" +
+                    f" --beta={str(beta)}" +
+                    ("" if grouped else f" --batch_count={str(batch_count)}") +
+                    f" --swizzle_size={str(swizzle_size)}" +
+                    f" --verification-required={str(verification_required).lower()}"
+                  ] \
+                  output_dynamic_datatype = dynamic_datatype
+                  if output_dynamic_datatype:
+                    testcase_metadata[0] += (f" --runtime_input_datatype_a={runtime_datatype_a}" +
+                                              f" --runtime_input_datatype_b={runtime_datatype_b}")
+                  testcase_metadata.append(json.dumps(metadata_dict))
+                  testlist_csv_rows.append(testcase_metadata)
+                  testcase_counter += 1
+                  alpha = 1.0
+                  if dynamic_datatype:
+                    hashed_kernel_name = transform_hashed_string(hashed_kernel_name, runtime_datatype_a, runtime_datatype_b)
+                  # If kernel_name is new, initialize its feature set with defaults
+                  if hashed_kernel_name not in kernel_features:
+                    kernel_features[hashed_kernel_name] = {
+                      "is_support_dynamic_cluster": False,
+                      "is_support_dynamic_datatype": False,
+                    }
+                  # Update features for the hashed kernel name
+                  kernel_features[hashed_kernel_name]["is_support_dynamic_cluster"] |= dynamic_cluster
+                  kernel_features[hashed_kernel_name]["is_support_dynamic_datatype"] |= dynamic_datatype
+                  if hashed_kernel_name not in auditlist_csv_params_map:
+                    auditlist_csv_params_map[hashed_kernel_name] = []
+                  audit_row_params = get_kernel_params(
+                    operation,
+                    hashed_kernel_name,
+                    (cluster_shape_m, cluster_shape_n, cluster_shape_k),
+                    (cluster_m_fallback, cluster_n_fallback, cluster_k_fallback),
+                    (m, n, k, batch_count),
+                    alpha, beta,
+                    dynamic_datatype, dynamic_cluster
+                  )
+                  auditlist_csv_params_map[hashed_kernel_name].append(audit_row_params)
+                  if hashed_kernel_name not in auditlist_csv_map:
+                    audit_row = get_kernel_features(operation, hashed_kernel_name, dynamic_datatype, runtime_input_datatype)
+                    auditlist_csv_map[hashed_kernel_name] = audit_row
+  with open(outfile_name, 'w') as testlist_csv:
+    csv_writer = csv.writer(testlist_csv, delimiter=',')
+    csv_writer.writerow(testlist_csv_fields)
+    csv_writer.writerows(testlist_csv_rows)
+  with open(audit_file_name, 'w') as auditlist_csv:
+    csv_writer = csv.writer(auditlist_csv, delimiter=',')
+    csv_writer.writerow(audit_csv_fields)
+    for hashed_kernel_name, row in auditlist_csv_map.items():
+      # Append the dynamic features as "Y" or "N"
+      dynamic_cluster_flag = "Y" if kernel_features[hashed_kernel_name]["is_support_dynamic_cluster"] else "N"
+      dynamic_datatype_flag = "Y" if kernel_features[hashed_kernel_name]["is_support_dynamic_datatype"] else "N"
+      test_count = len(auditlist_csv_params_map[hashed_kernel_name])
+      csv_writer.writerow(row + [dynamic_cluster_flag, dynamic_datatype_flag, test_count])
+  with open(audit_file_params_name, 'w') as auditlist_csv:
+    csv_writer = csv.writer(auditlist_csv, delimiter=',')
+    csv_writer.writerow(audit_csv_runtime_fields)
+    for kernel_index, (hashed_kernel_name, rows) in enumerate(auditlist_csv_params_map.items(), start=1):
+      for i, row in enumerate(rows):
+        if i == 0:
+          csv_writer.writerow([kernel_index, hashed_kernel_name] + row)
+        else:
+          csv_writer.writerow(["", ""] + row)
+  print(f"Generated a total of {testcase_counter} test cases for {kernels_emitted} kernels out of {kernels_total} total.")
+  # Generate a newline separated list of kernel filters
+  assert(len(kernel_name_set) == kernels_emitted)
+  output_filter_enabled = True
+  if output_filter_enabled:
+    kernel_filter_outfile_name = os.path.join(curr_build_dir, f"FK_{mode}_testlist_SM{arch}_cutlass3x_gemm_kernel_filter.list")
+  with open(kernel_filter_outfile_name, "w") as file:
+      kernel_name_set = set(map(lambda x: x.replace("_epi_tma", ""), kernel_name_set))
+      for kernel_name in kernel_name_set:
+          file.write(kernel_name + "\n")
+  # Sort L0 and L1 kernel list and csv file to avoid mixing cutlass3.x kernels and sm120_mma kernels in cutlass2.x generated together.
+  if mode == "functional_L0" or mode == "functional_L1":
+    # Sort the .csv file
+    outfile_name = os.path.join(curr_build_dir, f"FK_{mode}_testlist_SM{arch}_cutlass3x_gemm.csv")
+    with open(outfile_name) as file:
+      data = file.readlines()
+      data.sort()
+    with open(outfile_name, 'w') as file:
+      for i in range(len(data)):
+        file.write(data[i])
+    # Sort the kernel list
+    kernel_filter_outfile_name = os.path.join(curr_build_dir, f"FK_{mode}_testlist_SM{arch}_cutlass3x_gemm_kernel_filter.list")
+    with open(kernel_filter_outfile_name) as file:
+      data = file.readlines()
+      data.sort()
+    with open(kernel_filter_outfile_name, 'w') as file:
+      for i in range(len(data)):
+        file.write(data[i])

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/gemm_operation.py ADDED Viewed

	@@ -0,0 +1,1613 @@

+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for emitting GEMM kernels
+"""
+import collections
+import enum
+import functools
+import logging
+import operator
+import os.path
+import shutil
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+_LOGGER = logging.getLogger(__name__)
+###################################################################################################
+#
+# Data structure modeling a GEMM operation
+#
+###################################################################################################
+#
+class GemmOperation:
+  #
+  def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, D = None,
+      kernel_schedule = KernelScheduleType.ScheduleAuto, epilogue_schedule = EpilogueScheduleType.ScheduleAuto,
+      tile_scheduler = TileSchedulerType.Default, mixed_input_mode = None, mixed_input_shuffle = False,
+      ScaleFactorA = None, ScaleFactorB = None, ScaleFactorD = None,
+      ScaleFactorMVecSize = None, ScaleFactorNVecSize = None, ScaleFactorKVecSize = None):
+    kinds_3x = {
+      GemmKind.Universal3x,
+      GemmKind.SparseUniversal3x,
+      GemmKind.BlockScaledUniversal3x,
+      GemmKind.GroupedUniversal3x,
+      GemmKind.GroupedBlockScaledUniversal3x,
+      GemmKind.BlockwiseUniversal3x,
+      GemmKind.GroupedBlockwiseUniversal3x,
+    }
+    self.is_3x = gemm_kind in kinds_3x
+    self.prefix = "3x" if self.is_3x else ""
+    self.operation_kind = OperationKind.Gemm
+    self.arch = arch
+    self.tile_description = tile_description
+    self.gemm_kind = gemm_kind
+    self.A = A
+    self.B = B
+    self.C = C
+    self.D = D
+    if is_block_scaled(gemm_kind):
+      self.ScaleFactorA = ScaleFactorA
+      self.ScaleFactorB = ScaleFactorB
+      self.ScaleFactorD = ScaleFactorD["tensor"]
+      self.ScaleFactorVectorSize = ScaleFactorD["vector_size"]
+    if is_blockwise(gemm_kind):
+      self.ScaleFactorMVecSize = ScaleFactorMVecSize
+      self.ScaleFactorNVecSize = ScaleFactorNVecSize
+      self.ScaleFactorKVecSize = ScaleFactorKVecSize
+    if self.D == None:
+      self.D = self.C
+    if not self.is_3x:
+      assert(kernel_schedule == KernelScheduleType.ScheduleAuto)
+      assert(epilogue_schedule == EpilogueScheduleType.ScheduleAuto)
+    self.kernel_schedule = kernel_schedule
+    self.epilogue_schedule = epilogue_schedule
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    if self.is_3x and epilogue_functor == EpilogueFunctor.LinearCombination:
+      self.epilogue_functor = EpilogueFunctor3x.LinearCombination
+    self.swizzling_functor = swizzling_functor
+    self.tile_scheduler = tile_scheduler
+    # Only enable mixed input mode and mixed input shuffle for Hopper
+    self.mixed_input_mode = None
+    if self.is_mixed_input() and self.arch >= 90 and self.arch < 100:
+      self.mixed_input_mode = mixed_input_mode
+    self.mixed_input_shuffle = (self.mixed_input_mode is not None) and mixed_input_shuffle
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex,
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+  #
+  def is_mixed_input(self):
+    return self.A.element != self.B.element
+  #
+  def is_planar_complex(self):
+    return self.gemm_kind in (GemmKind.PlanarComplex, GemmKind.PlanarComplexArray)
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+    if self.is_complex():
+      return get_complex_from_real(accum)
+    return accum
+  #
+  def short_math_name(self):
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    inst_shape = ''
+    inst_operation = ''
+    intermediate_type = ''
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+      MathOperation.and_popc: 'and',
+      MathOperation.multiply_add_fast_accum: 'fastaccum',
+    }
+    tensor_ops = [
+      OpcodeClass.TensorOp,
+      OpcodeClass.WmmaTensorOp,
+      OpcodeClass.SparseTensorOp,
+      OpcodeClass.BlockScaledTensorOp,
+    ]
+    is_tensor_op = self.tile_description.math_instruction.opcode_class in tensor_ops
+    if is_tensor_op:
+      math_op = self.tile_description.math_instruction.math_operation
+      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
+      inst_shape = "{0}{1}{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) if not self.is_3x else ""
+      inst_shape += math_op_string
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+    short_math_name = self.short_math_name() if not self.is_3x else ""
+    return "%s%s%s%s" % (short_math_name, inst_shape, intermediate_type, GemmKindNames[self.gemm_kind])
+  # Generates a string representing the MMA instruction.
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    element_sfa = ""
+    element_sfb = ""
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.is_mixed_input():
+        extended_name = "${core_name}_${element_a}_${element_b}"
+        if self.C.element != self.tile_description.math_instruction.element_accumulator:
+          extended_name = "${element_c}_" + extended_name
+      elif is_blockwise(self.gemm_kind):
+        extended_name = "${core_name}_${element_sfa}x${element_a}_${element_sfb}x${element_b}"
+        element_sfa = DataTypeNames[self.accumulator_type()]
+        element_sfb = DataTypeNames[self.accumulator_type()]
+      else:
+        extended_name = "${core_name}"
+        if self.C.element != self.tile_description.math_instruction.element_accumulator:
+          extended_name = "${element_c}_" + extended_name
+        if self.A.element != self.tile_description.math_instruction.element_accumulator:
+          extended_name += "_${element_a}"
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_sfa' : element_sfa,
+      'element_b': DataTypeNames[self.B.element],
+      'element_sfb' : element_sfb,
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+    return extended_name
+  #
+  def mixed_input_mode_name(self):
+    mode_name_mapping = {
+      MixedInputMode.ConvertOnly: "_cvt",
+      MixedInputMode.ScaleOnly: "_scl",
+      MixedInputMode.ScaleWithZeroPoint: "_sclzr"
+    }
+    mode_name = mode_name_mapping.get(self.mixed_input_mode, "")
+    if self.mixed_input_shuffle:
+      mode_name = mode_name + "_shfl"
+    return mode_name
+  def extended_name_3x(self):
+    '''Generates a string representing the MMA atom. Assumes accumulator type is C type.'''
+    extended_name = "{core_name}_{element_a}_{element_b}_{element_acc}_{element_c}_{element_d}".format(
+      element_a = DataTypeNames[self.A.element],
+      element_b = DataTypeNames[self.B.element],
+      element_acc = DataTypeNames[self.accumulator_type()],
+      element_c = DataTypeNames[self.C.element],
+      element_d = DataTypeNames[self.D.element],
+      core_name = self.core_name())
+    if is_block_scaled(self.gemm_kind):
+      d_type_names = DataTypeNames[self.D.element]
+      if self.ScaleFactorD.element != DataType.void:
+        d_type_names = DataTypeNames[self.ScaleFactorD.element] + "x" + d_type_names
+      extended_name = "{core_name}_{element_sfa}x{element_a}_{element_sfb}x{element_b}_{element_acc}_{element_c}_{element_d}".format(
+        element_sfa = DataTypeNames[self.ScaleFactorA],
+        element_a = DataTypeNames[self.A.element],
+        element_sfb = DataTypeNames[self.ScaleFactorB],
+        element_b = DataTypeNames[self.B.element],
+        element_acc = DataTypeNames[self.accumulator_type()],
+        element_c = DataTypeNames[self.C.element],
+        element_d = d_type_names,
+        core_name = self.core_name())
+    if is_blockwise(self.gemm_kind):
+      d_type_names = DataTypeNames[self.D.element]
+      extended_name = "{core_name}_{sfvec_m_size}x{sfvec_k_size}{element_sfa}x{element_a}_{sfvec_n_size}x{sfvec_k_size}{element_sfb}x{element_b}_{element_acc}_{element_c}_{element_d}".format(
+        element_sfa = DataTypeNames[self.accumulator_type()],
+        element_a = DataTypeNames[self.A.element],
+        element_sfb = DataTypeNames[self.accumulator_type()],
+        element_b = DataTypeNames[self.B.element],
+        element_acc = DataTypeNames[self.accumulator_type()],
+        element_c = DataTypeNames[self.C.element],
+        element_d = d_type_names,
+        sfvec_m_size = self.ScaleFactorMVecSize,
+        sfvec_n_size = self.ScaleFactorNVecSize,
+        sfvec_k_size = self.ScaleFactorKVecSize,
+        core_name = self.core_name())
+    if self.mixed_input_mode != None:
+      extended_name = extended_name + self.mixed_input_mode_name()
+    return extended_name
+  def datatype_name_3x(self):
+    '''Generates a string representing the MMA atom. Assumes accumulator type is C type.'''
+    datatype_name = "{element_a}_{element_b}_{element_acc}_{element_c}_{element_d}".format(
+      element_a = DataTypeNames[self.A.element],
+      element_b = DataTypeNames[self.B.element],
+      element_acc = DataTypeNames[self.accumulator_type()],
+      element_c = DataTypeNames[self.C.element],
+      element_d = DataTypeNames[self.D.element])
+    return datatype_name
+  # Generates a short string representing the AB layout tags (e.g. nt or tn)
+  def layout_name(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "%s%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)],
+        ShortComplexLayoutNames[(self.B.layout, self.B.complex_transform)]
+      )
+    return "%s%s" % (ShortLayoutTypeNames[self.A.layout], ShortLayoutTypeNames[self.B.layout])
+  # Generates a short string representing the ABC layout tags (e.g. ntn or tnn)
+  def layout_name_3x(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "{}{}{}".format(
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)],
+        ShortComplexLayoutNames[(self.B.layout, self.B.complex_transform)],
+        ShortComplexLayoutNames[(self.C.layout, self.C.complex_transform)])
+    else:
+      return "{}{}{}".format(
+        ShortLayoutTypeNames[self.A.layout],
+        ShortLayoutTypeNames[self.B.layout],
+        ShortLayoutTypeNames[self.C.layout])
+  # Generates a short string representing underlying kernel schedule type
+  def kernel_schedule_name_3x(self):
+    return KernelScheduleSuffixes[self.kernel_schedule]
+  # Generates a short string representing underlying epilogue schedule type
+  def epilogue_schedule_name_3x(self):
+    if is_block_scaled(self.gemm_kind):
+      if self.ScaleFactorD.element != DataType.void:
+        return EpilogueScheduleSuffixes[self.epilogue_schedule] + "_epiVs" + str(self.ScaleFactorVectorSize)+ShortLayoutTypeNames[self.ScaleFactorD.layout]
+    return EpilogueScheduleSuffixes[self.epilogue_schedule]
+  # Generate a short string representing the operation class
+  def opcode_class_name(self):
+    return OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+  def get_collective_tile_shape(self):
+    """
+    Get the tile shape passed to the collective builder.
+    On Blackwell, this is different than the operation.tile_description.tile_shape.
+    """
+    is_sm100_kernel = (self.arch == 100 or self.arch == 103)
+    if not is_sm100_kernel:
+      return self.tile_description.tile_shape
+    opcode_class_main = self.tile_description.math_instruction.opcode_class
+    instruction_shape = self.tile_description.math_instruction.instruction_shape
+    tile_shape_m, tile_shape_n, tile_shape_k = self.tile_description.tile_shape
+    if opcode_class_main in [OpcodeClass.TensorOp, OpcodeClass.BlockScaledTensorOp, OpcodeClass.SparseTensorOp]:
+      tile_shape_m = instruction_shape[0]
+      tile_shape_n = instruction_shape[1]
+    return (tile_shape_m, tile_shape_n, tile_shape_k)
+  # Generates the full kernel function name
+  def procedural_name(self):
+    return self._procedural_name
+  @functools.cached_property
+  def _procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+    if self.arch >= 90:
+      kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}{ct}{cs}_{l}_{s}_align{al}{t}{k}{e}"
+      tile_shape = self.get_collective_tile_shape()
+      return kernel_name_template.format(
+          p = self.prefix,
+          ar = self.arch,
+          op = opcode_class_name,
+          ex = self.extended_name_3x(),
+          ct = '_' + 'x'.join([str(i) for i in tile_shape]) if tile_shape[0] > 0 else "",
+          cs = '_' + 'x'.join([str(i) for i in self.tile_description.cluster_shape]),
+          l = self.tile_description.stages,
+          s = self.layout_name_3x(),
+          al = str(max(self.A.alignment, self.B.alignment)),
+          t = TileSchedulerSuffixes[self.tile_scheduler],
+          k = self.kernel_schedule_name_3x(),
+          e = self.epilogue_schedule_name_3x())
+    else:
+      threadblock = self.tile_description.procedural_name()
+      return "cutlass{p}_{op}_{ex}_{tb}_{l}_align{a}".format(
+          p = self.prefix,
+          op = opcode_class_name,
+          ex = self.extended_name(),
+          tb = threadblock,
+          l = self.layout_name(),
+          a = str(max(self.A.alignment, self.B.alignment)))
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.procedural_name()
+  def __hash__(self):
+    return hash(self.configuration_name())
+  def __eq__(self, other):
+    return self.configuration_name() == other.configuration_name()
+###################################################################################################
+#
+# Data structure modeling a grouped GEMM operation
+#
+###################################################################################################
+#
+class GroupedGemmOperation(GemmOperation):
+  #
+  def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+      scheduler_mode = GroupScheduleMode.Device):
+    super().__init__(gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
+                     epilogue_functor, swizzling_functor)
+    self.scheduler_mode = scheduler_mode
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    base = super().procedural_name()
+    return SubstituteTemplate(
+      base + "_schedule${schedule}",
+      {
+        'schedule': ShortGroupScheduleModeNames[self.scheduler_mode]
+      })
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+#
+class EmitGemmInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = []
+    self.gemm_template = """
+  // Gemm operator ${operation_name}
+  using Operation_${operation_name} = cutlass::gemm::device::Gemm<
+    ${element_a}, ${layout_a},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    false,
+    ${math_operation}
+    ${residual}
+  >;
+"""
+    self.gemm_complex_template = """
+  // Gemm operator ${operation_name}
+  using Operation_${operation_name} = cutlass::gemm::device::GemmComplex<
+    ${element_a}, ${layout_a},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${transform_a},
+    ${transform_b},
+    ${math_operation}
+    ${residual}
+  >;
+"""
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<Operation_${operation_name}>("${operation_name}"));
+${compile_guard_end}
+"""
+  #
+  def emit(self, operation):
+    warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)]
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+    residual = ''
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'residual': residual
+    }
+    template = self.gemm_complex_template if operation.is_complex() else self.gemm_template
+    return SubstituteTemplate(template, values)
+###################################################################################################
+class EmitSparseGemmInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = []
+    self.gemm_template = """
+  // Gemm operator ${operation_name}
+  using Operation_${operation_name} = cutlass::gemm::device::SparseGemm<
+    ${element_a}, ${layout_a},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    false,
+    ${math_operation}
+    ${residual}
+  >;
+"""
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<Operation_${operation_name}>("${operation_name}"));
+${compile_guard_end}
+"""
+  #
+  def emit(self, operation):
+    warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)]
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+    residual = ''
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'residual': residual
+    }
+    template = self.gemm_template
+    return SubstituteTemplate(template, values)
+###################################################################################################
+#
+class EmitGemmUniversalInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = [
+      "cutlass/cutlass.h",
+      "cutlass/numeric_types.h",
+      "cutlass/arch/arch.h",
+      "cutlass/arch/mma.h",
+      "cutlass/layout/matrix.h",
+      "cutlass/gemm/device/gemm.h",
+      "cutlass/gemm/device/gemm_universal_adapter.h",
+      "cutlass/gemm/kernel/default_gemm_universal.h",
+    ]
+    self.builtin_epilogue_functor_template = """
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >
+"""
+    self.gemm_template = """
+// Gemm operator ${operation_name}
+using ${operation_name}_base =
+  typename cutlass::gemm::kernel::DefaultGemmUniversal<
+    ${element_b}, ${layout_b}, ${transform_b}, ${align_b},    // transposed B operand
+    ${element_a}, ${layout_a}, ${transform_a}, ${align_a},    // transposed A operand
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor},
+    ${swizzling_functor},
+    ${stages},
+    ${math_operation}
+>::GemmKernel;
+// Define named type
+struct ${operation_name}${operation_suffix} :
+  public ${operation_name}_base { };
+"""
+    self.gemm_template_interleaved = """
+// Gemm operator ${operation_name}
+using ${operation_name}_base =
+  typename cutlass::gemm::kernel::DefaultGemmUniversal<
+    ${element_a}, ${layout_a}, ${transform_a}, ${align_a},
+    ${element_b}, ${layout_b}, ${transform_b}, ${align_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor},
+    ${swizzling_functor},
+    ${stages},
+    ${math_operation}
+>::GemmKernel;
+// Define named type
+struct ${operation_name}${operation_suffix} :
+  public ${operation_name}_base { };
+"""
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<
+      cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
+    >("${operation_name}"));
+${compile_guard_end}
+"""
+  #
+  def emit(self, operation):
+    threadblock_shape = operation.tile_description.threadblock_shape
+    warp_count = operation.tile_description.warp_count
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+    transpose_layouts = {
+      LayoutType.ColumnMajor: LayoutType.RowMajor,
+      LayoutType.RowMajor: LayoutType.ColumnMajor
+    }
+    if operation.A.layout in transpose_layouts.keys() and \
+      operation.B.layout in transpose_layouts.keys() and \
+      operation.C.layout in transpose_layouts.keys():
+      instance_layout_A = transpose_layouts[operation.A.layout]
+      instance_layout_B = transpose_layouts[operation.B.layout]
+      instance_layout_C = transpose_layouts[operation.C.layout]
+      gemm_template = self.gemm_template
+    else:
+      instance_layout_A, instance_layout_B, instance_layout_C = \
+        (operation.A.layout, operation.B.layout, operation.C.layout)
+      gemm_template = self.gemm_template_interleaved
+    #
+    # Support built-in epilogue functors or user-defined functions
+    if isinstance(operation.epilogue_functor, enum.Enum):
+      epilogue_vector_length = \
+        min(operation.C.alignment * DataTypeSize[operation.C.element], 128) // DataTypeSize[operation.C.element]
+      values = {
+        'epilogue_vector_length': str(epilogue_vector_length),
+        'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+        'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      }
+      epilogue_functor = SubstituteTemplate(self.builtin_epilogue_functor_template, values)
+    else:
+      epilogue_functor = self.epilogue_functor.emit_declaration()
+    #
+    values = {
+      'operation_name': operation.procedural_name(),
+      'operation_suffix': self.operation_suffix,
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[instance_layout_A],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[instance_layout_B],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[instance_layout_C],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_functor': epilogue_functor,
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation]
+    }
+    return SubstituteTemplate(gemm_template, values)
+###################################################################################################
+class EmitGemmUniversal3xInstance:
+  ''' Responsible for emitting a CUTLASS 3.x template definition'''
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = [
+      "cutlass/cutlass.h",
+      "cutlass/gemm/gemm.h",
+      "cutlass/numeric_types.h",
+      "cutlass/gemm/kernel/gemm_universal.hpp",
+      "cutlass/gemm/collective/collective_builder.hpp",
+      "cutlass/epilogue/collective/collective_builder.hpp",
+      "cutlass/detail/blockwise_scale_layout.hpp",
+    ]
+    self.builtin_epilogue_functor_template = \
+"""${epilogue_functor}<
+      ${element_d},
+      ${element_epilogue},
+      ${element_c},
+      ${element_epilogue}
+    >"""
+    self.gemm_template = """
+using ${operation_name}_epilogue =
+  typename cutlass::epilogue::collective::CollectiveBuilder<
+    ${arch}, ${opcode_class_epi},
+    cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
+    cute::Shape<${cluster_shape_m}, ${cluster_shape_n}, ${cluster_shape_k}>,
+    ${epi_tile_mn},
+    ${element_accumulator}, ${element_epilogue},
+    ${element_c}, ${layout_c}, ${align_c},
+    ${element_d}, ${layout_d}, ${align_d},
+    ${epilogue_schedule},
+    ${epilogue_functor}
+  >::CollectiveOp;
+${mixed_dtype_prepare_code}
+${blockwise_prepare_code}
+using ${operation_name}_mainloop =
+  typename cutlass::gemm::collective::CollectiveBuilder<
+    ${arch}, ${opcode_class_main},
+    ${element_a}, ${layout_a}, ${align_a},
+    ${element_b}, ${layout_b}, ${align_b},
+    ${element_accumulator},
+    cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
+    cute::Shape<${cluster_shape_m}, ${cluster_shape_n}, ${cluster_shape_k}>,
+    ${stages},
+    ${kernel_schedule}
+  >::CollectiveOp;
+// Gemm operator ${operation_name}
+using ${operation_name}_base = cutlass::gemm::kernel::GemmUniversal<
+    ${problem_shape},
+    ${operation_name}_mainloop,
+    ${operation_name}_epilogue,
+    ${tile_scheduler}>;
+// Define named type
+struct ${operation_name} :
+  public ${operation_name}_base { };
+"""
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  {
+    using GemmKernel = cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>;
+    manifest.append(
+      new ${gemm_kind}<GemmKernel>("${operation_name}"));
+  }
+${compile_guard_end}
+"""
+  def emit_block_scale_epilogue_functor(self, operation):
+    block_scaled_template = """
+      ${epilogue_functor}<
+        ${epi_vs},
+        ${element_d},
+        ${element_accumulator},
+        ${element_sfd},
+        ${layout_sfd},
+        ${element_c},
+        ${element_scalar}
+      >
+    """
+    block_scaled_values = {
+      'epi_vs'  : str(operation.ScaleFactorVectorSize),
+      'element_d': str(DataTypeTag[operation.D.element]),
+      'element_sfd': str(DataTypeTag[operation.ScaleFactorD.element]),
+      'layout_sfd': LayoutTag[operation.ScaleFactorD.layout],
+      'epilogue_functor': EpilogueFunctor3xTag[EpilogueFunctor3x.LinearCombinationBlockScaleFactor],
+      'element_accumulator': str(DataTypeTag[operation.accumulator_type()]),
+      'element_scalar': str(DataTypeTag[operation.accumulator_type()]),
+      'element_c': str(DataTypeTag[operation.C.element]),
+    }
+    return SubstituteTemplate(block_scaled_template, block_scaled_values)
+  @staticmethod
+  def pointerize_if_grouped(operation, layout):
+    return layout if not is_grouped(operation.gemm_kind) else layout + "* "
+  @staticmethod
+  def transform_layout_A_if_blockwise(operation, layout):
+    layout_sfa = f"{operation.procedural_name()}_LayoutSFA"
+    layout_sfa = layout_sfa if not is_grouped(operation.gemm_kind) else layout_sfa + "* "
+    return layout if not is_blockwise(operation.gemm_kind) else f"cute::tuple<{layout}, {layout_sfa}>"
+  @staticmethod
+  def transform_layout_B_if_blockwise(operation, layout):
+    layout_sfb = f"{operation.procedural_name()}_LayoutSFB"
+    layout_sfb = layout_sfb if not is_grouped(operation.gemm_kind) else layout_sfb + "* "
+    return layout if not is_blockwise(operation.gemm_kind) else f"cute::tuple<{layout}, {layout_sfb}>"
+  @staticmethod
+  def problem_shape(operation):
+    gemm_shape_type = "cute::Shape<int,int,int,int>"
+    grouped_gemm_shape_type = "cute::Shape<int,int,int>"
+    grouped_gemm_shape_type = "cutlass::gemm::GroupProblemShape<" + grouped_gemm_shape_type + ">"
+    return gemm_shape_type if not is_grouped(operation.gemm_kind) else grouped_gemm_shape_type
+  def emit(self, operation):
+    _LOGGER.debug("*** EmitGemmConfigurationLibrary::emit(operation)")
+    _LOGGER.debug("***   operation.procedural_name(): " + operation.procedural_name())
+    _LOGGER.debug("***   tile_shape: " + str(operation.tile_description.tile_shape))
+    _LOGGER.debug("***   warp_count: " + str(operation.tile_description.warp_count))
+    opcode_class_main = operation.tile_description.math_instruction.opcode_class
+    opcode_class_epi = opcode_class_main
+    tile_shape = operation.tile_description.tile_shape
+    instruction_shape = operation.tile_description.math_instruction.instruction_shape
+    cluster_m = operation.tile_description.cluster_shape[0]
+    cluster_n = operation.tile_description.cluster_shape[1]
+    cta_n = tile_shape[1] // cluster_n if cluster_n > 0 else tile_shape[1]
+    tile_shape_m, tile_shape_n, tile_shape_k = operation.get_collective_tile_shape()
+    # stage count set to zero indicates builder automatic stage selection
+    if operation.tile_description.stages > 0:
+      stage_count_string = f"cutlass::gemm::collective::StageCount<{str(operation.tile_description.stages)}>"
+    elif opcode_class_main == OpcodeClass.SparseTensorOp and operation.arch == 100:
+      stage_count_string = f"cutlass::gemm::collective::StageCountAutoCarveoutEpi<{str(operation.procedural_name())}_epilogue>"
+    else:
+      stage_count_string = f"cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename {str(operation.procedural_name())}_epilogue::SharedStorage))>"
+    epi_tile_mn = "cutlass::epilogue::collective::EpilogueTileAuto"
+    instance_layout_A, instance_layout_B, instance_layout_C , instance_layout_D = \
+      (operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout)
+    # 3.0 profiler integration only supports trivial epilogues for now
+    epilogue_vector_length = 1
+    # Support built-in epilogue functors or user-defined functions
+    if isinstance(operation.epilogue_functor, enum.Enum):
+      values = {
+        'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+        'epilogue_functor': EpilogueFunctor3xTag[operation.epilogue_functor],
+      }
+      epilogue_functor = SubstituteTemplate(self.builtin_epilogue_functor_template, values)
+      if is_block_scaled(operation.gemm_kind) and operation.ScaleFactorD.element != DataType.void:
+        epilogue_functor =  self.emit_block_scale_epilogue_functor(operation)
+    else:
+      epilogue_functor = self.epilogue_functor.emit_declaration()
+      if is_block_scaled(operation.gemm_kind) and operation.ScaleFactorD.element != DataType.void:
+        epilogue_functor =  self.emit_block_scale_epilogue_functor(operation)
+    #
+    # Cutlass3x complex kernels' ElementA(B) is a tuple in collective mainloop builder, e.g. cute::tuple<Element, Transform>, Transform : cute::identity / cute::conjugate.
+    element_a = DataTypeTag[operation.A.element] if not operation.is_complex() else f"cute::tuple<{str(DataTypeTag[operation.A.element])},{str(ComplexTransformTag3x[operation.A.complex_transform])}>"
+    element_b = DataTypeTag[operation.B.element] if not operation.is_complex() else f"cute::tuple<{str(DataTypeTag[operation.B.element])},{str(ComplexTransformTag3x[operation.B.complex_transform])}>"
+    epilogue_schedule_type = EpilogueScheduleTag[operation.epilogue_schedule]
+    if opcode_class_main == OpcodeClass.BlockScaledTensorOp:
+      grouped = is_grouped(operation.gemm_kind)
+      if cta_n == 256 and operation.kernel_schedule == to_grouped_schedule(KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100, grouped):
+        epi_tile_mn = "cute::Shape<cute::_128,cute::_64>"
+        if is_tma_epilogue(operation.epilogue_schedule):
+          epilogue_schedule_type = EpilogueScheduleTag[to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized1Sm, grouped)]
+      if cta_n == 256 and operation.kernel_schedule == to_grouped_schedule(KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100, grouped):
+        epi_tile_mn = "cute::Shape<cute::_128,cute::_64>"
+        if is_tma_epilogue(operation.epilogue_schedule):
+          epilogue_schedule_type = EpilogueScheduleTag[to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized2Sm, grouped)]
+      # SM103 FP4 Ultra
+      is_sm103_fp4_ultra_1sm_kernel_schedule = operation.kernel_schedule in [to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch, grouped)
+                                                                             ]
+      is_sm103_fp4_ultra_2sm_kernel_schedule = operation.kernel_schedule in [to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch, grouped),
+                                                                             to_grouped_schedule(KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch, grouped)
+                                                                             ]
+      if cta_n == 256 and is_sm103_fp4_ultra_1sm_kernel_schedule:
+        epi_tile_mn = "cute::Shape<cute::_128,cute::_64>"
+        if is_tma_epilogue(operation.epilogue_schedule):
+          epilogue_schedule_type = EpilogueScheduleTag[to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized1Sm, grouped)]
+      if cta_n == 256 and is_sm103_fp4_ultra_2sm_kernel_schedule:
+        epi_tile_mn = "cute::Shape<cute::_128,cute::_64>"
+        if is_tma_epilogue(operation.epilogue_schedule):
+          epilogue_schedule_type = EpilogueScheduleTag[to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized2Sm, grouped)]
+      element_a = f'cute::tuple<{str(element_a)},{str(DataTypeTag[operation.ScaleFactorA])}>'
+      element_b = f'cute::tuple<{str(element_b)},{str(DataTypeTag[operation.ScaleFactorB])}>'
+    alignment_c = get_tma_alignment(operation.C.element) \
+                  if is_tma_epilogue(operation.epilogue_schedule) and opcode_class_epi != OpcodeClass.Simt \
+                  else operation.C.alignment
+    alignment_d = get_tma_alignment(operation.D.element) \
+                  if is_tma_epilogue(operation.epilogue_schedule) and opcode_class_epi != OpcodeClass.Simt \
+                  else operation.D.alignment
+    operation_name_str = operation.procedural_name()
+    layout_a_str = LayoutTag[instance_layout_A]
+    layout_b_str = LayoutTag[instance_layout_B]
+    mixed_dtype_prepare_code = ""
+    if operation.mixed_input_mode != None:
+      A_dtype = operation.A.element
+      B_dtype = operation.B.element
+      A_dtype_bits = DataTypeSize[A_dtype]
+      B_dtype_bits = DataTypeSize[B_dtype]
+      is_A_dtype_narrow = A_dtype_bits < B_dtype_bits
+      if is_A_dtype_narrow:
+        narrow_dtype, wide_dtype = (A_dtype, B_dtype)
+        narrow_dtype_bits, wide_dtype_bits = (A_dtype_bits, B_dtype_bits)
+      else:
+        narrow_dtype, wide_dtype = (B_dtype, A_dtype)
+        narrow_dtype_bits, wide_dtype_bits = (B_dtype_bits, A_dtype_bits)
+      narrow_tag = DataTypeTag[narrow_dtype]
+      wide_tag   = DataTypeTag[wide_dtype]
+      scale_tag  = DataTypeTag[wide_dtype]
+      zero_tag   = DataTypeTag[wide_dtype]
+      do_shuffle = False
+      value_shuffle_str = ""
+      if narrow_dtype_bits == 4 and wide_dtype_bits == 16:
+        value_shuffle_str = "cute::Layout<cute::Shape<cute::_2,cute::_4>, cute::Stride<cute::_4,cute::_1>>"
+        do_shuffle = True
+      if narrow_dtype_bits == 8 and wide_dtype_bits == 16:
+        value_shuffle_str = "cute::Layout<cute::Shape<cute::_2,cute::_2>, cute::Stride<cute::_2,cute::_1>>"
+        do_shuffle = True
+      do_shuffle = operation.mixed_input_shuffle and do_shuffle
+      if do_shuffle:
+        if is_A_dtype_narrow:
+          stride_narrow_str = f"cutlass::detail::TagToStrideA_t<{layout_a_str}>"
+          layout_a_str = f"{operation_name_str}_LayoutNarrowReordered"
+        else:
+          stride_narrow_str = f"cutlass::detail::TagToStrideB_t<{layout_b_str}>"
+          layout_b_str = f"{operation_name_str}_LayoutNarrowReordered"
+        # The {operation_name_str}_ prefixs in mixed_dtype_prepare_code and
+        # layout_{a, b}_str are to prevent errors in Windows platform unity build
+        mixed_dtype_prepare_code = f"""
+using {operation_name_str}_StrideNarrow = {stride_narrow_str};
+using {operation_name_str}_ValueShuffle = {value_shuffle_str};
+static constexpr int {operation_name_str}_NumShuffleAtoms = 1;
+using {operation_name_str}_MmaAtomShape = cute::Layout<cute::Shape<cute::_1, cute::Int<{operation_name_str}_NumShuffleAtoms>>>;
+using {operation_name_str}_LayoutAtomQuant = decltype(cutlass::compute_memory_reordering_atom<{wide_tag}, {operation_name_str}_MmaAtomShape, {operation_name_str}_ValueShuffle>());
+using {operation_name_str}_LayoutNarrowReordered = decltype(cute::tile_to_shape({operation_name_str}_LayoutAtomQuant{{}}, cute::Layout<cute::Shape<int,int,int>, {operation_name_str}_StrideNarrow>{{}}));
+        """
+      mixed_input_modes_to_element = {
+        MixedInputMode.ConvertOnly: narrow_tag,
+        MixedInputMode.ScaleOnly: f"cute::tuple<{narrow_tag}, {scale_tag}>",
+        MixedInputMode.ScaleWithZeroPoint: f"cute::tuple<{narrow_tag}, {scale_tag}, {zero_tag}>"
+      }
+      narrow_element = mixed_input_modes_to_element.get(operation.mixed_input_mode, narrow_tag)
+      if narrow_dtype == DataType.s4 and (wide_dtype == DataType.e4m3 or wide_dtype == DataType.e5m2):
+        narrow_element = f"cute::tuple<{narrow_tag}, cutlass::Array<{scale_tag}, 8>>"
+      if is_A_dtype_narrow:
+        element_a = narrow_element
+      else:
+        element_b = narrow_element
+    blockwise_prepare_code = ""
+    if is_blockwise(operation.gemm_kind):
+      sfm_vec_size = operation.ScaleFactorMVecSize
+      sfn_vec_size = operation.ScaleFactorNVecSize
+      sfk_vec_size = operation.ScaleFactorKVecSize
+      blockwise_prepare_code = f"""
+using {operation_name_str}_ScaleConfig = cutlass::detail::Sm{operation.arch}BlockwiseScaleConfig<{sfm_vec_size}, {sfn_vec_size}, {sfk_vec_size}>;
+using {operation_name_str}_LayoutSFA = decltype({operation_name_str}_ScaleConfig::deduce_layoutSFA());
+using {operation_name_str}_LayoutSFB = decltype({operation_name_str}_ScaleConfig::deduce_layoutSFB());
+      """
+    values = {
+      'operation_name': operation_name_str,
+      'operation_suffix': self.operation_suffix,
+      'problem_shape': self.problem_shape(operation),
+      'element_a': element_a,
+      'layout_a': self.transform_layout_A_if_blockwise(operation, self.pointerize_if_grouped(operation, layout_a_str)),
+      'element_b': element_b,
+      'layout_b': self.transform_layout_B_if_blockwise(operation, self.pointerize_if_grouped(operation, layout_b_str)),
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': self.pointerize_if_grouped(operation, LayoutTag[instance_layout_C]),
+      'element_d': DataTypeTag[operation.D.element],
+      'layout_d': self.pointerize_if_grouped(operation, LayoutTag[instance_layout_D]),
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class_main': OpcodeClassTag[opcode_class_main],
+      'opcode_class_epi': OpcodeClassTag[opcode_class_epi],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'tile_shape_m': str(tile_shape_m),
+      'tile_shape_n': str(tile_shape_n),
+      'tile_shape_k': str(tile_shape_k),
+      'cluster_shape_m': 'cute::_' + str(operation.tile_description.cluster_shape[0]) if operation.tile_description.cluster_shape[0] > 0 else "int",
+      'cluster_shape_n': 'cute::_' + str(operation.tile_description.cluster_shape[1]) if operation.tile_description.cluster_shape[1] > 0 else "int",
+      'cluster_shape_k': 'cute::_' + str(operation.tile_description.cluster_shape[2]) if operation.tile_description.cluster_shape[2] > 0 else "int",
+      'instruction_shape_m': str(instruction_shape[0]),
+      'instruction_shape_n': str(instruction_shape[1]),
+      'instruction_shape_k': str(instruction_shape[2]),
+      'kernel_schedule' : str(KernelScheduleTag[operation.kernel_schedule]),
+      'epilogue_schedule' : str(epilogue_schedule_type),
+      'epi_tile_mn' : epi_tile_mn,
+      'epilogue_functor': epilogue_functor,
+      'stages': stage_count_string,
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'align_c': str(alignment_c),
+      'align_d': str(alignment_d),
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'tile_scheduler': str(TileSchedulerTag[operation.tile_scheduler]),
+      'mixed_dtype_prepare_code': mixed_dtype_prepare_code,
+      'blockwise_prepare_code' : blockwise_prepare_code
+    }
+    return SubstituteTemplate(self.gemm_template, values)
+###################################################################################################
+#
+class EmitGemmPlanarComplexInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = []
+    self.template = """
+  // Gemm operator ${operation_name}
+  using Operation_${operation_name} = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal<
+    ${element_a}, ${layout_a}, ${transform_a}, ${alignment_a},
+    ${element_b}, ${layout_b}, ${transform_b}, ${alignment_b},
+    ${element_c}, cutlass::layout::RowMajor,
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    cutlass::epilogue::thread::LinearCombinationPlanarComplex<
+      ${element_c},
+      ${alignment_c},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    ${stages},
+    ${math_operator}
+  >::GemmKernel;
+  struct ${operation_name} :
+    public Operation_${operation_name} { };
+"""
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<
+    cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+  #
+  def emit(self, operation):
+    warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)]
+    # exchange and transpose A and B types, layouts, and complex transforms since the C layout is row-major
+    transposed_layout_A = TransposedLayout[operation.A.layout]
+    transposed_layout_B = TransposedLayout[operation.B.layout]
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.B.element],
+      'layout_a': LayoutTag[transposed_layout_B],
+      'transform_a': ComplexTransformTag[operation.B.complex_transform],
+      'alignment_a': str(operation.B.alignment),
+      'element_b': DataTypeTag[operation.A.element],
+      'layout_b': LayoutTag[transposed_layout_A],
+      'transform_b': ComplexTransformTag[operation.A.complex_transform],
+      'alignment_b': str(operation.A.alignment),
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.tile_description.math_instruction.element_accumulator],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'alignment_c': str(operation.C.alignment),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'stages': str(operation.tile_description.stages),
+      'math_operator': 'cutlass::arch::OpMultiplyAdd'
+    }
+    return SubstituteTemplate(self.template, values)
+###################################################################################################
+#
+class EmitGemmPlanarComplexArrayInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = []
+    self.template = """
+  // Gemm operator ${operation_name}
+  using Operation_${operation_name} = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal<
+    ${element_a}, ${layout_a}, ${transform_a}, ${alignment_a},
+    ${element_b}, ${layout_b}, ${transform_b}, ${alignment_b},
+    ${element_c}, cutlass::layout::RowMajor,
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    cutlass::epilogue::thread::LinearCombinationPlanarComplex<
+      ${element_c},
+      ${alignment_c},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    ${stages},
+    ${math_operator}
+  >::GemmArrayKernel;
+  struct ${operation_name} : public Operation_${operation_name} { };
+"""
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<
+    cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+  #
+  def emit(self, operation):
+    warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)]
+    # exchange and transpose A and B types, layouts, and complex transforms since the C layout is row-major
+    transposed_layout_A = TransposedLayout[operation.A.layout]
+    transposed_layout_B = TransposedLayout[operation.B.layout]
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.B.element],
+      'layout_a': LayoutTag[transposed_layout_B],
+      'transform_a': ComplexTransformTag[operation.B.complex_transform],
+      'alignment_a': str(operation.B.alignment),
+      'element_b': DataTypeTag[operation.A.element],
+      'layout_b': LayoutTag[transposed_layout_A],
+      'transform_b': ComplexTransformTag[operation.A.complex_transform],
+      'alignment_b': str(operation.A.alignment),
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.tile_description.math_instruction.element_accumulator],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'alignment_c': str(operation.C.alignment),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'stages': str(operation.tile_description.stages),
+      'math_operator': 'cutlass::arch::OpMultiplyAdd'
+    }
+    return SubstituteTemplate(self.template, values)
+###################################################################################################
+#
+class EmitGemmGroupedInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = [
+      "cutlass/cutlass.h",
+      "cutlass/numeric_types.h",
+      "cutlass/arch/arch.h",
+      "cutlass/arch/mma.h",
+      "cutlass/layout/matrix.h",
+      "cutlass/gemm/device/gemm.h",
+      "cutlass/gemm/kernel/gemm_grouped.h",
+      "cutlass/gemm/kernel/default_gemm_grouped.h",
+      "cutlass/gemm/device/gemm_grouped.h"
+    ]
+    self.builtin_epilogue_functor_template = \
+"""${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >"""
+    self.gemm_template = """
+// Gemm operator ${operation_name}
+using ${operation_name}_base =
+  typename cutlass::gemm::kernel::DefaultGemmGrouped<
+    ${element_a}, ${layout_a}, ${transform_a}, ${align_a},
+    ${element_b}, ${layout_b}, ${transform_b}, ${align_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor},
+    ${swizzling_functor},
+    ${stages},
+    ${scheduler_mode},
+    ${math_operation}
+>::GemmKernel;
+// Define named type
+struct ${operation_name}${operation_suffix} :
+  public ${operation_name}_base { };
+"""
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<
+    cutlass::gemm::device::GemmGrouped<${operation_name}>
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+  #
+  def emit(self, operation):
+    threadblock_shape = operation.tile_description.threadblock_shape
+    warp_count = operation.tile_description.warp_count
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+    transpose_layouts = {
+      LayoutType.ColumnMajor: LayoutType.RowMajor,
+      LayoutType.RowMajor: LayoutType.ColumnMajor
+    }
+    instance_layout_A, instance_layout_B, instance_layout_C = \
+      (operation.A.layout, operation.B.layout, operation.C.layout)
+    #
+    # Support built-in epilogue functors or user-defined functions
+    if isinstance(operation.epilogue_functor, enum.Enum):
+      epilogue_vector_length = \
+        min(operation.C.alignment * DataTypeSize[operation.C.element], 128) // DataTypeSize[operation.C.element]
+      values = {
+        'epilogue_vector_length': str(epilogue_vector_length),
+        'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+        'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      }
+      epilogue_functor = SubstituteTemplate(self.builtin_epilogue_functor_template, values)
+    else:
+      epilogue_functor = self.epilogue_functor.emit_declaration()
+    #
+    values = {
+      'operation_name': operation.procedural_name(),
+      'operation_suffix': self.operation_suffix,
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[instance_layout_A],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[instance_layout_B],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[instance_layout_C],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_functor': epilogue_functor,
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'scheduler_mode': GroupScheduleModeTag[operation.scheduler_mode],
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation]
+    }
+    return SubstituteTemplate(self.gemm_template, values)
+###################################################################################################
+#
+# Emitter functions for all targets
+#
+###################################################################################################
+class EmitGemmConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+    self.instance_emitter = {
+      GemmKind.Gemm: EmitGemmInstance,
+      GemmKind.Sparse: EmitSparseGemmInstance,
+      GemmKind.Universal: EmitGemmUniversalInstance,
+      GemmKind.Universal3x: EmitGemmUniversal3xInstance,
+      GemmKind.SparseUniversal3x: EmitGemmUniversal3xInstance,
+      GemmKind.BlockScaledUniversal3x: EmitGemmUniversal3xInstance,
+      GemmKind.PlanarComplex: EmitGemmPlanarComplexInstance,
+      GemmKind.PlanarComplexArray: EmitGemmPlanarComplexArrayInstance,
+      GemmKind.Grouped: EmitGemmGroupedInstance,
+      GemmKind.GroupedUniversal3x: EmitGemmUniversal3xInstance,
+      GemmKind.GroupedBlockScaledUniversal3x: EmitGemmUniversal3xInstance,
+      GemmKind.BlockwiseUniversal3x: EmitGemmUniversal3xInstance,
+      GemmKind.GroupedBlockwiseUniversal3x: EmitGemmUniversal3xInstance,
+    }
+    self.gemm_kind_wrappers = {
+      GemmKind.Gemm: 'GemmOperation',
+      GemmKind.Sparse: 'GemmSparseOperation',
+      GemmKind.Universal: 'GemmUniversalOperation',
+      GemmKind.Universal3x: 'GemmUniversal3xOperation',
+      GemmKind.SparseUniversal3x: 'SparseGemmUniversal3xOperation',
+      GemmKind.BlockScaledUniversal3x: 'BlockScaledGemmUniversal3xOperation',
+      GemmKind.PlanarComplex: 'GemmPlanarComplexOperation',
+      GemmKind.PlanarComplexArray: 'GemmPlanarComplexArrayOperation',
+      GemmKind.Grouped: 'GemmGroupedOperation',
+      GemmKind.GroupedUniversal3x: 'GroupedGemmUniversal3xOperation',
+      GemmKind.GroupedBlockScaledUniversal3x: 'GroupedBlockScaledGemmUniversal3xOperation',
+      GemmKind.BlockwiseUniversal3x: 'BlockwiseGemmUniversal3xOperation',
+      GemmKind.GroupedBlockwiseUniversal3x: 'GroupedBlockwiseGemmUniversal3xOperation',
+    }
+    self.wmma_guard_start = "#if defined(CUTLASS_ARCH_WMMA_SM${sm_number}_ENABLED)"
+    self.separator = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.header_template = """
+/*
+  Generated by gemm_operation.py - Do not edit.
+*/
+"""
+    self.initialize_function_template = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace library {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+void initialize_${configuration_name}(Manifest &manifest) {
+"""
+    self.epilogue_template = """
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+  def __enter__(self):
+    _LOGGER.debug("*** EmitGemmConfigurationLibrary::__enter__")
+    _LOGGER.debug("***   configuration_path (file to write): " +
+                  str(self.configuration_path))
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+    self.configuration_file.write(self.separator)
+    self.includes = collections.OrderedDict([
+      ("cutlass/cutlass.h", None),
+      ("cutlass/library/library.h", None),
+      ("cutlass/library/manifest.h", None),
+      ("library_internal.h", None),
+      ("gemm_operation.h", None),
+      ("gemm_operation_3x.hpp", None),
+      ("grouped_gemm_operation_3x.hpp", None),
+      ("sparse_gemm_operation_3x.hpp", None),
+      ("block_scaled_gemm_operation_3x.hpp", None),
+      ("blockwise_gemm_operation_3x.hpp", None),
+      ("cutlass/arch/wmma.h", None),
+      ("cutlass/numeric_types.h", None)
+    ])
+    self.instance_definitions = []
+    self.instance_wrappers = []
+    self.operations = []
+    return self
+  def emit(self, operation):
+    _LOGGER.debug("*** EmitGemmConfigurationLibrary::emit(operation)")
+    _LOGGER.debug("***   operation.gemm_kind: " + str(operation.gemm_kind))
+    emitter = self.instance_emitter[operation.gemm_kind]()
+    for incl in emitter.includes:
+      self.includes[incl] = None
+    self.operations.append(operation)
+    self.instance_definitions.append(emitter.emit(operation))
+    self.instance_wrappers.append(SubstituteTemplate(emitter.instance_template(), {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'gemm_kind': self.gemm_kind_wrappers[operation.gemm_kind],
+      'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "",
+      'compile_guard_end': "#endif" \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else ""
+      }))
+  def __exit__(self, exception_type, exception_value, traceback):
+    # Write includes
+    for incl, _ in self.includes.items():
+      include_statement = "#include \"%s\"\n" % incl
+      self.configuration_file.write(include_statement)
+    self.configuration_file.write(self.separator)
+    # Write instance definitions in top-level namespace
+    for instance_definition in self.instance_definitions:
+      self.configuration_file.write(instance_definition)
+    # Add wrapper objects within initialize() function
+    self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+      'configuration_name': self.configuration_name
+      }))
+    for instance_wrapper in self.instance_wrappers:
+      self.configuration_file.write(instance_wrapper)
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+###################################################################################################
+###################################################################################################

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/generator.py ADDED Viewed

The diff for this file is too large to render. See raw diff

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/heuristics.py ADDED Viewed

	@@ -0,0 +1,415 @@

+#################################################################################################
+#
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for selecting CUTLASS library kernels based on problem description
+"""
+import json
+import csv
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+  from cutlass_library.generator import *
+  from cutlass_library.heuristics_provider import *
+except ImportError:
+  from library import *
+  from generator import *
+  from heuristics_provider import *
+try:
+  from .sm90_utils import (
+    get_valid_schedules,
+    generate_data_types_from_math_instruction,
+    fix_alignments,
+  )
+except ImportError:
+  from sm90_utils import (
+    get_valid_schedules,
+    generate_data_types_from_math_instruction,
+    fix_alignments,
+  )
+_LOGGER = logging.getLogger(__name__)  # module logger; `logging` is not imported explicitly here - presumably supplied by the star imports above (confirm)
+dtype_map = {v: k for k, v in DataTypeNames.items()}  # inverse of DataTypeNames: looks up the key that a given name belongs to
+def serialize_heuristics_results_to_json(problems_with_configs, outfile_path):
+  """
+  Utilitiy function to write heuristics results to a json file for debug
+  args:
+    problems_with_configs: List of problems provided to the heuristic, with a list of operations added to each problem dict
+    outfile_path: Outfile path
+  returns:
+    None
+  """
+  pc_copy = problems_with_configs.copy()
+  for p in pc_copy:
+    for k, v in p.items():
+      if isinstance(v, DataType):
+        p[k] = DataTypeNames[v]
+      elif isinstance(v, LayoutType):
+        p[k] = ShortLayoutTypeNames[v]
+    configs = p['configs']
+    for c in configs:
+      for k, v in c.items():
+        if isinstance(v, DataType):
+          c[k] = DataTypeNames[v]
+        elif isinstance(v, LayoutType):
+          c[k] = ShortLayoutTypeNames[v]
+  with open(outfile_path, 'w') as f:
+    json.dump(pc_copy, f, indent=2)
+def get_single_gemm_config(m, n, k, batch_count, layouts, dtypes, alignment_a, alignment_b, voidC=False, use_fast_acc=True, count=1, provider=None):
+  """
+  Get heuristic-suggested GEMM kernel configurations for a single GEMM problem.
+  args:
+    m, n, k: GEMM dimensions
+    batch_count: batch count
+    layouts: tuple of layouts of type LayoutType
+    use_fast_acc: Use fast accumulation for FP8. Ignored for other precisions
+    count: Number of configs to return
+    provider: Heuristics provider to use
+  returns:
+    A list of dictionaries containing the suggested kernel configurations and additional info from the input required to define a Cutlass GemmOperation, with the following keys:
+      - 'cta_tile_m', 'cta_tile_m', 'cta_tile_k': CTA tile size
+      - 'instr_tile_m', 'instr_tile_n', 'instr_tile_k': Instruction tile size
+      - 'stages': kernel pipeline stage count
+      - 'cluster_m', 'cluster_n', 'cluster_k': cluster size
+      - 'layout_a', 'layout_b': input tensor layouts of type LayoutType
+      - 'alignment_a', 'alignment_b': input tensor alignments, in count of elements
+      - 'dtype_a', 'dtype_b', 'dtype_acc': dtypes of a, b, and accumulator, of type DataType
+      - 'swizzle_size' : suggested threadblock swizzle
+      - 'split_k_slices': number of partitions of the k dimension for splitK
+      - 'raster_order': raster order for CTAs over output tiles ('along_m' or 'along_n')
+  """
+  if provider is None:
+    provider = MatmulHeuristics()
+  return provider.get_configs(m, n, k, batch_count, dtypes, layouts, alignment_a, alignment_b, voidC=voidC, use_fast_acc=use_fast_acc, count=count)
+def get_gemm_configs(problems, provider=None, count=1):
+  """
+  Get heuristic-suggested GEMM kernel configurations for a set of GEMM problems.
+  args:
+    problems: List of dictionaries describing GEMM problems with the following keys:
+      - 'm', 'n', 'k': Matrix dimensions (required)
+      - 'dtype_a': Data type of matrix A (required)
+      - 'dtype_b': Data type of matrix B (required)
+      - 'dtype_c': Data type of matrix C (default: None)
+      - 'dtype_d': Data type of matrix D (required)
+      - 'dtype_acc': Compute data type (default 'f32')
+      - 'layout': Operation layout (e.g. 'tnt')
+      - 'alignment_a': Memory access granularity of A, in units of elements (default: 16 bytes equivalent elements)
+      - 'alignment_b': Memory access granularity of B, in units of elements (default: 16 bytes equivalent elements)
+      - 'alpha': Scalar multiplier for A*B (default: 1.0)
+      - 'beta': Scalar multiplier for C (default: 0.0)
+      - 'batch_count': Number of GEMM operations in batch (default: 1)
+      - 'use_fast_acc': Enable fast accumulation for FP8 on Hopper (default: True)
+    provider: Heuristics provider to use
+    count: Number of configurations to return per problem (defualt: 1)
+  returns:
+    A copy of the input dictionary, with key `configs` added containing the selected gemm configs
+  """
+  ret = []
+  for problem in problems:
+    problem = problem.copy()
+    try:
+      m = problem['m']
+      n = problem['n']
+      k = problem['k']
+      dtype_a = problem['dtype_a']
+      dtype_b = problem['dtype_b']
+      dtype_d = problem['dtype_d']
+      layout = problem['layout']
+    except KeyError as e:
+      _LOGGER.error(f"Missing required parameter {e} for problem {problem}")
+      raise
+    operation = problem.get('operation', 'gemm')
+    batch_count = problem.get('batch_count', 1)
+    dtype_acc = problem.get('dtype_acc', 'f32')
+    dtype_c = problem.get('dtype_c', None)
+    alpha = problem.get('alpha', 1.0)
+    beta = problem.get('beta', 0.0)
+    use_fast_acc = problem.get('use_fast_acc', True)
+    if operation != OperationKindNames[OperationKind.Gemm]:
+      raise ValueError(f"Unsupported operation {operation}")
+    if not (len(layout) == 3 and all(c in "nt" for c in layout)):
+      raise ValueError(f"layout must be a 3-character string containing only 'n' or 't', got {layout}")
+    layouts = tuple(LayoutType.RowMajor if l == 't' else LayoutType.ColumnMajor for l in layout)
+    try:
+      dtype_list = [dtype_a.lower(), dtype_b.lower(), dtype_acc.lower(), dtype_c.lower() if dtype_c is not None else dtype_d.lower(), dtype_d.lower()]
+      dtypes = tuple(dtype_map[dt] for dt in dtype_list)
+    except KeyError as dt:
+      _LOGGER.error(f"Unsupported data type: {dt}")
+      raise
+    alignment_a = problem.get('alignment_a', 128 // DataTypeSize[dtypes[0]])
+    alignment_b = problem.get('alignment_b', 128 // DataTypeSize[dtypes[1]])
+    configs = get_single_gemm_config(m, n, k, batch_count, layouts, dtypes, alignment_a, alignment_b, beta==0.0, use_fast_acc, count, provider)
+    problem['configs'] = configs
+    ret.append(problem)
+  return ret
+def generate_sm100_from_heuristics_configs(manifest, cuda_version, kernel_configs):
+  """
+  Generate CUTLASS operations based on the list of configs provided by the heuristic provider
+  args:
+    manifest: manifest argument to which to add operations, or None to just return the operations without a manifest (for pruning an existing manifest)
+    cuda_version: Cuda compiler version for generating cutlass operations
+    kernel_configs: list of configs generated by the heuristic
+  returns:
+    (configs, operations): a list of heuristic-provided kernel configs along with a one-to-one corresponding list of the generated operations
+  """
+  min_cc = 100
+  max_cc = 101
+  if manifest is None:
+    # Use a dummy manifest so we can use existing CreateGemmOperator functions
+    manifest = Manifest()
+  configs = []
+  operations = []
+  for config in kernel_configs:
+    layout = ([config['layout_a'], config['alignment_a']], [config['layout_b'], config['alignment_b']], [config['layout_d'], 128 // DataTypeSize[config['dtype_d']]])
+    element_a, element_b, element_accumulator, element_c, element_d = config['dtype_a'], config['dtype_b'], config['dtype_acc'], config['dtype_c'], config['dtype_d']
+    # nvMMH assumes 2sm instruction for !(cluster_m % 2)
+    is_2sm = config['cluster_m'] % 2 == 0
+    instruction_shape = [(2 * config['cta_tile_m']) if is_2sm else config['cta_tile_m'], config['cta_tile_n'], config['cta_tile_k'] // 4]
+    math_instruction = MathInstruction(
+      instruction_shape,
+      element_a, element_b, element_accumulator,
+      OpcodeClass.TensorOp,
+      MathOperation.multiply_add
+    )
+    data_types = [
+      {
+        "a_type"   : math_instruction.element_a,
+        "b_type"   : math_instruction.element_b,
+        "c_type"   : DataType.void if config['voidC'] else math_instruction.element_accumulator,
+        "d_type"   : element_d,
+        "acc_type" : math_instruction.element_accumulator,
+        "epi_type" : math_instruction.element_accumulator,
+      }
+    ]
+    tile_multiplier = (config['cluster_m'] // (2 if is_2sm else 1), config['cluster_n'], config['cluster_k'])
+    tile_description = TileDescription(
+      [instruction_shape[0] * tile_multiplier[0],
+       instruction_shape[1] * tile_multiplier[1],
+       instruction_shape[2] * 4 * tile_multiplier[2]],
+      0,
+      [4,1,1],
+      math_instruction,
+      min_cc,
+      max_cc,
+      cluster_shape=(config['cluster_m'], config['cluster_n'], config['cluster_k'])
+    )
+    schedules = []
+    if is_2sm:
+      schedules.append([KernelScheduleType.TmaWarpSpecialized2SmSm100, EpilogueScheduleType.TmaWarpSpecialized2Sm])
+    else:
+      schedules.append([KernelScheduleType.TmaWarpSpecialized1SmSm100, EpilogueScheduleType.TmaWarpSpecialized1Sm])
+    for o in CreateGemmUniversal3xOperator(manifest, [layout], [tile_description], data_types, schedules, tile_schedulers=[TileSchedulerType.Default, TileSchedulerType.StreamK], gemm_kind=GemmKind.Universal3x):
+      configs.append(config)
+      operations.append(o)
+  return configs, operations
+def generate_sm90_from_heuristics_configs(manifest, cuda_version, kernel_configs):
+  """
+  Generate CUTLASS operations based on the list of configs provided by the heuristic provider
+  args:
+    manifest: manifest argument to which to add operations, or None to just return the operations without a manifest (for pruning an existing manifest)
+    cuda_version: Cuda compiler version for generating cutlass operations
+    kernel_configs: list of configs generated by the heuristic
+  returns:
+    (configs, operations): a list of heuristic-provided kernel configs along with a one-to-one corresponding list of the generated operations
+  """
+  min_cc, max_cc = 90, 90
+  if manifest is None:
+    # Use a dummy manifest so we can use existing CreateGemmOperator functions
+    manifest = Manifest()
+  configs = []
+  operations = []
+  for config in kernel_configs:
+    is_aligned = (config['alignment_a'] * DataTypeSize[config['dtype_a']] >= 128) and (config['alignment_b'] * DataTypeSize[config['dtype_b']] >= 128)
+    layout = ([config['layout_a'], config['alignment_a']], [config['layout_b'], config['alignment_b']], [LayoutType.ColumnMajor, 1])
+    element_a, element_b, element_accumulator, element_c, element_d = config['dtype_a'], config['dtype_b'], config['dtype_acc'], config['dtype_c'], config['dtype_d']
+    # instr shape and warp config are unused for emitting 3x collective builder code
+    dummy_instr_shape = [0, 0, 0]
+    math_instruction = MathInstruction(
+      dummy_instr_shape,
+      element_a, element_b, element_accumulator,
+      OpcodeClass.TensorOp,
+      MathOperation.multiply_add
+    )
+    data_types = generate_data_types_from_math_instruction(math_instruction, element_source=element_c, element_dest=element_d)
+    if is_aligned:
+      layout = fix_alignments(data_types, layout, alignment_bits=128)
+    # instr shape and warp config are unused for emitting 3x collective builder code
+    dummy_warp_count = [0, 0, 0]
+    tile_description = TileDescription(
+      [config['cta_tile_m'], config['cta_tile_n'], config['cta_tile_k']],
+      0,
+      dummy_warp_count,
+      math_instruction,
+      min_cc,
+      max_cc,
+      cluster_shape=(config['cluster_m'], config['cluster_n'], config['cluster_k'])
+    )
+    schedules, stream_k_schedules = get_valid_schedules(
+      tile_description=tile_description,
+      cuda_version=cuda_version,
+      is_aligned=is_aligned,
+      data_types=data_types,
+      instantiation_level=9000, # don't prune schedules: we didn't get any schedule suggestion from the heuristic
+      layout=layout,
+      gemm_kind=GemmKind.Universal3x,
+      enable_fp8_fast_acc=config['use_fast_acc']
+    )
+    if len(schedules):
+      for o in CreateGemmUniversal3xOperator(manifest, [layout], [tile_description], data_types, schedules, gemm_kind=GemmKind.Universal3x):
+        configs.append(config)
+        operations.append(o)
+    if len(stream_k_schedules):
+      for o in CreateGemmUniversal3xOperator(manifest, [layout], [tile_description], data_types,
+                                    stream_k_schedules,
+                                    tile_schedulers=[TileSchedulerType.StreamK]):
+        configs.append(config)
+        operations.append(o)
+  return configs, operations
+def filter_manifest_and_write_heuristics_file(manifest, args):
+  """
+  Prune a manifest according to heuristics suggestions from the problems file
+  args:
+    manifest: Cutlass manifest to prune
+    args: generator.py args, requires:
+      - args.heuristics_problems_file
+      - args.heuristics_gpu
+      - args.heuristics_testlist_file
+  returns:
+    A list of dictionaries, each of which has information about an operation and a problem from the input problems
+  """
+  heuristics_problems = []
+  with open(args.heuristics_problems_file, 'r') as f:
+    heuristics_problems = json.load(f)
+  gpu = None if (args.heuristics_gpu == "auto" or args.heuristics_gpu == "") else args.heuristics_gpu
+  mmh = MatmulHeuristics(gpu=gpu)
+  if any(('100' in arch) for arch in args.architectures.split(';')):
+    mmh.set_cta_div_n(64)
+  problems_with_configs = get_gemm_configs(heuristics_problems, provider=mmh, count=args.heuristics_configs_per_problem)
+  all_configs_and_operations = []
+  operations = []
+  for problem in problems_with_configs:
+    if any('90' in arch for arch in args.architectures.split(';')):
+        problem_configs, problem_operations = generate_sm90_from_heuristics_configs(None if args.heuristics_restrict_kernels else manifest, args.cuda_version, problem['configs'])
+    if any(('100' in arch) or ('101' in arch) for arch in args.architectures.split(';')):
+        problem_configs, problem_operations = generate_sm100_from_heuristics_configs(None if args.heuristics_restrict_kernels else manifest, args.cuda_version, problem['configs'])
+    operations += problem_operations
+    problem_without_configs = {k: v for k, v in problem.items() if k != 'configs'}
+    with_problem_size = [{'operation_name': o.procedural_name(), **problem_without_configs, **c} for c, o in zip(problem_configs, problem_operations)]
+    all_configs_and_operations += with_problem_size
+  for operation in operations:
+    manifest.add_kernel_filter(f"^{operation.procedural_name()}$")
+  if not all_configs_and_operations:
+    raise Exception("No valid configurations generated")
+  write_profiler_testlist_to_csv(all_configs_and_operations, args.heuristics_testlist_file)
+  return all_configs_and_operations
+def write_profiler_testlist_to_csv(configs_list, outfile_path):
+  """
+  Write a list of configs to a testlist to be consumed by cutlass_profiler
+  args:
+    configs_list: List of kernel configs along with runtime arguments and any other columns to include in the CSV, expressed as a list of dictionaries
+    outfile_path: Outfile path
+  returns:
+    None
+  """
+  profiler_testlist = configs_list.copy()
+  for c in profiler_testlist:
+    for k, v in c.items():
+      if isinstance(v, DataType):
+        c[k] = DataTypeNames[v]
+      elif isinstance(v, LayoutType):
+        c[k] = ShortLayoutTypeNames[v]
+  with open(outfile_path, mode='w', newline='') as ofile:
+    k_names = profiler_testlist[0].keys()
+    writer = csv.DictWriter(ofile, fieldnames=k_names)
+    writer.writeheader()
+    writer.writerows(profiler_testlist)

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/heuristics_provider.py ADDED Viewed

	@@ -0,0 +1,175 @@

+#################################################################################################
+#
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Providers for kernel selection heuristics
+"""
+import sys
+import os
+import glob
+import logging
+import ctypes
+import functools
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import DataType, LayoutType
+except ImportError:
+  from library import DataType, LayoutType
+class MatmulHeuristics:
+  def __init__(self, gpu = None):
+    import nvMatmulHeuristics
+    self.mmh_lib = nvMatmulHeuristics
+    self.gpu = gpu
+    if 'CUTLASS_NVMMH_SO_PATH' in os.environ:
+      nvmmhInterfaceEx = functools.partial(self.mmh_lib.NvMatmulHeuristicsInterfaceEx, path=os.environ['CUTLASS_NVMMH_SO_PATH'])
+    else:
+      nvmmhInterfaceEx = self.mmh_lib.NvMatmulHeuristicsInterfaceEx
+    self.lh = nvmmhInterfaceEx(
+      backend=self.mmh_lib.NvMatmulHeuristicsTarget["CUTLASS3"],
+      flags=self.mmh_lib.NvMatmulHeuristicsFlags.PERF_MODEL_BASED_AUTO_TUNING,
+      load_discovery_implicitly=True,
+      gpu=self.mmh_lib.NvMatmulHeuristicsNvidiaGpu[self.gpu] if self.gpu else None
+    )
+    self.backend = self.lh.createBackend(self.mmh_lib.NvMatmulHeuristicsTarget["CUTLASS3"])
+  def _layout_from_cutlass(self, layouts):
+    assert(len(layouts)==3)
+    full_layout_str = ''.join('t' if l == LayoutType.RowMajor else 'n' for l in layouts)
+    input_layouts = full_layout_str[:2].upper()
+    lh_layout = input_layouts + '_' + str("ROW_MAJOR" if full_layout_str[-1]=='t' else "COL_MAJOR")
+    return self.mmh_lib.NvMatmulHeuristicsMatmulLayout[lh_layout]
+  def _precision_from_cutlass_dtypes(self, dtypes):
+    dtype_to_cublas = {
+      DataType.f64: 'D',
+      DataType.f32: 'S',
+      DataType.f16: 'H',
+      DataType.bf16: 'T',
+      DataType.e4m3: 'Q',
+      DataType.e5m2: 'R',
+      DataType.s32: 'I',
+      DataType.s8: 'B',
+    }
+    dtype_a, dtype_b, dtype_compute, dtype_c, dtype_d = dtypes
+    a_c = dtype_to_cublas[dtype_a]
+    if a_c.lower() != 'q':
+      return a_c + dtype_to_cublas[dtype_compute] + dtype_to_cublas[dtype_d]
+    else:
+      return a_c + dtype_to_cublas[dtype_b] + dtype_to_cublas[dtype_c] + dtype_to_cublas[dtype_compute] + dtype_to_cublas[dtype_d]
+  def set_cta_div_n(self, div_n):
+    cta_n_div_requirement = ctypes.c_int(div_n)
+    self.lh.setBackendValueProperty(
+      self.backend,
+      self.mmh_lib.NvMatmulHeuristicsBackendProperty.CTA_TILE_N_DIV_REQUIREMENT,
+      ctypes.byref(cta_n_div_requirement),
+      ctypes.sizeof(cta_n_div_requirement)
+    )
+  def set_cta_div_m(self, div_m):
+    cta_m_div_requirement = ctypes.c_int(div_m)
+    self.lh.setBackendValueProperty(
+      self.backend,
+      self.mmh_lib.NvMatmulHeuristicsBackendProperty.CTA_TILE_M_DIV_REQUIREMENT,
+      ctypes.byref(cta_m_div_requirement),
+      ctypes.sizeof(cta_m_div_requirement)
+    )
+  def get_configs(self, m, n, k, batch_count, dtypes, layouts, align_a, align_b, voidC=False, use_fast_acc=True, count=1):
+    if use_fast_acc:
+      disable_fast_acc_for_fp8 = ctypes.c_int(0)
+    else:
+      disable_fast_acc_for_fp8 = ctypes.c_int(1)
+    self.lh.setBackendValueProperty(
+      self.backend,
+      self.mmh_lib.NvMatmulHeuristicsBackendProperty.DISABLE_FAST_ACC_FOR_FP8,
+      ctypes.byref(disable_fast_acc_for_fp8),
+      ctypes.sizeof(disable_fast_acc_for_fp8)
+    )
+    precision = self._precision_from_cutlass_dtypes(dtypes)
+    layout = self._layout_from_cutlass(layouts)
+    matmul_problem = self.lh.makeNvMatmulHeuristicsProblem(m, n, k, layout, batch_count)
+    configs = self.lh.getEx(matmul_problem, count, self.backend, precision=precision)
+    ret = []
+    for c in configs:
+      kernel = c['kernel']
+      problem = c['problem']
+      r = {}
+      r['estimated_runtime'] = c['runtime']
+      r['cta_tile_m'] = kernel.cta_tile_m
+      r['cta_tile_n'] = kernel.cta_tile_n
+      r['cta_tile_k'] = kernel.cta_tile_k
+      r['instr_tile_m'] = kernel.instr_tile_m
+      r['instr_tile_n'] = kernel.instr_tile_n
+      r['instr_tile_k'] = kernel.instr_tile_k
+      r['warp_tile_m'] = kernel.warp_tile_m
+      r['warp_tile_n'] = kernel.warp_tile_n
+      r['warp_tile_k'] = kernel.warp_tile_k
+      r['cluster_m'] = kernel.cluster_m
+      r['cluster_n'] = kernel.cluster_n
+      r['cluster_k'] = 1
+      r['layout_a'] = layouts[0]
+      r['layout_b'] = layouts[1]
+      r['layout_d'] = layouts[2]
+      r['dtype_a'] = dtypes[0]
+      r['dtype_b'] = dtypes[1]
+      r['dtype_acc'] = dtypes[2]
+      r['dtype_c'] = dtypes[3]
+      r['dtype_d'] = dtypes[4]
+      r['alignment_a'] = align_a
+      r['alignment_b'] = align_b
+      r['swizzle_size'] = kernel.swizzle_factor
+      r['raster_order'] = 'along_m' if kernel.cta_order==0 else 'along_n'
+      r['split_k_slices'] = kernel.split_k
+      r['use_fast_acc'] = use_fast_acc
+      r['voidC'] = voidC
+      ret.append(r)
+    return ret

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/library.py ADDED Viewed

	@@ -0,0 +1,1531 @@

+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Data types and tags used for emitting CUTLASS C++ kernels
+"""
+import enum
+import re
+# The following block implements enum.auto() for Python 3.5 variants that don't include it such
+# as the default 3.5.2 on Ubuntu 16.04.
+#
+# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility
+try:
+  from enum import auto as enum_auto
+except ImportError:
+  # Module-level counter backing the fallback below; the first value handed out is 0.
+  __cutlass_library_auto_enum = 0
+  def enum_auto() -> int:
+    """Return the current value of the module-level counter, then advance the counter by one."""
+    global __cutlass_library_auto_enum
+    i = __cutlass_library_auto_enum
+    __cutlass_library_auto_enum += 1
+    return i
+###################################################################################################
+#
+class GeneratorTarget(enum.Enum):
+  """Enumeration of generator targets; Library is the only member."""
+  Library = enum_auto()
+#
+# Lowercase name string for each GeneratorTarget member
+GeneratorTargetNames = {
+  GeneratorTarget.Library: 'library'
+}
+#
+###################################################################################################
+#
+class DataType(enum.Enum):
+  """
+  Element data types known to the library.
+  DataTypeNames gives the name string, DataTypeTag the C++ type and DataTypeSize the width in bits of each member
+  (none of the three has an entry for `invalid`).
+  """
+  void = enum_auto()  # primarily used to disable C tensor for epilogues
+  b1 = enum_auto()
+  u2 = enum_auto()
+  u4 = enum_auto()
+  u8 = enum_auto()
+  u16 = enum_auto()
+  u32 = enum_auto()
+  u64 = enum_auto()
+  s2 = enum_auto()
+  s4 = enum_auto()
+  s8 = enum_auto()
+  s16 = enum_auto()
+  s32 = enum_auto()
+  s64 = enum_auto()
+  e4m3 = enum_auto()
+  e5m2 = enum_auto()
+  f8 = enum_auto()
+  f6 = enum_auto()
+  f4 = enum_auto()
+  e3m2 = enum_auto()
+  e2m3 = enum_auto()
+  e2m1 = enum_auto()
+  ue8m0 = enum_auto()
+  ue4m3 = enum_auto()
+  f16 = enum_auto()
+  bf16 = enum_auto()
+  f32 = enum_auto()
+  tf32 = enum_auto()
+  f64 = enum_auto()
+  cf16 = enum_auto()
+  cbf16 = enum_auto()
+  cf32 = enum_auto()
+  ctf32 = enum_auto()
+  cf64 = enum_auto()
+  cs2 = enum_auto()
+  cs4 = enum_auto()
+  cs8 = enum_auto()
+  cs16 = enum_auto()
+  cs32 = enum_auto()
+  cs64 = enum_auto()
+  cu2 = enum_auto()
+  cu4 = enum_auto()
+  cu8 = enum_auto()
+  cu16 = enum_auto()
+  cu32 = enum_auto()
+  cu64 = enum_auto()
+  invalid = enum_auto()
+#
+# Abbreviated names for a subset of DataType members; members not listed here have no short name
+ShortDataTypeNames = {
+  DataType.s32: 'i',
+  DataType.e4m3: 'e4m3',
+  DataType.e5m2: 'e5m2',
+  DataType.f16: 'h',
+  DataType.f32: 's',
+  DataType.f64: 'd',
+  DataType.cf32: 'c',
+  DataType.cf64: 'z',
+  DataType.f8: 'f8',
+  DataType.f6: 'f6',
+  DataType.f4: 'f4',
+}
+#
+# Name string for every DataType member except DataType.invalid
+DataTypeNames = {
+  DataType.void: "void",
+  DataType.b1: "b1",
+  DataType.u2: "u2",
+  DataType.u4: "u4",
+  DataType.u8: "u8",
+  DataType.u16: "u16",
+  DataType.u32: "u32",
+  DataType.u64: "u64",
+  DataType.s2: "s2",
+  DataType.s4: "s4",
+  DataType.s8: "s8",
+  DataType.s16: "s16",
+  DataType.s32: "s32",
+  DataType.s64: "s64",
+  DataType.e4m3: 'e4m3',
+  DataType.e5m2: 'e5m2',
+  DataType.f8: 'f8',
+  DataType.f6: 'f6',
+  DataType.f4: 'f4',
+  DataType.e2m3: 'e2m3',
+  DataType.e3m2: 'e3m2',
+  DataType.e2m1: 'e2m1',
+  DataType.ue8m0: 'ue8m0',
+  DataType.ue4m3: 'ue4m3',
+  DataType.f16: "f16",
+  DataType.bf16: "bf16",
+  DataType.f32: "f32",
+  DataType.tf32: "tf32",
+  DataType.f64: "f64",
+  DataType.cf16: "cf16",
+  DataType.cbf16: "cbf16",
+  DataType.cf32: "cf32",
+  DataType.ctf32: "ctf32",
+  DataType.cf64: "cf64",
+  DataType.cu2: "cu2",
+  DataType.cu4: "cu4",
+  DataType.cu8: "cu8",
+  DataType.cu16: "cu16",
+  DataType.cu32: "cu32",
+  DataType.cu64: "cu64",
+  DataType.cs2: "cs2",
+  DataType.cs4: "cs4",
+  DataType.cs8: "cs8",
+  DataType.cs16: "cs16",
+  DataType.cs32: "cs32",
+  DataType.cs64: "cs64",
+}
+# C++ type spelling for every DataType member except DataType.invalid
+DataTypeTag = {
+  DataType.void: "void",
+  DataType.b1: "cutlass::uint1b_t",
+  DataType.u2: "cutlass::uint2b_t",
+  DataType.u4: "cutlass::uint4b_t",
+  DataType.u8: "uint8_t",
+  DataType.u16: "uint16_t",
+  DataType.u32: "uint32_t",
+  DataType.u64: "uint64_t",
+  DataType.s2: "cutlass::int2b_t",
+  DataType.s4: "cutlass::int4b_t",
+  DataType.s8: "int8_t",
+  DataType.s16: "int16_t",
+  DataType.s32: "int32_t",
+  DataType.s64: "int64_t",
+  DataType.e4m3: 'cutlass::float_e4m3_t',
+  DataType.e5m2: 'cutlass::float_e5m2_t',
+  DataType.f8: 'cutlass::type_erased_dynamic_float8_t',
+  DataType.f6: 'cutlass::type_erased_dynamic_float6_t',
+  DataType.f4: 'cutlass::type_erased_dynamic_float4_t',
+  DataType.e2m3: 'cutlass::float_e2m3_t',
+  DataType.e3m2: 'cutlass::float_e3m2_t',
+  DataType.e2m1: 'cutlass::float_e2m1_t',
+  DataType.ue8m0: 'cutlass::float_ue8m0_t',
+  DataType.ue4m3: 'cutlass::float_ue4m3_t',
+  DataType.f16: "cutlass::half_t",
+  DataType.bf16: "cutlass::bfloat16_t",
+  DataType.f32: "float",
+  DataType.tf32: "cutlass::tfloat32_t",
+  DataType.f64: "double",
+  DataType.cf16: "cutlass::complex<cutlass::half_t>",
+  DataType.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
+  DataType.cf32: "cutlass::complex<float>",
+  DataType.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
+  DataType.cf64: "cutlass::complex<double>",
+  DataType.cu2: "cutlass::complex<cutlass::uint2b_t>",
+  DataType.cu4: "cutlass::complex<cutlass::uint4b_t>",
+  DataType.cu8: "cutlass::complex<cutlass::uint8_t>",
+  DataType.cu16: "cutlass::complex<cutlass::uint16_t>",
+  DataType.cu32: "cutlass::complex<cutlass::uint32_t>",
+  DataType.cu64: "cutlass::complex<cutlass::uint64_t>",
+  DataType.cs2: "cutlass::complex<cutlass::int2b_t>",
+  DataType.cs4: "cutlass::complex<cutlass::int4b_t>",
+  DataType.cs8: "cutlass::complex<cutlass::int8_t>",
+  DataType.cs16: "cutlass::complex<cutlass::int16_t>",
+  DataType.cs32: "cutlass::complex<cutlass::int32_t>",
+  DataType.cs64: "cutlass::complex<cutlass::int64_t>",
+}
+# Width in bits of every DataType member except DataType.invalid; void is 0.
+# Complex entries are twice the width of their component type, with one exception:
+# NOTE(review): ctf32 is listed as 32 although tf32 is 32 and cf32 is 64 - confirm whether 64 was intended.
+DataTypeSize = {
+  DataType.void: 0,
+  DataType.b1: 1,
+  DataType.u2: 2,
+  DataType.u4: 4,
+  DataType.u8: 8,
+  DataType.u16: 16,
+  DataType.u32: 32,
+  DataType.u64: 64,
+  DataType.s2: 2,
+  DataType.s4: 4,
+  DataType.s8: 8,
+  DataType.s16: 16,
+  DataType.s32: 32,
+  DataType.s64: 64,
+  DataType.e4m3: 8,
+  DataType.e5m2: 8,
+  DataType.f8: 8,
+  DataType.f6: 6,
+  DataType.f4: 4,
+  DataType.e2m3: 6,
+  DataType.e3m2: 6,
+  DataType.e2m1: 4,
+  DataType.ue8m0: 8,
+  DataType.ue4m3: 8,
+  DataType.f16: 16,
+  DataType.bf16: 16,
+  DataType.f32: 32,
+  DataType.tf32: 32,
+  DataType.f64: 64,
+  DataType.cf16: 32,
+  DataType.cbf16: 32,
+  DataType.cf32: 64,
+  DataType.ctf32: 32,
+  DataType.cf64: 128,
+  DataType.cu2: 4,
+  DataType.cu4: 8,
+  DataType.cu8: 16,
+  DataType.cu16: 32,
+  DataType.cu32: 64,
+  DataType.cu64: 128,
+  DataType.cs2: 4,
+  DataType.cs4: 8,
+  DataType.cs8: 16,
+  DataType.cs16: 32,
+  DataType.cs32: 64,
+  DataType.cs64: 128,
+}
+###################################################################################################
+#
+class BlasMode(enum.Enum):
+  """BLAS mode selector with two members, symmetric and hermitian; BlasModeTag gives the C++ enumerator for each."""
+  symmetric = enum_auto()
+  hermitian = enum_auto()
+#
+# C++ cutlass::BlasMode enumerator for each BlasMode member
+BlasModeTag = {
+  BlasMode.symmetric: 'cutlass::BlasMode::kSymmetric',
+  BlasMode.hermitian: 'cutlass::BlasMode::kHermitian',
+}
+#
+class ComplexTransform(enum.Enum):
+  """Transform applied to a complex operand: none or conj (tagged kNone / kConjugate in ComplexTransformTag)."""
+  none = enum_auto()
+  conj = enum_auto()
+#
+# C++ cutlass::ComplexTransform enumerator for each ComplexTransform member
+ComplexTransformTag = {
+  ComplexTransform.none: 'cutlass::ComplexTransform::kNone',
+  ComplexTransform.conj: 'cutlass::ComplexTransform::kConjugate',
+}
+# Used for cutlass3x complex kernel collective mainloop builder instantiation
+ComplexTransformTag3x = {
+  ComplexTransform.none: 'cute::identity',
+  ComplexTransform.conj: 'cute::conjugate',
+}
+#
+# (real type, complex type) pairs searched by is_complex, get_complex_from_real and get_real_from_complex.
+# Only f16, f32 and f64 are paired; the other complex DataType members are not listed.
+RealComplexBijection = [
+  (DataType.f16, DataType.cf16),
+  (DataType.f32, DataType.cf32),
+  (DataType.f64, DataType.cf64),
+]
+#
+def is_complex(data_type):
+  for r, c in RealComplexBijection:
+    if data_type == c:
+      return True
+  return False
+def is_block_scaled(gemm_kind):
+  return gemm_kind in (GemmKind.BlockScaledUniversal3x, GemmKind.GroupedBlockScaledUniversal3x)
+def is_blockwise(gemm_kind):
+  return gemm_kind in (GemmKind.BlockwiseUniversal3x, GemmKind.GroupedBlockwiseUniversal3x)
+def is_grouped(gemm_kind):
+  return gemm_kind in (GemmKind.GroupedUniversal3x,
+    GemmKind.GroupedBlockScaledUniversal3x, GemmKind.GroupedBlockwiseUniversal3x)
+#
+def get_complex_from_real(real_type):
+  for r, c in RealComplexBijection:
+    if real_type == r:
+      return c
+  return DataType.invalid
+#
+def get_real_from_complex(complex_type):
+  for r, c in RealComplexBijection:
+    if complex_type == c:
+      return r
+  return DataType.invalid
+# TMA requires an alignment of 128 bits for all data types
+def get_tma_alignment(data_type):
+  if data_type == DataType.void:
+    return 0
+  elif DataTypeSize[data_type] == 6:
+    return 128 # 96B alignment for 16U6 format
+  else:
+    return 128 // DataTypeSize[data_type]
+#
+class ComplexMultiplyOp(enum.Enum):
+  """Enumeration of complex multiply variants: multiply_add and gaussian."""
+  multiply_add = enum_auto()
+  gaussian = enum_auto()
+###################################################################################################
+#
+class MathOperation(enum.Enum):
+  """Math operation selector; MathOperationTag maps every member to its cutlass::arch operator tag."""
+  multiply_add = enum_auto()
+  multiply_add_saturate = enum_auto()
+  multiply_add_mixed_input_upcast = enum_auto()
+  xor_popc = enum_auto()
+  and_popc = enum_auto()
+  multiply_add_fast_bf16 = enum_auto()
+  multiply_add_fast_f16 = enum_auto()
+  multiply_add_fast_f32 = enum_auto()
+  multiply_add_complex_fast_f32 = enum_auto()
+  multiply_add_complex = enum_auto()
+  multiply_add_complex_gaussian = enum_auto()
+  multiply_add_fast_accum = enum_auto()
+#
+# C++ cutlass::arch operator tag for each MathOperation member
+MathOperationTag = {
+  MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd',
+  MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate',
+  MathOperation.multiply_add_mixed_input_upcast: 'cutlass::arch::OpMultiplyAddMixedInputUpcast',
+  MathOperation.xor_popc: 'cutlass::arch::OpXorPopc',
+  MathOperation.and_popc: 'cutlass::arch::OpAndPopc',
+  MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16',
+  MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16',
+  MathOperation.multiply_add_fast_f32: 'cutlass::arch::OpMultiplyAddFastF32',
+  MathOperation.multiply_add_complex_fast_f32: 'cutlass::arch::OpMultiplyAddComplexFastF32',
+  MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex',
+  MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex',
+  MathOperation.multiply_add_fast_accum: 'cutlass::arch::OpMultiplyAddFastAccum',
+}
+###################################################################################################
+#
+class LayoutType(enum.Enum):
+  """Tensor and matrix layouts; LayoutTag gives the C++ layout type and ShortLayoutTypeNames the short name of each member."""
+  ColumnMajor = enum_auto()
+  RowMajor = enum_auto()
+  ColumnMajorInterleaved2 = enum_auto()
+  RowMajorInterleaved2 = enum_auto()
+  ColumnMajorInterleaved32 = enum_auto()
+  RowMajorInterleaved32 = enum_auto()
+  ColumnMajorInterleaved64 = enum_auto()
+  RowMajorInterleaved64 = enum_auto()
+  TensorNWC = enum_auto()
+  TensorNHWC = enum_auto()
+  TensorNDHWC = enum_auto()
+  TensorNCHW = enum_auto()
+  TensorNGHWC = enum_auto()
+  TensorNC32HW32 = enum_auto()
+  TensorNC64HW64 = enum_auto()
+  TensorC32RSK32 = enum_auto()
+  TensorC64RSK64 = enum_auto()
+  TensorKCS = enum_auto()
+  TensorKCSR = enum_auto()
+  TensorKCSRT = enum_auto()
+#
+# C++ cutlass::layout type for each LayoutType member
+LayoutTag = {
+  LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor',
+  LayoutType.RowMajor: 'cutlass::layout::RowMajor',
+  LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>',
+  LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>',
+  LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>',
+  LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>',
+  LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>',
+  LayoutType.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>',
+  LayoutType.TensorNWC: 'cutlass::layout::TensorNWC',
+  LayoutType.TensorNHWC: 'cutlass::layout::TensorNHWC',
+  LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC',
+  LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW',
+  LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC',
+  LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>',
+  LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>',
+  LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>',
+  LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>',
+  LayoutType.TensorKCS: 'cutlass::layout::TensorKCS',
+  LayoutType.TensorKCSR: 'cutlass::layout::TensorKCSR',
+  LayoutType.TensorKCSRT: 'cutlass::layout::TensorKCSRT'
+}
+#
+# Row-major <-> column-major counterpart of each plain and interleaved matrix layout.
+# TensorNHWC maps to itself; the other tensor layouts have no entry.
+TransposedLayout = {
+  LayoutType.ColumnMajor: LayoutType.RowMajor,
+  LayoutType.RowMajor: LayoutType.ColumnMajor,
+  LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2,
+  LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2,
+  LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32,
+  LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32,
+  LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64,
+  LayoutType.RowMajorInterleaved64: LayoutType.ColumnMajorInterleaved64,
+  LayoutType.TensorNHWC: LayoutType.TensorNHWC
+}
+#
+# Short lowercase name for each LayoutType member ('n' for column-major, 't' for row-major)
+ShortLayoutTypeNames = {
+  LayoutType.ColumnMajor: 'n',
+  LayoutType.ColumnMajorInterleaved2: 'n2',
+  LayoutType.ColumnMajorInterleaved32: 'n32',
+  LayoutType.ColumnMajorInterleaved64: 'n64',
+  LayoutType.RowMajor: 't',
+  LayoutType.RowMajorInterleaved2: 't2',
+  LayoutType.RowMajorInterleaved32: 't32',
+  LayoutType.RowMajorInterleaved64: 't64',
+  LayoutType.TensorNWC: 'nwc',
+  LayoutType.TensorNHWC: 'nhwc',
+  LayoutType.TensorNDHWC: 'ndhwc',
+  LayoutType.TensorNCHW: 'nchw',
+  LayoutType.TensorNGHWC: 'nghwc',
+  LayoutType.TensorNC32HW32: 'nc32hw32',
+  LayoutType.TensorNC64HW64: 'nc64hw64',
+  LayoutType.TensorC32RSK32: 'c32rsk32',
+  LayoutType.TensorC64RSK64: 'c64rsk64',
+  LayoutType.TensorKCS: 'kcs',
+  LayoutType.TensorKCSR: 'kcsr',
+  LayoutType.TensorKCSRT: 'kcsrt'
+}
+#
+# One-letter name for each (layout, complex transform) pair; only ColumnMajor and RowMajor are covered
+ShortComplexLayoutNames = {
+  (LayoutType.ColumnMajor, ComplexTransform.none): 'n',
+  (LayoutType.ColumnMajor, ComplexTransform.conj): 'c',
+  (LayoutType.RowMajor, ComplexTransform.none): 't',
+  (LayoutType.RowMajor, ComplexTransform.conj): 'h'
+}
+###################################################################################################
+class KernelScheduleType(enum.Enum):
+  """Kernel schedule identifiers; KernelScheduleTag maps members to their C++ schedule tag types."""
+  ScheduleAuto = enum_auto()
+  Multistage = enum_auto()
+  CpAsyncWarpSpecialized = enum_auto()
+  CpAsyncWarpSpecializedPingpong = enum_auto()
+  CpAsyncWarpSpecializedCooperative = enum_auto()
+  Tma = enum_auto()
+  TmaWarpSpecialized = enum_auto()
+  TmaWarpSpecializedPingpong = enum_auto()
+  TmaWarpSpecializedCooperative = enum_auto()
+  TmaWarpSpecializedFP8FastAccum = enum_auto()
+  TmaWarpSpecializedCooperativeFP8FastAccum = enum_auto()
+  TmaWarpSpecializedPingpongFP8FastAccum = enum_auto()
+  ImplicitTmaWarpSpecializedSm90 = enum_auto()
+  PtrArrayTmaWarpSpecializedCooperative = enum_auto()
+  PtrArrayTmaWarpSpecializedCooperativeFP8FastAccum = enum_auto()
+  PtrArrayTmaWarpSpecializedPingpong = enum_auto()
+  PtrArrayTmaWarpSpecializedPingpongFP8FastAccum = enum_auto()
+  BlockwiseTmaWarpSpecializedCooperative = enum_auto()
+  PtrArrayBlockwiseTmaWarpSpecializedCooperative = enum_auto()
+  BlockwiseTmaWarpSpecializedPingpong = enum_auto()
+  PtrArrayBlockwiseTmaWarpSpecializedPingpong = enum_auto()
+  TmaWarpSpecialized1SmSm100 = enum_auto()
+  TmaWarpSpecialized2SmSm100 = enum_auto()
+  ImplicitTmaWarpSpecialized1SmSm100 = enum_auto()
+  ImplicitTmaWarpSpecialized2SmSm100 = enum_auto()
+  PtrArrayTmaWarpSpecialized1SmSm100 = enum_auto()
+  PtrArrayTmaWarpSpecialized2SmSm100 = enum_auto()
+  PtrArrayTmaWarpSpecialized1SmBlockScaledSm100 = enum_auto()
+  PtrArrayTmaWarpSpecialized2SmBlockScaledSm100 = enum_auto()
+  PtrArrayNvf4TmaWarpSpecialized1SmSm100 = enum_auto()
+  PtrArrayNvf4TmaWarpSpecialized2SmSm100 = enum_auto()
+  PtrArrayMxf4TmaWarpSpecialized1SmSm100 = enum_auto()
+  PtrArrayMxf4TmaWarpSpecialized2SmSm100 = enum_auto()
+  PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100 = enum_auto()
+  PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100 = enum_auto()
+  SparseTmaWarpSpecialized1SmSm100 = enum_auto()
+  SparseTmaWarpSpecialized2SmSm100 = enum_auto()
+  BlockScaledTmaWarpSpecialized1SmSm100 = enum_auto()
+  BlockScaledTmaWarpSpecialized2SmSm100 = enum_auto()
+  Mxf8f6f4TmaWarpSpecialized1SmSm100 = enum_auto()
+  Mxf8f6f4TmaWarpSpecialized2SmSm100 = enum_auto()
+  BlockwiseTmaWarpSpecialized1SmSm100 = enum_auto()
+  BlockwiseTmaWarpSpecialized2SmSm100 = enum_auto()
+  PtrArrayBlockwiseTmaWarpSpecialized1SmSm100 = enum_auto()
+  PtrArrayBlockwiseTmaWarpSpecialized2SmSm100 = enum_auto()
+  Mxf4TmaWarpSpecialized1SmSm100 = enum_auto()
+  Mxf4TmaWarpSpecialized2SmSm100 = enum_auto()
+  Nvf4TmaWarpSpecialized1SmSm100 = enum_auto()
+  Nvf4TmaWarpSpecialized2SmSm100 = enum_auto()
+  # FP4 Ultra
+  MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103 = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103 = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103 = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103 = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch = enum_auto()
+  MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103 = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103 = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103 = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103 = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch = enum_auto()
+  PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch = enum_auto()
+  Mxf8f6f4TmaWarpSpecializedCooperativeSm120 = enum_auto()
+  Mxf8f6f4TmaWarpSpecializedPingpongSm120 = enum_auto()
+  Nvf4TmaWarpSpecializedCooperativeSm120 = enum_auto()
+  Nvf4TmaWarpSpecializedPingpongSm120 = enum_auto()
+  Mxf4TmaWarpSpecializedCooperativeSm120 = enum_auto()
+  Mxf4TmaWarpSpecializedPingpongSm120 = enum_auto()
+  F8f6f4SparseTmaWarpSpecializedCooperativeSm120 = enum_auto()
+  BlockwiseTmaWarpSpecializedCooperativeSm120 = enum_auto()
+  BlockwiseTmaWarpSpecializedPingpongSm120 = enum_auto()
+KernelScheduleTag = {
+  KernelScheduleType.ScheduleAuto: 'cutlass::gemm::collective::KernelScheduleAuto',
+  KernelScheduleType.Multistage: 'cutlass::gemm::KernelMultistage',
+  KernelScheduleType.CpAsyncWarpSpecialized: 'cutlass::gemm::KernelCpAsyncWarpSpecialized',
+  KernelScheduleType.CpAsyncWarpSpecializedPingpong: 'cutlass::gemm::KernelCpAsyncWarpSpecializedPingpong',
+  KernelScheduleType.CpAsyncWarpSpecializedCooperative: 'cutlass::gemm::KernelCpAsyncWarpSpecializedCooperative',
+  KernelScheduleType.Tma: 'cutlass::gemm::KernelTma',
+  KernelScheduleType.TmaWarpSpecialized: 'cutlass::gemm::KernelTmaWarpSpecialized',
+  KernelScheduleType.TmaWarpSpecializedPingpong: 'cutlass::gemm::KernelTmaWarpSpecializedPingpong',
+  KernelScheduleType.TmaWarpSpecializedCooperative: 'cutlass::gemm::KernelTmaWarpSpecializedCooperative',
+  KernelScheduleType.TmaWarpSpecializedFP8FastAccum: 'cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum',
+  KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum: 'cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum',
+  KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum: 'cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum',
+  KernelScheduleType.ImplicitTmaWarpSpecializedSm90: 'cutlass::conv::KernelImplicitTmaWarpSpecializedSm90',
+  KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative: 'cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8Blockwise',
+  KernelScheduleType.BlockwiseTmaWarpSpecializedPingpong: 'cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8Blockwise',
+  KernelScheduleType.TmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmSm100',
+  KernelScheduleType.TmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmSm100',
+  KernelScheduleType.ImplicitTmaWarpSpecialized1SmSm100: 'cutlass::conv::KernelImplicitTmaWarpSpecialized1SmSm100',
+  KernelScheduleType.ImplicitTmaWarpSpecialized2SmSm100: 'cutlass::conv::KernelImplicitTmaWarpSpecialized2SmSm100',
+  KernelScheduleType.PtrArrayTmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmSm100',
+  KernelScheduleType.PtrArrayTmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmSm100',
+  KernelScheduleType.SparseTmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelSparseTmaWarpSpecialized1SmSm100',
+  KernelScheduleType.SparseTmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelSparseTmaWarpSpecialized2SmSm100',
+  KernelScheduleType.BlockScaledTmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledSm100',
+  KernelScheduleType.BlockScaledTmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledSm100',
+  KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmMxf8f6f4Sm100',
+  KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmMxf8f6f4Sm100',
+  KernelScheduleType.BlockwiseTmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100',
+  KernelScheduleType.BlockwiseTmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100',
+  KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise1SmSm100',
+  KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedBlockwise2SmSm100',
+  KernelScheduleType.Mxf4TmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmMxf4Sm100',
+  KernelScheduleType.Mxf4TmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmMxf4Sm100',
+  KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized1SmNvf4Sm100',
+  KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100: 'cutlass::gemm::KernelTmaWarpSpecialized2SmNvf4Sm100',
+  # FP4 Ultra
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103: 'cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103: 'cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103: 'cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103: 'cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch: 'cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103TmaPrefetch',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch: 'cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103TmaPrefetch',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch: 'cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103TmaPrefetch',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch: 'cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103TmaPrefetch',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch: 'cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103DisablePrefetch',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch: 'cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103DisablePrefetch',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch: 'cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103DisablePrefetch',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch: 'cutlass::gemm::KernelTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103DisablePrefetch',
+  KernelScheduleType.PtrArrayTmaWarpSpecializedCooperative: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative',
+  KernelScheduleType.PtrArrayTmaWarpSpecializedCooperativeFP8FastAccum: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum',
+  KernelScheduleType.PtrArrayTmaWarpSpecializedPingpong: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpong',
+  KernelScheduleType.PtrArrayTmaWarpSpecializedPingpongFP8FastAccum: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum',
+  KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecializedCooperative: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8Blockwise',
+  KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecializedPingpong: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8Blockwise',
+  KernelScheduleType.PtrArrayTmaWarpSpecialized1SmBlockScaledSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledSm100",
+  KernelScheduleType.PtrArrayTmaWarpSpecialized2SmBlockScaledSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledSm100",
+  KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized1SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmNvf4Sm100",
+  KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized2SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmNvf4Sm100",
+  KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized1SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf4Sm100",
+  KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized2SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf4Sm100",
+  KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmMxf8f6f4Sm100",
+  KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100: "cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmMxf8f6f4Sm100",
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103TmaPrefetch',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103TmaPrefetch',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103TmaPrefetch',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103TmaPrefetch',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103DisablePrefetch',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs16Sm103DisablePrefetch',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs32Sm103DisablePrefetch',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch: 'cutlass::gemm::KernelPtrArrayTmaWarpSpecialized2SmBlockScaledMxNvf4UltraVs32Sm103DisablePrefetch',
+  KernelScheduleType.Mxf8f6f4TmaWarpSpecializedCooperativeSm120: 'cutlass::gemm::KernelTmaWarpSpecializedMxf8f6f4Sm120',
+  KernelScheduleType.Mxf8f6f4TmaWarpSpecializedPingpongSm120: 'cutlass::gemm::KernelTmaWarpSpecializedPingpongMxf8f6f4Sm120',
+  KernelScheduleType.Nvf4TmaWarpSpecializedCooperativeSm120: 'cutlass::gemm::KernelTmaWarpSpecializedNvf4Sm120',
+  KernelScheduleType.Nvf4TmaWarpSpecializedPingpongSm120: 'cutlass::gemm::KernelTmaWarpSpecializedPingpongNvf4Sm120',
+  KernelScheduleType.Mxf4TmaWarpSpecializedCooperativeSm120: 'cutlass::gemm::KernelTmaWarpSpecializedMxf4Sm120',
+  KernelScheduleType.Mxf4TmaWarpSpecializedPingpongSm120: 'cutlass::gemm::KernelTmaWarpSpecializedPingpongMxf4Sm120',
+  KernelScheduleType.F8f6f4SparseTmaWarpSpecializedCooperativeSm120: 'cutlass::gemm::KernelScheduleSparseF8f6f4Sm120',
+  KernelScheduleType.BlockwiseTmaWarpSpecializedCooperativeSm120: 'cutlass::gemm::KernelTmaWarpSpecializedBlockwiseCooperativeSm120',
+  KernelScheduleType.BlockwiseTmaWarpSpecializedPingpongSm120: 'cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120',
+}
+# Maps each KernelScheduleType to a short name suffix (presumably appended to generated kernel names -- confirm with callers); several distinct schedules share the same suffix.
+KernelScheduleSuffixes = {
+  KernelScheduleType.ScheduleAuto: '',
+  KernelScheduleType.Multistage: '_cpasync',
+  KernelScheduleType.CpAsyncWarpSpecialized: '_cpasync_warpspecialized',
+  KernelScheduleType.CpAsyncWarpSpecializedPingpong: '_cpasync_warpspecialized_pingpong',
+  KernelScheduleType.CpAsyncWarpSpecializedCooperative: '_cpasync_warpspecialized_cooperative',
+  KernelScheduleType.Tma: '_unspecialized',
+  KernelScheduleType.TmaWarpSpecialized: '_warpspecialized',
+  KernelScheduleType.TmaWarpSpecializedPingpong: '_warpspecialized_pingpong',
+  KernelScheduleType.TmaWarpSpecializedCooperative: '_warpspecialized_cooperative',
+  KernelScheduleType.TmaWarpSpecializedFP8FastAccum: '_warpspecialized_fp8_fastaccum',
+  KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum: '_warpspecialized_cooperative_fp8_fastaccum',
+  KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum: '_warpspecialized_pingpong_fp8_fastaccum',
+  KernelScheduleType.ImplicitTmaWarpSpecializedSm90: '_warpspecialized',
+  KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative: '_warpspecialized_cooperative',
+  KernelScheduleType.BlockwiseTmaWarpSpecializedPingpong: '_warpspecialized_pingpong',
+  KernelScheduleType.TmaWarpSpecialized1SmSm100: '_1sm',
+  KernelScheduleType.TmaWarpSpecialized2SmSm100: '_2sm',
+  KernelScheduleType.ImplicitTmaWarpSpecialized1SmSm100: '_1sm',
+  KernelScheduleType.ImplicitTmaWarpSpecialized2SmSm100: '_2sm',
+  KernelScheduleType.PtrArrayTmaWarpSpecialized1SmSm100: '_1sm',
+  KernelScheduleType.PtrArrayTmaWarpSpecialized2SmSm100: '_2sm',
+  KernelScheduleType.SparseTmaWarpSpecialized1SmSm100: '_1sm',
+  KernelScheduleType.SparseTmaWarpSpecialized2SmSm100: '_2sm',
+  KernelScheduleType.BlockScaledTmaWarpSpecialized1SmSm100: '_1sm',
+  KernelScheduleType.BlockScaledTmaWarpSpecialized2SmSm100: '_2sm',
+  KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100: '_q_1sm',
+  KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100: '_q_2sm',
+  KernelScheduleType.BlockwiseTmaWarpSpecialized1SmSm100: '_1sm',
+  KernelScheduleType.BlockwiseTmaWarpSpecialized2SmSm100: '_2sm',
+  KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecialized1SmSm100: '_1sm',
+  KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecialized2SmSm100: '_2sm',
+  KernelScheduleType.Mxf4TmaWarpSpecialized1SmSm100: '_o_vs32_1sm',
+  KernelScheduleType.Mxf4TmaWarpSpecialized2SmSm100: '_o_vs32_2sm',
+  KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100: '_o_vs16_1sm',
+  KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100: '_o_vs16_2sm',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103: '_o_vs16_ultra_1sm',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103: '_o_vs16_ultra_2sm',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103: '_o_vs32_ultra_1sm',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103: '_o_vs32_ultra_2sm',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch: '_o_vs16_ultra_1sm_nopf',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch: '_o_vs16_ultra_2sm_nopf',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch: '_o_vs32_ultra_1sm_nopf',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch: '_o_vs32_ultra_2sm_nopf',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch: '_o_vs16_ultra_1sm_tmapf',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch: '_o_vs16_ultra_2sm_tmapf',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch: '_o_vs32_ultra_1sm_tmapf',
+  KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch: '_o_vs32_ultra_2sm_tmapf',
+  KernelScheduleType.PtrArrayTmaWarpSpecializedCooperative: '_warpspecialized_cooperative',
+  KernelScheduleType.PtrArrayTmaWarpSpecializedCooperativeFP8FastAccum: '_warpspecialized_cooperative_fp8_fastaccum',
+  KernelScheduleType.PtrArrayTmaWarpSpecializedPingpong: '_warpspecialized_pingpong',
+  KernelScheduleType.PtrArrayTmaWarpSpecializedPingpongFP8FastAccum: '_warpspecialized_pingpong_fp8_fastaccum',
+  KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecializedCooperative: '_warpspecialized_cooperative',
+  KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecializedPingpong: '_warpspecialized_pingpong',
+  KernelScheduleType.PtrArrayTmaWarpSpecialized1SmBlockScaledSm100: '_1sm',
+  KernelScheduleType.PtrArrayTmaWarpSpecialized2SmBlockScaledSm100: '_2sm',
+  KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized1SmSm100: '_o_vs16_1sm',
+  KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized2SmSm100: '_o_vs16_2sm',
+  KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized1SmSm100: '_o_vs32_1sm',
+  KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized2SmSm100: '_o_vs32_2sm',
+  KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100: '_o_vs32_1sm',  # NOTE(review): the non-grouped Mxf8f6f4 1Sm/2Sm entries above use '_q_1sm'/'_q_2sm' -- confirm the '_o_vs32_*' suffix here is intended
+  KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100: '_o_vs32_2sm',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103: '_o_vs16_ultra_1sm',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103: '_o_vs16_ultra_2sm',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103: '_o_vs32_ultra_1sm',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103: '_o_vs32_ultra_2sm',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch: '_o_vs16_ultra_1sm_nopf',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch: '_o_vs16_ultra_2sm_nopf',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch: '_o_vs32_ultra_1sm_nopf',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch: '_o_vs32_ultra_2sm_nopf',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch: '_o_vs16_ultra_1sm_tmapf',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch: '_o_vs16_ultra_2sm_tmapf',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch: '_o_vs32_ultra_1sm_tmapf',
+  KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch: '_o_vs32_ultra_2sm_tmapf',
+  KernelScheduleType.Mxf8f6f4TmaWarpSpecializedCooperativeSm120: '_cooperative_q',
+  KernelScheduleType.Mxf8f6f4TmaWarpSpecializedPingpongSm120: '_pingpong_q',
+  KernelScheduleType.Nvf4TmaWarpSpecializedCooperativeSm120: '_cooperative_o_vs16',
+  KernelScheduleType.Nvf4TmaWarpSpecializedPingpongSm120: '_pingpong_o_vs16',
+  KernelScheduleType.Mxf4TmaWarpSpecializedCooperativeSm120: '_cooperative_o_vs32',
+  KernelScheduleType.Mxf4TmaWarpSpecializedPingpongSm120: '_pingpong_o_vs32',
+  KernelScheduleType.F8f6f4SparseTmaWarpSpecializedCooperativeSm120: '_q',
+  KernelScheduleType.BlockwiseTmaWarpSpecializedCooperativeSm120: '_cooperative_q',
+  KernelScheduleType.BlockwiseTmaWarpSpecializedPingpongSm120: '_pingpong_q'
+}
+class EpilogueScheduleType(enum.Enum):  # Epilogue schedule variants; members are the keys of EpilogueScheduleTag and EpilogueScheduleSuffixes.
+  ScheduleAuto = enum_auto()
+  EpilogueTransposed = enum_auto()
+  NoSmemWarpSpecialized = enum_auto()
+  PtrArrayNoSmemWarpSpecialized = enum_auto()
+  NoSmemWarpSpecialized1Sm = enum_auto()
+  NoSmemWarpSpecialized2Sm = enum_auto()
+  FastF32NoSmemWarpSpecialized1Sm = enum_auto()
+  FastF32NoSmemWarpSpecialized2Sm = enum_auto()
+  BlockwiseNoSmemWarpSpecialized1Sm = enum_auto()
+  BlockwiseNoSmemWarpSpecialized2Sm = enum_auto()
+  PtrArrayNoSmemWarpSpecialized1Sm = enum_auto()
+  PtrArrayNoSmemWarpSpecialized2Sm = enum_auto()
+  PtrArrayFastF32NoSmemWarpSpecialized1Sm = enum_auto()
+  PtrArrayFastF32NoSmemWarpSpecialized2Sm = enum_auto()
+  PtrArrayBlockwiseNoSmemWarpSpecialized1Sm = enum_auto()
+  PtrArrayBlockwiseNoSmemWarpSpecialized2Sm = enum_auto()
+  TmaWarpSpecialized = enum_auto()
+  TmaWarpSpecializedCooperative = enum_auto()
+  TmaWarpSpecialized1Sm = enum_auto()
+  TmaWarpSpecialized2Sm = enum_auto()
+  PtrArrayTmaWarpSpecialized1Sm = enum_auto()
+  PtrArrayTmaWarpSpecialized2Sm = enum_auto()
+  PtrArrayTmaWarpSpecializedPingpong = enum_auto()
+  PtrArrayTmaWarpSpecializedCooperative = enum_auto()
+# Maps each EpilogueScheduleType to a fully qualified cutlass:: C++ name string (all under cutlass::epilogue except EpilogueTransposed, which is under cutlass::gemm).
+EpilogueScheduleTag = {
+  EpilogueScheduleType.ScheduleAuto: 'cutlass::epilogue::collective::EpilogueScheduleAuto',
+  EpilogueScheduleType.EpilogueTransposed: 'cutlass::gemm::EpilogueTransposed',
+  EpilogueScheduleType.NoSmemWarpSpecialized: 'cutlass::epilogue::NoSmemWarpSpecialized',
+  EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized: 'cutlass::epilogue::PtrArrayNoSmemWarpSpecialized',
+  EpilogueScheduleType.NoSmemWarpSpecialized1Sm: 'cutlass::epilogue::NoSmemWarpSpecialized1Sm',
+  EpilogueScheduleType.NoSmemWarpSpecialized2Sm: 'cutlass::epilogue::NoSmemWarpSpecialized2Sm',
+  EpilogueScheduleType.FastF32NoSmemWarpSpecialized1Sm: 'cutlass::epilogue::FastF32NoSmemWarpSpecialized1Sm',
+  EpilogueScheduleType.FastF32NoSmemWarpSpecialized2Sm: 'cutlass::epilogue::FastF32NoSmemWarpSpecialized2Sm',
+  EpilogueScheduleType.BlockwiseNoSmemWarpSpecialized1Sm: 'cutlass::epilogue::BlockwiseNoSmemWarpSpecialized1Sm',
+  EpilogueScheduleType.BlockwiseNoSmemWarpSpecialized2Sm: 'cutlass::epilogue::BlockwiseNoSmemWarpSpecialized2Sm',
+  EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized1Sm: 'cutlass::epilogue::PtrArrayNoSmemWarpSpecialized1Sm',
+  EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized2Sm: 'cutlass::epilogue::PtrArrayNoSmemWarpSpecialized2Sm',
+  EpilogueScheduleType.PtrArrayFastF32NoSmemWarpSpecialized1Sm: 'cutlass::epilogue::PtrArrayFastF32NoSmemWarpSpecialized1Sm',
+  EpilogueScheduleType.PtrArrayFastF32NoSmemWarpSpecialized2Sm: 'cutlass::epilogue::PtrArrayFastF32NoSmemWarpSpecialized2Sm',
+  EpilogueScheduleType.PtrArrayBlockwiseNoSmemWarpSpecialized1Sm: 'cutlass::epilogue::PtrArrayBlockwiseNoSmemWarpSpecialized1Sm',
+  EpilogueScheduleType.PtrArrayBlockwiseNoSmemWarpSpecialized2Sm: 'cutlass::epilogue::PtrArrayBlockwiseNoSmemWarpSpecialized2Sm',
+  EpilogueScheduleType.TmaWarpSpecialized: 'cutlass::epilogue::TmaWarpSpecialized',
+  EpilogueScheduleType.TmaWarpSpecializedCooperative: 'cutlass::epilogue::TmaWarpSpecializedCooperative',
+  EpilogueScheduleType.TmaWarpSpecialized1Sm: 'cutlass::epilogue::TmaWarpSpecialized1Sm',
+  EpilogueScheduleType.TmaWarpSpecialized2Sm: 'cutlass::epilogue::TmaWarpSpecialized2Sm',
+  EpilogueScheduleType.PtrArrayTmaWarpSpecialized1Sm: 'cutlass::epilogue::PtrArrayTmaWarpSpecialized1Sm',
+  EpilogueScheduleType.PtrArrayTmaWarpSpecialized2Sm: 'cutlass::epilogue::PtrArrayTmaWarpSpecialized2Sm',
+  EpilogueScheduleType.PtrArrayTmaWarpSpecializedCooperative: 'cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative',
+  EpilogueScheduleType.PtrArrayTmaWarpSpecializedPingpong: 'cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong',
+}
+# Maps each EpilogueScheduleType to a short name suffix; ScheduleAuto, EpilogueTransposed and the two TMA 1Sm variants map to the empty string.
+EpilogueScheduleSuffixes = {
+  EpilogueScheduleType.ScheduleAuto: '',
+  EpilogueScheduleType.EpilogueTransposed: '',
+  EpilogueScheduleType.NoSmemWarpSpecialized: '_epi_nosmem',
+  EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized: '_epi_nosmem',
+  EpilogueScheduleType.NoSmemWarpSpecialized1Sm: '_epi_nosmem',
+  EpilogueScheduleType.NoSmemWarpSpecialized2Sm: '_epi_nosmem',
+  EpilogueScheduleType.FastF32NoSmemWarpSpecialized1Sm: '_epi_nosmem_fastf32',
+  EpilogueScheduleType.FastF32NoSmemWarpSpecialized2Sm: '_epi_nosmem_fastf32',
+  EpilogueScheduleType.BlockwiseNoSmemWarpSpecialized1Sm: '_epi_nosmem',
+  EpilogueScheduleType.BlockwiseNoSmemWarpSpecialized2Sm: '_epi_nosmem',
+  EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized1Sm: '_epi_nosmem',
+  EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized2Sm: '_epi_nosmem',
+  EpilogueScheduleType.PtrArrayFastF32NoSmemWarpSpecialized1Sm: '_epi_nosmem_fastf32',
+  EpilogueScheduleType.PtrArrayFastF32NoSmemWarpSpecialized2Sm: '_epi_nosmem_fastf32',
+  EpilogueScheduleType.PtrArrayBlockwiseNoSmemWarpSpecialized1Sm: '_epi_nosmem',
+  EpilogueScheduleType.PtrArrayBlockwiseNoSmemWarpSpecialized2Sm: '_epi_nosmem',
+  EpilogueScheduleType.TmaWarpSpecialized: '_epi_tma',
+  EpilogueScheduleType.TmaWarpSpecializedCooperative: '_epi_tma',
+  EpilogueScheduleType.TmaWarpSpecialized1Sm: '',  # NOTE(review): empty while the 2Sm variant below uses '_epi_tma' -- confirm this asymmetry is intended
+  EpilogueScheduleType.TmaWarpSpecialized2Sm: '_epi_tma',
+  EpilogueScheduleType.PtrArrayTmaWarpSpecialized1Sm: '',
+  EpilogueScheduleType.PtrArrayTmaWarpSpecialized2Sm: '_epi_tma',
+  EpilogueScheduleType.PtrArrayTmaWarpSpecializedCooperative: '_epi_tma',
+  EpilogueScheduleType.PtrArrayTmaWarpSpecializedPingpong: '_epi_tma',
+}
+class EpilogueFunctor3x(enum.Enum):  # Epilogue fusion functors; each member is mapped to a cutlass::epilogue::fusion name by EpilogueFunctor3xTag.
+  LinearCombination = enum_auto()
+  LinearCombinationBlockScaleFactor = enum_auto()
+# Maps each EpilogueFunctor3x member to its cutlass::epilogue::fusion C++ name string.
+EpilogueFunctor3xTag = {
+  EpilogueFunctor3x.LinearCombination: 'cutlass::epilogue::fusion::LinearCombination',
+  EpilogueFunctor3x.LinearCombinationBlockScaleFactor: 'cutlass::epilogue::fusion::LinCombBlockScaleFactor',
+}
+# TMA epilogues have certain alignment requirements as calculated in get_tma_alignment(data_type)
+def is_tma_epilogue(epilogue_schedule_type):
+  return epilogue_schedule_type in [
+    EpilogueScheduleType.ScheduleAuto,
+    EpilogueScheduleType.TmaWarpSpecialized,
+    EpilogueScheduleType.TmaWarpSpecializedCooperative,
+    EpilogueScheduleType.TmaWarpSpecialized1Sm,
+    EpilogueScheduleType.TmaWarpSpecialized2Sm,
+    EpilogueScheduleType.PtrArrayTmaWarpSpecialized1Sm,
+    EpilogueScheduleType.PtrArrayTmaWarpSpecialized2Sm,
+    EpilogueScheduleType.PtrArrayTmaWarpSpecializedCooperative,
+    EpilogueScheduleType.PtrArrayTmaWarpSpecializedPingpong,
+  ]
+def to_grouped_schedule(schedule, grouped):
+  if not grouped:
+    return schedule
+  group_schedule_map = {
+    # SM90
+    KernelScheduleType.TmaWarpSpecializedCooperative : KernelScheduleType.PtrArrayTmaWarpSpecializedCooperative,
+    KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative : KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecializedCooperative,
+    KernelScheduleType.BlockwiseTmaWarpSpecializedPingpong : KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecializedPingpong,
+    KernelScheduleType.TmaWarpSpecializedPingpong    : KernelScheduleType.PtrArrayTmaWarpSpecializedPingpong,
+    KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum : KernelScheduleType.PtrArrayTmaWarpSpecializedCooperativeFP8FastAccum,
+    KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum    : KernelScheduleType.PtrArrayTmaWarpSpecializedPingpongFP8FastAccum,
+    EpilogueScheduleType.TmaWarpSpecialized            : EpilogueScheduleType.PtrArrayTmaWarpSpecializedPingpong,
+    EpilogueScheduleType.TmaWarpSpecializedCooperative : EpilogueScheduleType.PtrArrayTmaWarpSpecializedCooperative,
+    EpilogueScheduleType.NoSmemWarpSpecialized         : EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized,
+    # SM100
+    KernelScheduleType.TmaWarpSpecialized1SmSm100: KernelScheduleType.PtrArrayTmaWarpSpecialized1SmSm100,
+    KernelScheduleType.TmaWarpSpecialized2SmSm100: KernelScheduleType.PtrArrayTmaWarpSpecialized2SmSm100,
+    KernelScheduleType.Nvf4TmaWarpSpecialized1SmSm100 : KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized1SmSm100,
+    KernelScheduleType.Nvf4TmaWarpSpecialized2SmSm100 : KernelScheduleType.PtrArrayNvf4TmaWarpSpecialized2SmSm100,
+    KernelScheduleType.Mxf4TmaWarpSpecialized1SmSm100 : KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized1SmSm100,
+    KernelScheduleType.Mxf4TmaWarpSpecialized2SmSm100 : KernelScheduleType.PtrArrayMxf4TmaWarpSpecialized2SmSm100,
+    KernelScheduleType.Mxf8f6f4TmaWarpSpecialized1SmSm100 : KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized1SmSm100,
+    KernelScheduleType.Mxf8f6f4TmaWarpSpecialized2SmSm100 : KernelScheduleType.PtrArrayMxf8f6f4TmaWarpSpecialized2SmSm100,
+    KernelScheduleType.BlockwiseTmaWarpSpecialized1SmSm100 : KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecialized1SmSm100,
+    KernelScheduleType.BlockwiseTmaWarpSpecialized2SmSm100 : KernelScheduleType.PtrArrayBlockwiseTmaWarpSpecialized2SmSm100,
+    EpilogueScheduleType.TmaWarpSpecialized1Sm: EpilogueScheduleType.PtrArrayTmaWarpSpecialized1Sm,
+    EpilogueScheduleType.TmaWarpSpecialized2Sm: EpilogueScheduleType.PtrArrayTmaWarpSpecialized2Sm,
+    EpilogueScheduleType.NoSmemWarpSpecialized1Sm: EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized1Sm,
+    EpilogueScheduleType.NoSmemWarpSpecialized2Sm: EpilogueScheduleType.PtrArrayNoSmemWarpSpecialized2Sm,
+    EpilogueScheduleType.BlockwiseNoSmemWarpSpecialized1Sm: EpilogueScheduleType.PtrArrayBlockwiseNoSmemWarpSpecialized1Sm,
+    EpilogueScheduleType.BlockwiseNoSmemWarpSpecialized2Sm: EpilogueScheduleType.PtrArrayBlockwiseNoSmemWarpSpecialized2Sm,
+    # SM103
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103DisablePrefetch,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103DisablePrefetch,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103DisablePrefetch,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103DisablePrefetch,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs16Sm103TmaPrefetch,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs16Sm103TmaPrefetch,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized1SmVs32Sm103TmaPrefetch,
+    KernelScheduleType.MxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch: KernelScheduleType.PtrArrayMxNvf4UltraTmaWarpSpecialized2SmVs32Sm103TmaPrefetch,
+  }
+  return group_schedule_map[schedule]
+class TileSchedulerType(enum.Enum):  # Tile scheduler choices; members are the keys of TileSchedulerTag and TileSchedulerSuffixes.
+  Default = enum_auto()
+  Persistent = enum_auto()
+  StreamK = enum_auto()
+# Maps each TileSchedulerType to a C++ type name string; Default maps to 'void'.
+TileSchedulerTag = {
+  TileSchedulerType.Default: 'void',
+  TileSchedulerType.Persistent: 'cutlass::gemm::PersistentScheduler',
+  TileSchedulerType.StreamK: 'cutlass::gemm::StreamKScheduler',
+}
+# Maps each TileSchedulerType to a name suffix; only StreamK has a non-empty one.
+TileSchedulerSuffixes = {
+  TileSchedulerType.Default: '',
+  TileSchedulerType.Persistent: '',
+  TileSchedulerType.StreamK: '_stream_k',
+}
+###################################################################################################
+# Left/right side selector; members are the keys of SideModeTag and ShortSideModeNames.
+class SideMode(enum.Enum):
+  Left = enum_auto()
+  Right = enum_auto()
+# Maps each SideMode to its cutlass::SideMode C++ enumerator string.
+SideModeTag = {
+  SideMode.Left: 'cutlass::SideMode::kLeft',
+  SideMode.Right: 'cutlass::SideMode::kRight'
+}
+# Two-letter abbreviation for each SideMode.
+ShortSideModeNames = {
+  SideMode.Left: 'ls',
+  SideMode.Right: 'rs'
+}
+###################################################################################################
+# Lower/upper fill selector; members are the keys of FillModeTag and ShortFillModeNames.
+class FillMode(enum.Enum):
+  Lower = enum_auto()
+  Upper = enum_auto()
+# Maps each FillMode to its cutlass::FillMode C++ enumerator string.
+FillModeTag = {
+  FillMode.Lower: 'cutlass::FillMode::kLower',
+  FillMode.Upper: 'cutlass::FillMode::kUpper'
+}
+# One-letter abbreviation for each FillMode.
+ShortFillModeNames = {
+  FillMode.Lower: 'l',
+  FillMode.Upper: 'u'
+}
+###################################################################################################
+# Unit/non-unit diagonal selector; members are the keys of DiagTypeTag and ShortDiagTypeNames.
+class DiagType(enum.Enum):
+  NonUnit = enum_auto()
+  Unit = enum_auto()
+# Maps each DiagType to its cutlass::DiagType C++ enumerator string.
+DiagTypeTag = {
+  DiagType.NonUnit: 'cutlass::DiagType::kNonUnit',
+  DiagType.Unit: 'cutlass::DiagType::kUnit'
+}
+# Two-letter abbreviation for each DiagType.
+ShortDiagTypeNames = {
+  DiagType.NonUnit: 'nu',
+  DiagType.Unit: 'un'
+}
+###################################################################################################
+# Opcode class selector; members are the keys of OpcodeClassNames and OpcodeClassTag.
+class OpcodeClass(enum.Enum):
+  Simt = enum_auto()
+  TensorOp = enum_auto()
+  WmmaTensorOp = enum_auto()
+  SparseTensorOp = enum_auto()
+  BlockScaledTensorOp = enum_auto()
+OpcodeClassNames = {  # Lowercase short name for each OpcodeClass.
+  OpcodeClass.Simt: 'simt',
+  OpcodeClass.TensorOp: 'tensorop',
+  OpcodeClass.WmmaTensorOp: 'wmma_tensorop',
+  OpcodeClass.SparseTensorOp: 'sptensorop',
+  OpcodeClass.BlockScaledTensorOp: 'bstensorop'
+}
+OpcodeClassTag = {  # cutlass::arch C++ type name string for each OpcodeClass.
+  OpcodeClass.Simt: 'cutlass::arch::OpClassSimt',
+  OpcodeClass.TensorOp: 'cutlass::arch::OpClassTensorOp',
+  OpcodeClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp',
+  OpcodeClass.SparseTensorOp: 'cutlass::arch::OpClassSparseTensorOp',
+  OpcodeClass.BlockScaledTensorOp: 'cutlass::arch::OpClassBlockScaledTensorOp'
+}
+###################################################################################################
+# Kinds of operation; members are the keys of OperationKindNames.
+class OperationKind(enum.Enum):
+  Gemm = enum_auto()
+  RankK = enum_auto()
+  Rank2K = enum_auto()
+  Trmm = enum_auto()
+  Symm = enum_auto()
+  Conv2d = enum_auto()
+  Conv3d = enum_auto()
+# Lowercase name string for each OperationKind.
+OperationKindNames = {
+  OperationKind.Gemm: 'gemm'
+  , OperationKind.RankK: 'rank_k'
+  , OperationKind.Rank2K: 'rank_2k'
+  , OperationKind.Trmm: 'trmm'
+  , OperationKind.Symm: 'symm'
+  , OperationKind.Conv2d: 'conv2d'
+  , OperationKind.Conv3d: 'conv3d'
+}
+# Generation targets; 'library' is the only member declared here.
+class Target(enum.Enum):
+  library = enum_auto()
+# Maps an integer architecture number (presumably the SM compute capability -- confirm) to a lowercase architecture name; nothing above 90 is listed.
+ArchitectureNames = {
+  50: 'maxwell',
+  60: 'pascal',
+  61: 'pascal',
+  70: 'volta',
+  75: 'turing',
+  80: 'ampere',
+  89: 'ada',
+  90: 'hopper'
+}
+# Shared-memory capacity in KB per integer key (presumably compute capability, going by the name); see the per-entry comments for the driver reservation.
+SharedMemPerCC = {
+  70:   96, #  96KB of SMEM
+  72:   96, #  96KB of SMEM
+  75:   64, #  64KB of SMEM
+  80:  163, # 163KB of SMEM - 1KB reserved for the driver
+  86:   99, #  99KB of SMEM - 1KB reserved for the driver
+  87:  163, # 163KB of SMEM - 1KB reserved for the driver
+  89:   99, #  99KB of SMEM - 1KB reserved for the driver
+  90:  227, # 227KB of SMEM - 1KB reserved for the driver
+  100: 227, # 227KB of SMEM - 1KB reserved for the driver
+}
+###################################################################################################
+#
+def SubstituteTemplate(template, values):
+  text = template
+  changed = True
+  while changed:
+    changed = False
+    for key, value in values.items():
+      regex = "\\$\\{%s\\}" % key
+      newtext = re.sub(regex, value, text)
+      if newtext != text:
+        changed = True
+      text = newtext
+  return text
+###################################################################################################
+# Kinds of GEMM; members are the keys of GemmKindNames.
+class GemmKind(enum.Enum):
+  Gemm = enum_auto()
+  Sparse = enum_auto()
+  Universal = enum_auto()
+  Universal3x = enum_auto()
+  SparseUniversal3x = enum_auto()
+  PlanarComplex = enum_auto()
+  PlanarComplexArray = enum_auto()
+  Grouped = enum_auto()
+  BlockScaledUniversal3x = enum_auto()
+  GroupedUniversal3x = enum_auto()
+  GroupedBlockScaledUniversal3x = enum_auto()
+  BlockwiseUniversal3x = enum_auto()
+  GroupedBlockwiseUniversal3x = enum_auto()
+# Name string for each GemmKind; several kinds share a name (e.g. all the Grouped* kinds map to "gemm_grouped").
+GemmKindNames = {
+  GemmKind.Gemm: "gemm",
+  GemmKind.Sparse: "spgemm",
+  GemmKind.Universal: "gemm",
+  GemmKind.Universal3x: "gemm",
+  GemmKind.SparseUniversal3x: "spgemm",
+  GemmKind.PlanarComplex: "gemm_planar_complex",
+  GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
+  GemmKind.Grouped: "gemm_grouped",
+  GemmKind.BlockScaledUniversal3x: "gemm",
+  GemmKind.GroupedUniversal3x: "gemm_grouped",
+  GemmKind.GroupedBlockScaledUniversal3x: "gemm_grouped",
+  GemmKind.BlockwiseUniversal3x: "gemm",
+  GemmKind.GroupedBlockwiseUniversal3x: "gemm_grouped"
+}
+# Kinds of rank-k operation; Universal is the only member declared.
+class RankKKind(enum.Enum):
+  Universal = enum_auto()
+# Name string for each RankKKind.
+RankKKindNames = {
+  RankKKind.Universal: "rank_k"
+}
+#
+class TrmmKind(enum.Enum):
+  """Kinds of trmm operation; only Universal is defined."""
+  Universal = enum_auto()
+#
+# Short name string for every TrmmKind.
+TrmmKindNames = {
+  TrmmKind.Universal: "trmm"
+}
+#
+class SymmKind(enum.Enum):
+  """Kinds of symm operation; only Universal is defined."""
+  Universal = enum_auto()
+#
+# Short name string for every SymmKind.
+SymmKindNames = {
+  SymmKind.Universal: "symm"
+}
+#
+class EpilogueFunctor(enum.Enum):
+  """Epilogue functors that can be selected; each has an entry in EpilogueFunctorTag."""
+  LinearCombination = enum_auto()
+  LinearCombinationClamp = enum_auto()
+#
+# Fully qualified C++ name (as a string) for every EpilogueFunctor.
+EpilogueFunctorTag = {
+  EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination',
+  EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp',
+}
+#
+class MixedInputMode(enum.Enum):
+  """
+  Modes for mixed-input operations.
+  NOTE(review): judging by the member names, these select whether the narrower
+  operand is only converted, scaled, or scaled with a zero point - confirm
+  against the code that consumes this enum.
+  """
+  ConvertOnly = enum_auto()
+  ScaleOnly = enum_auto()
+  ScaleWithZeroPoint = enum_auto()
+#
+class SwizzlingFunctor(enum.Enum):
+  """Threadblock swizzling functors; each has an entry in SwizzlingFunctorTag."""
+  Identity1 = enum_auto()
+  Identity2 = enum_auto()
+  Identity4 = enum_auto()
+  Identity8 = enum_auto()
+  Horizontal = enum_auto()
+  StridedDgradIdentity1 = enum_auto()
+  StridedDgradIdentity4 = enum_auto()
+  StridedDgradHorizontal = enum_auto()
+  StreamK = enum_auto()
+#
+# Fully qualified C++ type name (as a string) for every SwizzlingFunctor.
+# The IdentityN / StridedDgradIdentityN members differ only in the integer
+# template argument of the named swizzle type.
+SwizzlingFunctorTag = {
+  SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
+  SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>',
+  SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>',
+  SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>',
+  SwizzlingFunctor.Horizontal: 'cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle',
+  SwizzlingFunctor.StridedDgradIdentity1: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>',
+  SwizzlingFunctor.StridedDgradIdentity4: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>',
+  SwizzlingFunctor.StridedDgradHorizontal: 'cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle',
+  SwizzlingFunctor.StreamK: 'cutlass::gemm::threadblock::ThreadblockSwizzleStreamK',
+}
+#
+class GroupScheduleMode(enum.Enum):
+  Device = enum_auto(),
+  Host = enum_auto()
+#
+GroupScheduleModeTag = {
+  GroupScheduleMode.Device: 'cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly',
+  GroupScheduleMode.Host: 'cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute'
+}
+#
+ShortGroupScheduleModeNames = {
+  GroupScheduleMode.Device: 'Device',
+  GroupScheduleMode.Host: 'Host'
+}
+###################################################################################################
+#
+class ConvKind(enum.IntEnum):
+  """Convolution operator kinds, with explicit integer values 0..2."""
+  Fprop = 0
+  Dgrad = 1
+  Wgrad = 2
+#
+# Fully qualified C++ enumerator name (as a string) for every ConvKind.
+ConvKindTag = {
+  ConvKind.Fprop: 'cutlass::conv::Operator::kFprop',
+  ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad',
+  ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad'
+}
+# Short lowercase name for every ConvKind.
+ConvKindNames = {
+  ConvKind.Fprop: 'fprop',
+  ConvKind.Dgrad: 'dgrad',
+  ConvKind.Wgrad: 'wgrad',
+}
+class ConvMode(enum.IntEnum):
+  """Convolution modes, with explicit integer values (CrossCorrelation = 0, Convolution = 1)."""
+  CrossCorrelation = 0
+  Convolution = 1
+#
+class IteratorAlgorithm(enum.Enum):
+  """Convolution iterator algorithms, with explicit integer values 0..4."""
+  Analytic = 0
+  Optimized = 1
+  FixedChannels = 2
+  FewChannels = 3
+  FixedStrideDilation = 4
+#
+# Fully qualified C++ enumerator name (as a string) for every IteratorAlgorithm.
+IteratorAlgorithmTag = {
+  IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
+  IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
+  IteratorAlgorithm.FixedChannels: 'cutlass::conv::IteratorAlgorithm::kFixedChannels',
+  IteratorAlgorithm.FewChannels: 'cutlass::conv::IteratorAlgorithm::kFewChannels',
+  IteratorAlgorithm.FixedStrideDilation: 'cutlass::conv::IteratorAlgorithm::kFixedStrideDilation'
+}
+# Short snake_case name for every IteratorAlgorithm.
+IteratorAlgorithmNames = {
+  IteratorAlgorithm.Analytic: 'analytic',
+  IteratorAlgorithm.Optimized: 'optimized',
+  IteratorAlgorithm.FixedChannels: 'fixed_channels',
+  IteratorAlgorithm.FewChannels: 'few_channels',
+  IteratorAlgorithm.FixedStrideDilation: 'fixed_stride_dilation'
+}
+#
+class StrideSupport(enum.Enum):
+  """Convolution stride support options, with explicit integer values 0..2."""
+  Strided = 0
+  Unity = 1
+  Fixed = 2
+#
+# Fully qualified C++ enumerator name (as a string) for every StrideSupport.
+StrideSupportTag = {
+  StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided',
+  StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity',
+  StrideSupport.Fixed: 'cutlass::conv::StrideSupport::kFixed'
+}
+# Short name for every StrideSupport; Strided maps to the empty string.
+StrideSupportNames = {
+  StrideSupport.Strided: '',
+  StrideSupport.Unity: 'unity_stride',
+  StrideSupport.Fixed: 'fixed_stride'
+}
+#
+class GroupMode(enum.Enum):
+  """Grouping modes for convolution; member values come from enum_auto()."""
+  NoneGroup = enum_auto()         # dense conv (G=1)
+  SingleGroup = enum_auto()       # grouped convolution (single group per CTA)
+  MultipleGroup = enum_auto()     # grouped convolution ( multiple groups per CTA)
+  Depthwise = enum_auto()         # Depthwise convolution ( C=K=G )
+#
+# Fully qualified C++ enumerator name (as a string) for every GroupMode.
+GroupModeTag = {
+  GroupMode.NoneGroup: 'cutlass::conv::GroupMode::kNone',
+  GroupMode.SingleGroup: 'cutlass::conv::GroupMode::kSingleGroup',
+  GroupMode.MultipleGroup: 'cutlass::conv::GroupMode::kMultipleGroup',
+  GroupMode.Depthwise: 'cutlass::conv::GroupMode::kDepthwise',
+}
+# Short name for every GroupMode; NoneGroup maps to the empty string.
+GroupModeNames = {
+  GroupMode.NoneGroup: '',
+  GroupMode.SingleGroup: 'single_group',
+  GroupMode.MultipleGroup: 'multiple_group',
+  GroupMode.Depthwise: 'depthwise',
+}
+# NOTE(review): presumably a sentinel cluster shape meaning "cluster shape is
+# chosen at runtime" rather than a real [M, N, K] cluster - confirm against the
+# code that compares cluster shapes with this constant.
+DynamicClusterShape = [0, 0, 1]
+###################################################################################################
+#
+class MathInstruction:
+  def __init__(self,
+      instruction_shape,                                            \
+      element_a, element_b, element_accumulator,                    \
+      opcode_class, math_operation = MathOperation.multiply_add     \
+      , element_scale_factor = None
+    ):
+    self.instruction_shape = instruction_shape
+    self.element_a = element_a
+    self.element_b = element_b
+    self.element_accumulator = element_accumulator
+    self.opcode_class = opcode_class
+    self.math_operation = math_operation
+    self.element_scale_factor = element_scale_factor
+#
+class TileDescription:
+  def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute, cluster_shape = [1,1,1], explicit_vector_sizes = None):
+    self.threadblock_shape = threadblock_shape
+    self.tile_shape = threadblock_shape
+    self.stages = stages
+    self.warp_count = warp_count
+    self.math_instruction = math_instruction
+    self.minimum_compute_capability = min_compute
+    self.maximum_compute_capability = max_compute
+    self.cluster_shape = cluster_shape
+    self.explicit_vector_sizes = explicit_vector_sizes
+  def procedural_name(self):
+    if self.minimum_compute_capability >= 90:
+      return "{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{s}".format(
+        tbm = self.threadblock_shape[0],
+        tbn = self.threadblock_shape[1],
+        tbk = self.threadblock_shape[2],
+        cm = self.cluster_shape[0],
+        cn = self.cluster_shape[1],
+        ck = self.cluster_shape[2],
+        s = self.stages)
+    else:
+      return "%dx%d_%dx%d" % (self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], self.stages)
+#
+class Direct2dConvFixedStrideDilationTileDescription:
+  def __init__(self, threadblock_output_shape, filter_shape, stages, stride, dilation, warp_count, math_instruction, min_compute, max_compute):
+    self.threadblock_shape = [threadblock_output_shape[0]*threadblock_output_shape[1]*threadblock_output_shape[2], threadblock_output_shape[3], filter_shape[0]*filter_shape[1]]
+    self.threadblock_output_shape = threadblock_output_shape
+    self.filter_shape = filter_shape
+    self.stages = stages
+    self.warp_count = warp_count
+    self.stride = stride
+    self.dilation =  dilation
+    self.math_instruction = math_instruction
+    self.minimum_compute_capability = min_compute
+    self.maximum_compute_capability = max_compute
+  def procedural_name(self):
+    str_name = "%dx%dx%d_%dx%dx%dx%d_%d_filter%dx%d" % (self.threadblock_shape[0],
+                                      self.threadblock_shape[1],
+                                      self.threadblock_shape[2],
+                                      self.threadblock_output_shape[0],
+                                      self.threadblock_output_shape[1],
+                                      self.threadblock_output_shape[2],
+                                      self.threadblock_output_shape[3],
+                                      self.stages,
+                                      self.filter_shape[0],
+                                      self.filter_shape[1])
+    # Fixed Strided and dilation
+    if self.stride != [-1, -1] and self.dilation != [-1, -1]:
+      str_name += "_stride%dx%d_dilation%dx%d" % (self.stride[0],
+                                                  self.stride[1],
+                                                  self.dilation[0],
+                                                  self.dilation[1])
+    return str_name
+#
+class Direct2dConvFixedStrideDilationTileDescription:
+  def __init__(self, threadblock_output_shape, filter_shape, stages, stride, dilation, warp_count, math_instruction, min_compute, max_compute):
+    self.threadblock_shape = [threadblock_output_shape[0]*threadblock_output_shape[1]*threadblock_output_shape[2], threadblock_output_shape[3], filter_shape[0]*filter_shape[1]]
+    self.threadblock_output_shape = threadblock_output_shape
+    self.filter_shape = filter_shape
+    self.stages = stages
+    self.warp_count = warp_count
+    self.stride = stride
+    self.dilation =  dilation
+    self.math_instruction = math_instruction
+    self.minimum_compute_capability = min_compute
+    self.maximum_compute_capability = max_compute
+  def procedural_name(self):
+    str_name = "%dx%dx%d_%dx%dx%dx%d_%d_filter%dx%d" % (self.threadblock_shape[0],
+                                      self.threadblock_shape[1],
+                                      self.threadblock_shape[2],
+                                      self.threadblock_output_shape[0],
+                                      self.threadblock_output_shape[1],
+                                      self.threadblock_output_shape[2],
+                                      self.threadblock_output_shape[3],
+                                      self.stages,
+                                      self.filter_shape[0],
+                                      self.filter_shape[1])
+    # Fixed Strided and dilation
+    if self.stride != [-1, -1] and self.dilation != [-1, -1]:
+      str_name += "_stride%dx%d_dilation%dx%d" % (self.stride[0],
+                                                  self.stride[1],
+                                                  self.dilation[0],
+                                                  self.dilation[1])
+    return str_name
+#
+class TensorDescription:
+  def __init__(self, element, layout, alignment = 1, complex_transform = ComplexTransform.none):
+    self.element = element
+    self.layout = layout
+    self.alignment = alignment
+    self.complex_transform = complex_transform
+#
+class SymmetricTensorDescription:
+  def __init__(self, element, layout, fill_mode, alignment = 1, complex_transform = ComplexTransform.none, side_mode = SideMode.Left):
+    self.element = element
+    self.layout = layout
+    self.fill_mode = fill_mode
+    self.alignment = alignment
+    self.complex_transform = complex_transform
+    self.side_mode = side_mode
+#
+class TriangularTensorDescription:
+  def __init__(self, element, layout, side_mode, fill_mode, diag_type, alignment = 1, complex_transform = ComplexTransform.none):
+    self.element = element
+    self.layout = layout
+    self.side_mode = side_mode
+    self.fill_mode = fill_mode
+    self.diag_type = diag_type
+    self.alignment = alignment
+    self.complex_transform = complex_transform
+#
+def CalculateSmemUsage(operation):
+  cta_shape = operation.tile_description.threadblock_shape
+  stages = operation.tile_description.stages
+  if operation.operation_kind == OperationKind.Gemm and operation.gemm_kind == GemmKind.Sparse:
+    # Elements represented by 8 bits of metadata (based on 4:8, 2:4 or 1:2 sparsity)
+    if DataTypeSize[operation.A.element] == 32:
+      elements_per_8b_md = 2
+    elif DataTypeSize[operation.A.element] == 4:
+      elements_per_8b_md = 8
+    else:
+      elements_per_8b_md = 4
+    smem_per_stage = DataTypeSize[operation.A.element] * cta_shape[0] * (cta_shape[2] // 2) // 8 + \
+                     DataTypeSize[operation.B.element] * cta_shape[1] * cta_shape[2] // 8 + \
+                     cta_shape[0] * (cta_shape[2] // 2) // elements_per_8b_md
+  else:
+    # Few BLAS3 operations only have A tensor
+    data_type_size_a = DataTypeSize[operation.A.element]
+    data_type_size_b = DataTypeSize[operation.A.element]
+    if operation.is_mixed_input():
+      data_type_size_b = DataTypeSize[operation.B.element]
+    smem_per_stage = data_type_size_a * cta_shape[0] * cta_shape[2] // 8 + \
+                     data_type_size_b * cta_shape[1] * cta_shape[2] // 8
+  smem_usage = smem_per_stage * stages
+  return (smem_usage >> 10)
+class GemmUniversalMode(enum.IntEnum):
+  """
+  Types corresponding to GemmUniversalMode
+  Members carry explicit integer values 0..3.
+  NOTE(review): presumably these must match the C++ GemmUniversalMode
+  enumerators one-to-one - confirm before renumbering.
+  """
+  Gemm = 0
+  GemmSplitKParallel = 1
+  Batched = 2
+  Array = 3
+class SplitKMode(enum.IntEnum):
+  """
+  Types corresponding to SplitKMode
+  Members carry explicit integer values 0..2.
+  NOTE(review): presumably these must match the C++ SplitKMode enumerators
+  one-to-one - confirm before renumbering.
+  """
+  NoneSplitK = 0
+  Serial = 1
+  Parallel = 2

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/manifest.py ADDED Viewed

	@@ -0,0 +1,868 @@

+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for filtering CUTLASS library kernels and emitting library initialization
+and building code
+"""
+import enum
+import logging
+import os.path
+import shutil
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+  from cutlass_library.gemm_operation import *
+  from cutlass_library.rank_k_operation import *
+  from cutlass_library.rank_2k_operation import *
+  from cutlass_library.trmm_operation import *
+  from cutlass_library.symm_operation import *
+  from cutlass_library.conv2d_operation import *
+  from cutlass_library.conv3d_operation import *
+except ImportError:
+  from library import *
+  from gemm_operation import *
+  from rank_k_operation import *
+  from rank_2k_operation import *
+  from trmm_operation import *
+  from symm_operation import *
+  from conv2d_operation import *
+  from conv3d_operation import *
+###################################################################################################
+_LOGGER = logging.getLogger(__name__)
+class EmitOperationKindAll:
+  """
+  Emit the OperationKind-level CUTLASS library initialization code.
+  The code is generated in the {generated_path}/{operation_kind} directory
+  (e.g., tools/library/generated/gemm in the build directory,
+  for OperationKind=Gemm), in the all_{operation_kind}_operations.cu file
+  (e.g., all_gemm_operations.cu for OperationKind=Gemm).
+  That file declares several functions in namespace cutlass::library.
+  The functions all have this form,
+  void initialize_{configuration_name}(Manifest& manifest);
+  The file also _defines_ the following function in that namespace.
+  void initialize_all_{operation_kind}_operations(Manifest& manifest);
+  That function calls all of the functions declared in this file.
+  Those functions are defined in subdirectories
+  (which this class does not create).
+  """
+  def __init__(self, generated_path, kind, args):
+    self.generated_path = generated_path
+    self.kind = kind
+    self.args = args
+    self.header_template ="""
+/*
+ Generated by manifest.py - Do not edit.
+*/
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+namespace cutlass {
+namespace library {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.entry_template = """
+//
+// Entry point to construct operations
+//
+void initialize_all_${operation_name}_operations(Manifest &manifest) {
+"""
+    self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n"
+    self.configuration_template ="  initialize_${configuration_name}(manifest);\n"
+    self.epilogue_template ="""}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+"""
+  #
+  def __enter__(self):
+    _LOGGER.debug("*** EmitOperationKindAll::__enter__")
+    self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind])
+    _LOGGER.debug('***   operation_path (directory to create): ' +
+                  str(self.operation_path));
+    os.makedirs(self.operation_path, exist_ok=True)
+    self.top_level_path = os.path.join(self.operation_path, f"all_{OperationKindNames[self.kind]}_operations.cu")
+    _LOGGER.debug(f"***   top_level_path (file to write): {str(self.top_level_path)}")
+    self.top_level_file = open(self.top_level_path, "w")
+    self.top_level_file.write(self.header_template)
+    self.source_files = [self.top_level_path,]
+    self.configurations = []
+    return self
+  #
+  def emit(self, operations):
+    _LOGGER.debug('*** EmitOperationKindAll::emit')
+    _LOGGER.debug(f"***   len(operations): {len(operations)}")
+    _LOGGER.debug(f"***   min_cc list: {sorted(min_cc for min_cc, _ in operations.items())}")
+    for min_cc, configurations in sorted(operations.items()):
+      _LOGGER.debug(f"***   min_cc={min_cc}")
+      for configuration_name, _ in configurations.items():
+        _LOGGER.debug(f"***     configuration_name={configuration_name}")
+        self.configurations.append(configuration_name)
+        self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} ))
+  #
+  def __exit__(self, exception_type, exception_value, traceback):
+    _LOGGER.debug("*** EmitOperationKindAll::__exit__")
+    self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]}))
+    for configuration_name in self.configurations:
+      self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name}))
+    self.top_level_file.write(self.epilogue_template)
+    self.top_level_file.close()
+class EmitOperationKindLibrary:
+  """
+  Emit the CUTLASS library initialization code for each OperationKind.
+  The code is generated in the directory
+  {generated_path}/{operation_kind}/{min_cc}
+  (e.g., tools/library/generated/gemm/90 in the build directory,
+  for min_cc=90 and OperationKind=Gemm), in the file
+  all_sm{min_cc}_{operation_kind}_operations.cu
+  (e.g., all_sm90_gemm_operations.cu for min_cc=90 and OperationKind=Gemm).
+  The min_cc variable here indicates the minimum GPU architecture version
+  that the things to be initialized require.
+  For example, min_cc=90 indicates sm90.
+  That file declares several functions in namespace cutlass::library.
+  The functions all have this form,
+  void initialize_all_sm{min_cc}_{subclass_name}_{extended_name}_operations(Manifest& manifest);
+  where extended_name is operation.extended_name() for all the operations
+  given to the emit method (which see below).  (All operations for a given
+  configuration_name are guaranteed to have the same extended_name().)
+  The file also _defines_ the following function in that namespace.
+  void initialize_all_sm{min_cc}__{operation_kind}_operations(Manifest& manifest);
+  That function calls all of the functions declared in this file.
+  Those functions are defined in subdirectories.
+  The mapping from OperationKind to emitter handles the details
+  of what happens in each of those subdirectories.
+  """
+  def __init__(self, generated_path, min_cc, kind, args):
+    self.generated_path = generated_path
+    self.min_cc = min_cc
+    self.kind = kind
+    self.args = args
+    self.emitters = {
+      OperationKind.Gemm: EmitGemmConfigurationLibrary,
+      OperationKind.Conv2d: EmitConv2dConfigurationLibrary,
+      OperationKind.Conv3d: EmitConv3dConfigurationLibrary,
+      OperationKind.RankK: EmitRankKConfigurationLibrary,
+      OperationKind.Rank2K: EmitRank2KConfigurationLibrary,
+      OperationKind.Trmm: EmitTrmmConfigurationLibrary,
+      OperationKind.Symm: EmitSymmConfigurationLibrary
+    }
+    self.header_template ="""
+/*
+ Generated by manifest.py - Do not edit.
+*/
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+namespace cutlass {
+namespace library {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.entry_template = """
+//
+// Entry point to construct operations
+//
+void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest) {
+"""
+    self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n"
+    self.configuration_template = "  initialize_${configuration_name}(manifest);\n"
+    self.subclass_call_template = "  initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(manifest);\n"
+    self.subclass_prototype_template = "void initialize_all_sm${min_cc}_${subclass_name}_${operation_name}_operations(Manifest &manifest);\n"
+    self.epilogue_template ="""}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+"""
+  #
+  def __enter__(self):
+    _LOGGER.debug("*** EmitOperationKindLibrary::__enter__")
+    _LOGGER.debug(f"***   generated_path: {str(self.generated_path)}")
+    _LOGGER.debug(f"***   OperationKindNames[kind]: {OperationKindNames[self.kind]}")
+    _LOGGER.debug(f"***   min_cc: {self.min_cc}")
+    self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind], str(self.min_cc))
+    _LOGGER.debug(f"***   operation_path (directory to make): {str(self.operation_path)}")
+    os.makedirs(self.operation_path)
+    self.top_level_path = os.path.join(self.operation_path, f"all_sm{self.min_cc}_{OperationKindNames[self.kind]}_operations.cu")
+    _LOGGER.debug(f"***   top_level_path (file to write): {str(self.top_level_path)}")
+    self.top_level_file = open(self.top_level_path, "w")
+    self.top_level_file.write(self.header_template)
+    self.source_files = {}
+    # Each {operation_kind x cc} combination is further decomposed by the instruction
+    # types used. This dictionary used to track the file handles for the top-level
+    # files of each subclass
+    self.subclass_files = {}
+    # Configurations in each sub class
+    self.subclass_configurations = {}
+    return self
+  #
+  def emit(self, configuration_name, operations):
+    _LOGGER.debug("*** EmitOperationKindLibrary::emit")
+    _LOGGER.debug(f"***   configuration_name: {configuration_name}")
+    assert len(operations) > 0
+    # The extended name for all operations of a given configuration_name is guaranteed
+    # to be the same because extended_name() is used in defining configuration_name. Thus,
+    # we can safely use the extended_name() of the first operation.
+    extended_name = operations[0].extended_name()
+    _LOGGER.debug('***   extended_name (for all ops): ' + extended_name)
+    # Create a directory for operations with this subclass if it does not exist
+    if extended_name not in self.subclass_files:
+      subclass_path = os.path.join(self.operation_path, extended_name)
+      _LOGGER.debug(f"***     subclass_path: {str(subclass_path)}")
+      os.mkdir(subclass_path)
+      self.subclass_configurations[extended_name] = []
+      # Open a new top-level file for this sub class
+      subclass_top_level_path = os.path.join(
+        subclass_path, f"all_sm{self.min_cc}_{extended_name}_{OperationKindNames[self.kind]}_operations.cu")
+      _LOGGER.debug('***     subclass_top_level_path (min_cc, extended_name, ' +
+                    'OperationKind): ' + str(subclass_top_level_path))
+      self.subclass_files[extended_name] = open(subclass_top_level_path, "w")
+      self.subclass_files[extended_name].write(self.header_template)
+      self.source_files[extended_name] = [subclass_top_level_path]
+    subclass_dir = os.path.dirname(self.subclass_files[extended_name].name)
+    _LOGGER.debug('***   subclass_dir: ' + str(subclass_dir))
+    with self.emitters[self.kind](subclass_dir, configuration_name) as configuration_emitter:
+      for operation in operations:
+        configuration_emitter.emit(operation)
+      _LOGGER.debug('***   configuration_emitter.configuration_path: ' +
+                    str(configuration_emitter.configuration_path))
+      self.source_files[extended_name].append(configuration_emitter.configuration_path)
+    self.subclass_configurations[extended_name].append(configuration_name)
+    self.subclass_files[extended_name].write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} ))
+  #
+  def __exit__(self, exception_type, exception_value, traceback):
+    _LOGGER.debug("*** EmitOperationKindLibrary::__exit__")
+    for subclass_name, subclass_file in sorted(self.subclass_files.items()):
+      subclass_cfg = {
+        'min_cc': str(self.min_cc),
+        'subclass_name': subclass_name,
+        'operation_name': OperationKindNames[self.kind]
+      }
+      self.top_level_file.write(SubstituteTemplate(self.subclass_prototype_template, subclass_cfg))
+    self.top_level_file.write(
+      SubstituteTemplate(self.entry_template, {
+        'min_cc': str(self.min_cc),
+        'subclass_name': '',
+        'operation_name': OperationKindNames[self.kind]
+      }))
+    # Finish and close all subclass files
+    for subclass_name, subclass_file in sorted(self.subclass_files.items()):
+      subclass_cfg = {
+        'min_cc': str(self.min_cc),
+        'subclass_name': subclass_name,
+        'operation_name': OperationKindNames[self.kind]
+      }
+      subclass_file.write(SubstituteTemplate(self.entry_template, subclass_cfg))
+      for configuration in self.subclass_configurations[subclass_name]:
+        subclass_file.write(
+          SubstituteTemplate(self.configuration_template, {
+            'configuration_name': configuration
+          }))
+      subclass_file.write(self.epilogue_template)
+      subclass_file.close()
+      # Write the call to initialize_all for this subclass to the top-level file
+      self.top_level_file.write(SubstituteTemplate(self.subclass_call_template, subclass_cfg))
+    self.top_level_file.write(self.epilogue_template)
+    self.top_level_file.close()
+class EmitInterfaceLibrary:
+  """
+  Emit the topmost-level CUTLASS library initialization code.
+  The code is generated in the generated_path directory
+  (e.g., tools/library/generated in the build directory),
+  in the initialize_all.cpp file.
+  That file declares several functions in namespace cutlass::library.
+  The functions all have this form,
+  void initialize_all_{operation_kind}_operations(Manifest& manifest);
+  where {operation_kind} abbreviates the "kind" of operation
+  (e.g., gemm for matrix-matrix multiply, conv2d for 2-d convolution,
+  or trmm for triangular solve with multiple right-hand sides).
+  The definitions of these functions live in subdirectories.
+  The file also _defines_ the following function in that namespace.
+  void initialize_all(Manifest& manifest);
+  That function first prepares the manifest, and then
+  calls all of the functions declared in this file.
+  """
+  def __init__(self, generated_path, operation_count, args):
+    self.generated_path = generated_path
+    self.args = args
+    self.prototypes = []
+    self.fn_calls = []
+    self.operation_count = str(operation_count)
+    self.top_level_hdr_template = '''
+/*
+ Generated by manifest.py - Do not edit.
+*/
+'''
+    self.top_level_prologue = '''
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+namespace cutlass {
+\tnamespace library {
+${prototypes}
+'''
+    self.top_level_initialize_kind = '''
+\t\tvoid initialize_all_${kind}_operations(Manifest &manifest) {
+${fn_calls}
+\t\t}
+'''
+    self.top_level_initialize = '''
+\t\tvoid initialize_all(Manifest &manifest) {
+\t\t\tmanifest.reserve(${operation_count});\n
+${fn_calls}
+\t\t}
+'''
+    self.top_level_suffix = '''
+\t} // namespace library
+} // namespace cutlass
+'''
+  #
+  def __enter__(self):
+    _LOGGER.debug("*** EmitInterfaceLibrary::__enter__")
+    self.top_level_path = os.path.join(self.generated_path, 'initialize_all.cpp')
+    _LOGGER.debug("***   top_level_path: " + str(self.top_level_path))
+    self.top_level_file = open(self.top_level_path, "w")
+    self.top_level_file.write(self.top_level_hdr_template)
+    self.source_files = [self.top_level_path,]
+    return self
+  #
+  def emit(self, operation_name):
+    _LOGGER.debug("*** EmitInterfaceLibrary::emit")
+    _LOGGER.debug("***   operation_name: " + operation_name)
+    self.prototypes.append(SubstituteTemplate(
+       "\t\tvoid initialize_all_${operation_kind}_operations(Manifest &manifest);",
+       {'operation_kind': operation_name}))
+    self.fn_calls.append(SubstituteTemplate(
+      "\t\t\tinitialize_all_${operation_kind}_operations(manifest);",
+      {'operation_kind': operation_name}))
+  #
+  def __exit__(self, exception_type, exception_value, traceback):
+    _LOGGER.debug("*** EmitInterfaceLibrary::__exit__")
+    self.top_level_file.write(SubstituteTemplate(self.top_level_prologue, {'prototypes':"\n".join(self.prototypes)}))
+    # Write out initialize_all method
+    self.top_level_file.write(SubstituteTemplate(self.top_level_initialize,
+                              {'operation_count': self.operation_count, 'fn_calls':"\n".join(self.fn_calls)}))
+    self.top_level_file.write(self.top_level_suffix)
+    self.top_level_file.close()
+###################################################################################################
+###################################################################################################
+class Options:
+  def __init__(self):
+    pass
+###################################################################################################
+#
+class Manifest:
+  #
+  def __init__(self, args = None):
+    self.operations = {}
+    self.args = args
+    self.operation_count = 0
+    self.operations_by_name = {}
+    self.kernel_filter = ''
+    self.kernel_filter_list = []
+    self.kernel_names = []
+    self.operations_enabled = []
+    self.selected_kernels = []
+    self.ignore_kernel_names = []
+    self.exclude_kernel_names = []
+    self.compute_capabilities_baseline = [50,]
+    self.compute_capabilities_feature_set = ['50',]
+    self.curr_build_dir = '.'
+    self.filter_by_cc = True
+    if self.args:
+      self.kernel_filter = self.args.kernels
+      self.curr_build_dir = args.curr_build_dir
+      # A common user error is to use commas instead of semicolons.
+      if ',' in args.architectures:
+        raise RuntimeError("The list of architectures (CMake option CUTLASS_NVCC_ARCHS) must be semicolon-delimited.\nDon't use commas to separate the architectures; use semicolons.\nYou specified the list as: " + args.architectures)
+      self.compute_capabilities_feature_set = args.architectures.split(';') if len(args.architectures) else ['50',]
+      self.compute_capabilities_baseline = sorted(set(int(arch.split('a')[0].split('f')[0]) for arch in self.compute_capabilities_feature_set))
+      if args.filter_by_cc in ['false', 'False', '0']:
+        self.filter_by_cc = False
+      if args.operations == 'all':
+        self.operations_enabled = []
+      else:
+        operations_list = [
+          OperationKind.Gemm
+          , OperationKind.Conv2d
+          , OperationKind.Conv3d
+            , OperationKind.RankK
+            , OperationKind.Trmm
+            , OperationKind.Symm
+        ]
+        self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')]
+      if args.kernels == 'all':
+        self.kernel_names = []
+      else:
+        self.kernel_names = [x for x in args.kernels.split(',') if x != '']
+      self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != '']
+      self.exclude_kernel_names = [x for x in args.exclude_kernels.split(',') if x != '']
+      if args.kernel_filter_file is None:
+          self.kernel_filter_list = []
+      else:
+          self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
+          _LOGGER.debug("Using {filter_count} kernel filters from {filter_file}".format(
+              filter_count = len(self.kernel_filter_list),
+              filter_file = args.kernel_filter_file))
+      self.operation_count = 0
+      self.operations_by_name = {}
+      self.disable_full_archs_compilation = args.disable_full_archs_compilation
+      self.is_kernel_filter_set_to_all = args.instantiation_level == "max" and args.kernels != ''
+      self.instantiation_level = 0
+      try:
+          self.instantiation_level = int(args.instantiation_level)
+      except ValueError:
+          self.instantiation_level = 0
+  def add_kernel_filter(self, filter_str):
+    filter_re = re.compile(filter_str)
+    self.kernel_filter_list.append(filter_re)
+  def get_instantiation_level(self, pruned_level=0, default_level=111, exhaustive_level=9992):
+    # Non-negative integer which determines how many kernels are instantiated.
+    # 0 = 0000 generates the fewest kernels, 9999 generates all possible combinations.
+    # increasing first digit reduces schedule / mixed type pruning,
+    # increasing second digit generates more cluster sizes,
+    # increasing third digit generates more MMA multipliers,
+    # increasing fourth digit generates more instruction shapes.
+    if self.instantiation_level > 0:
+        return self.instantiation_level
+    elif self.is_kernel_filter_set_to_all:
+        return exhaustive_level
+    elif self.kernel_filter == '':
+        return pruned_level
+    else:
+        return default_level
+  def get_kernel_filters(self, kernelListFile):
+    if os.path.isfile(kernelListFile):
+        with open(kernelListFile, 'r') as fileReader:
+            lines = [line.rstrip() for line in fileReader if not line.startswith("#")]
+        lines = [re.compile(line) for line in lines if line]
+        return lines
+    else:
+        return []
+  #
+  def filter_out_kernels(self, kernel_name, kernel_filter_list):
+    for kernel_filter_re in kernel_filter_list:
+        if kernel_filter_re.search(kernel_name) is not None:
+            return True
+    return False
+  #
+  def _filter_string_matches(self, filter_string, haystack):
+    ''' Returns true if all substrings appear in the haystack in order'''
+    substrings = filter_string.split('*')
+    for sub in substrings:
+      idx = haystack.find(sub)
+      if idx < 0:
+        return False
+      haystack = haystack[idx + len(sub):]
+    return True
+  #
+  def filter(self, operation):
+    '''
+      Decides whether operation should be added to the manifest.
+      Returns False right away if no requested compute capability can run it (when
+      filter_by_cc is set), if its kind is not enabled, or if an operation with the
+      same procedural name is already registered. Otherwise the result is the
+      'enabled' flag after the include/ignore lists, the kernel filter file and the
+      exclude list have been applied, in that order.
+    '''
+    # filter based on compute capability
+    # When filter_by_cc is off, the operation starts out enabled regardless of the loop below.
+    enabled = not (self.filter_by_cc)
+    for cc in self.compute_capabilities_baseline:
+      # A capability qualifies if it lies within the tile's supported range and, when
+      # a shared-memory limit is listed for it, that limit covers the operation's usage.
+      if cc >= operation.tile_description.minimum_compute_capability and \
+         cc <= operation.tile_description.maximum_compute_capability and \
+         (cc not in SharedMemPerCC or SharedMemPerCC[cc] >= CalculateSmemUsage(operation)):
+        enabled = True
+        break
+    if not enabled:
+      return False
+    # An empty operations_enabled list means every operation kind is allowed.
+    if len(self.operations_enabled) and not operation.operation_kind in self.operations_enabled:
+      return False
+    name = operation.procedural_name()
+    # eliminate duplicates
+    if name in self.operations_by_name.keys():
+      return False
+    # Filter based on list of valid substrings
+    if len(self.kernel_names):
+      # With an include list present, the kernel is disabled until one entry matches.
+      enabled = False
+      # compare against the include list
+      for name_substr in self.kernel_names:
+        if self._filter_string_matches(name_substr, name):
+          _LOGGER.debug(f"Kernel {name} included due to filter string '{name_substr}'.")
+          enabled = True
+          break
+        else:
+          _LOGGER.debug(f"Kernel {name} NOT included due to not matching '{name_substr}'.")
+      # compare against the exclude list
+      # (the "ignore" list; it is only consulted inside this include-list branch)
+      for name_substr in self.ignore_kernel_names:
+        if self._filter_string_matches(name_substr, name):
+          _LOGGER.debug(f"Kernel {name} ignored due to filter string '{name_substr}'.")
+          enabled = False
+          break
+        else:
+          _LOGGER.debug(f"Kernel {name} NOT ignored due to not matching '{name_substr}'.")
+    # A non-empty kernel filter list overrides the decision made so far in both
+    # directions: a match enables the kernel, no match disables it.
+    if len(self.kernel_filter_list) > 0:
+      if self.filter_out_kernels(name, self.kernel_filter_list):
+        _LOGGER.debug(f"Kernel {name} matched via kernel filter file.")
+        enabled = True
+      else:
+        _LOGGER.debug(f"Kernel {name} culled due to no match in kernel filter file.")
+        enabled = False
+    # CUTLASS_LIBRARY_IGNORE_KERNELS ("ignore" list) only takes effect
+    # if CUTLASS_LIBRARY_KERNELS was specified.
+    # Changing that would break backwards compatibility.
+    # Thus, CUTLASS has introduced the new CMake option CUTLASS_LIBRARY_EXCLUDE_KERNELS,
+    # that always takes effect, whether or not CUTLASS_LIBRARY_KERNELS was specified.
+    for name_substr in self.exclude_kernel_names:
+      if self._filter_string_matches(name_substr, name):
+        _LOGGER.debug(f"Kernel {name} excluded due to filter string '{name_substr}'.")
+        enabled = False
+        break
+      else:
+        _LOGGER.debug(f"Kernel {name} NOT excluded due to not matching '{name_substr}'.")
+    # TODO: filter based on compute data type
+    return enabled
+  #
+  #
+  def append(self, operation):
+    '''
+      Inserts the operation.
+      operation_kind -> configuration_name -> []
+    '''
+    if self.filter(operation):
+      self.selected_kernels.append(operation.procedural_name())
+      self.operations_by_name[operation.procedural_name()] = operation
+      # add the configuration
+      configuration_name = operation.configuration_name()
+      # Split operations by minimum CC
+      min_cc = operation.arch
+      if operation.operation_kind not in self.operations.keys():
+        self.operations[operation.operation_kind] = {}
+      if min_cc not in self.operations[operation.operation_kind]:
+        self.operations[operation.operation_kind][min_cc] = {}
+      if configuration_name not in self.operations[operation.operation_kind][min_cc].keys():
+        self.operations[operation.operation_kind][min_cc][configuration_name] = []
+      self.operations[operation.operation_kind][min_cc][configuration_name].append(operation)
+      self.operation_count += 1
+    else:
+      _LOGGER.debug("Culled {} from manifest".format(operation.procedural_name()))
+  #
+  def emit_manifest_cmake(self, manifest_path, top_level_path, source_files):
+    with open(manifest_path, "w") as manifest_file:
+      target_text = SubstituteTemplate("""cutlass_target_sources(cutlass_library_objs PRIVATE
+      """, { })
+      manifest_file.write(target_text + '\n\n')
+      manifest_file.write("    %s\n" % str(top_level_path.replace('\\', '/')))
+      generated_path = os.path.join(self.curr_build_dir, 'generated')
+      for kind in self.operations.keys():
+        kind_str = OperationKindNames[kind]
+        all_kind_file = os.path.join(generated_path, kind_str, f"all_{kind_str}_operations.cu").replace('\\', '/')
+        manifest_file.write(f"    {all_kind_file}\n")
+      manifest_file.write(')\n\n')
+      for kind in self.operations.keys():
+        for min_cc in sorted(self.operations[kind].keys()):
+          for subclass in sorted(source_files[kind][min_cc].keys()):
+            target_text = SubstituteTemplate("""cutlass_add_cutlass_library(
+      SUFFIX ${kind}_sm${min_cc}_${subclass}
+""", { 'min_cc': str(min_cc), 'kind': OperationKindNames[kind], 'subclass': subclass })
+            manifest_file.write(target_text + '\n\n')
+            for source_file in source_files[kind][min_cc][subclass]:
+              manifest_file.write("    %s\n" % str(source_file.replace('\\', '/')))
+            manifest_file.write(")\n")
+          if self.disable_full_archs_compilation:
+            self.emit_disable_full_archs_compilation(manifest_file, source_files)
+  def emit_disable_full_archs_compilation(manifest_file, source_files):
+      def for_hopper(name):
+          pass
+      def for_ampere(name):
+          return "16816" in name or \
+                  "16832" in name or \
+                  "16864" in name or \
+                  ("1688" in name and "tf32" in name)
+      def for_turing(name):
+          return ("1688" in name and "tf32" not in name) or \
+                  "8816" in name
+      def for_volta(name):
+          return "884" in name
+      def is_cpp(name):
+          return name.endswith(".cpp")
+      def get_src_archs_str_given_requested_cuda_archs(archs, source_file):
+          intersected_archs = archs & set(self.compute_capabilities_baseline)
+          if intersected_archs == set():
+              raise RuntimeError(
+                    """
+                    Empty archs set for file {} after taking
+                    the intersection of {} (global requested archs) and
+                    {} (per file requested archs)
+                    """.format(source_file, set(self.compute_capabilities_baseline), archs))
+          else:
+              return " ".join(map(str, intersected_archs))
+      for min_cc in sorted(source_files.keys()):
+        for source_file in source_files[min_cc]:
+            if is_cpp(source_file):
+                continue # skip because source is cpp
+            elif for_ampere(source_file):
+                archs_str = get_src_archs_str_given_requested_cuda_archs({80, 87, 90}, source_file)
+            elif for_turing(source_file):
+                archs_str = get_src_archs_str_given_requested_cuda_archs({75}, source_file)
+            elif for_volta(source_file):
+                archs_str = get_src_archs_str_given_requested_cuda_archs({70, 72}, source_file)
+            else:
+                raise RuntimeError("Per file archs are not set {}, as there is no rule specified for this file pattern".format(source_file))
+            manifest_file.write("cutlass_apply_cuda_gencode_flags({} SM_ARCHS {})\n".format(str(source_file.replace('\\', '/')), archs_str))
+  #
+  def emit(self, target = GeneratorTarget.Library):
+    operation_emitters = {
+      GeneratorTarget.Library: EmitOperationKindLibrary
+    }
+    # Emitters for all operations that fall under a particular kind (e.g., GEMM, Conv2d)
+    kind_emitters = {
+      GeneratorTarget.Library: EmitOperationKindAll
+    }
+    interface_emitters = {
+      GeneratorTarget.Library: EmitInterfaceLibrary
+    }
+    generated_path = os.path.join(self.curr_build_dir, 'generated')
+    # create generated/
+    if os.path.exists(generated_path):
+      shutil.rmtree(generated_path)
+    os.mkdir(generated_path)
+    with interface_emitters[target](generated_path, self.operation_count, self.args) as iface_emitter:
+      top_level_path = iface_emitter.top_level_path
+      for operation_kind in self.operations.keys():
+        iface_emitter.emit(OperationKindNames[operation_kind])
+    source_files = {}
+    for kind in self.operations.keys():
+      source_files[kind] = {}
+      for min_cc in self.operations[kind].keys():
+        source_files[kind][min_cc] = {}
+    for operation_kind, ops in self.operations.items():
+      for min_cc, configurations in sorted(ops.items()):
+        with operation_emitters[target](generated_path, min_cc, operation_kind, self.args) as operation_kind_emitter:
+          for configuration_name, operations in configurations.items():
+            _LOGGER.info(f"Emitting {configuration_name} with {len(operations)} operation{'' if len(operations) == 1 else 's'}.")
+            operation_kind_emitter.emit(configuration_name, operations)
+          for subclass, files in operation_kind_emitter.source_files.items():
+            if subclass not in source_files[operation_kind][min_cc]:
+              source_files[operation_kind][min_cc][subclass] = []
+            source_files[operation_kind][min_cc][subclass].extend(operation_kind_emitter.source_files[subclass])
+      # Emit top level all_{gemm, conv2d, ...}_operations.cu files
+      with kind_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter:
+        operation_kind_emitter.emit(ops)
+    # write the manifest.cmake file containing paths from all targets
+    manifest_path = os.path.join(generated_path, "manifest.cmake")
+    self.emit_manifest_cmake(manifest_path, top_level_path, source_files)
+###################################################################################################

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/rank_2k_operation.py ADDED Viewed

	@@ -0,0 +1,438 @@

+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for emitting Rank2K kernels
+"""
+import enum
+import functools
+import operator
+import os.path
+import shutil
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+###################################################################################################
+#
+# Data structure modeling a Rank 2K update operation
+#
+###################################################################################################
+#
+class Rank2KOperation:
+  #
+  def __init__(self, rank_k_kind, arch, tile_description, A, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+      blas_mode = BlasMode.symmetric):
+    self.blas_mode = blas_mode
+    self.operation_kind = OperationKind.Rank2K
+    self.arch = arch
+    self.tile_description = tile_description
+    self.rank_k_kind = rank_k_kind
+    # tensor A and B have same data type and layout
+    self.A = A
+    self.B = A
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.swizzling_functor = swizzling_functor
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex,
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+    return False
+  #
+  def is_mixed_input(self):
+    return self.A.element != self.B.element
+  #
+  def is_planar_complex(self):
+    return False
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+    if self.is_complex():
+      return get_complex_from_real(accum)
+    return accum
+  #
+  def short_math_name(self):
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    inst_shape = ''
+    inst_operation = ''
+    intermediate_type = ''
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+      MathOperation.and_popc: 'and'
+    }
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \
+      self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
+      math_op = self.tile_description.math_instruction.math_operation
+      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      inst_shape += math_op_string
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+    operation_name = 'syr2k' if self.blas_mode == BlasMode.symmetric else 'her2k'
+    return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, operation_name)
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${element_c}_${core_name}_${element_a}"
+      elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${core_name}_${element_a}"
+      else:
+        extended_name = "${core_name}"
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+    return extended_name
+  #
+  def layout_name(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)]
+      )
+    return "%s" % (ShortLayoutTypeNames[self.A.layout])
+  #
+  def fill_mode_name(self):
+    return "%s" % (ShortFillModeNames[self.C.fill_mode])
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    threadblock = self.tile_description.procedural_name()
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+    alignment = max([self.A.alignment, self.C.alignment])
+    return SubstituteTemplate(
+      "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${fill_mode}_align${alignment}",
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'fill_mode': self.fill_mode_name(),
+        'alignment': "%d" % self.A.alignment,
+      }
+    )
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.procedural_name()
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+#
+class EmitRank2KUniversalInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self):
+    self.rank_k_template = """
+// Rank K operator ${operation_name}
+using Operation_${operation_name} =
+  typename cutlass::gemm::device::Rank2K<
+    ${element_a}, ${layout_a},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c}, ${fill_mode},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation}
+>;
+"""
+    self.rank_k_complex_template = """
+// Rank K operator ${operation_name}
+using Operation_${operation_name} =
+  typename cutlass::gemm::device::Rank2K<
+    ${element_a}, ${layout_a},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c}, ${fill_mode},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation},
+    ${transform_a},
+    ${transform_b},
+    ${blas_mode}
+>;
+"""
+  def emit(self, operation):
+    threadblock_shape = operation.tile_description.threadblock_shape
+    warp_count = operation.tile_description.warp_count
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'fill_mode': FillModeTag[operation.C.fill_mode],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'split_k_serial': 'false',
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'blas_mode': BlasModeTag[operation.blas_mode]
+    }
+    rank_k_template = self.rank_k_complex_template if operation.is_complex() else self.rank_k_template
+    return SubstituteTemplate(rank_k_template, values)
+###################################################################################################
+###################################################################################################
+#
+# Emitter functions for all targets
+#
+###################################################################################################
+class EmitRank2KConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+    self.instance_emitter = {
+      RankKKind.Universal: EmitRank2KUniversalInstance,
+    }
+    self.rank_k_kind_wrappers = {
+      RankKKind.Universal: 'Rank2KOperation',
+    }
+    self.instance_template = {
+      RankKKind.Universal: """
+${compile_guard_start}
+  manifest.append(new ${rank_k_kind}<
+    Operation_${operation_name}
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+    }
+    self.header_template = """
+/*
+  Generated by rank_2k_operation.py - Do not edit.
+*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+#include "library_internal.h"
+#include "rank_2k_operation.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.initialize_function_template = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace library {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+void initialize_${configuration_name}(Manifest &manifest) {
+"""
+    self.epilogue_template = """
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+  def __enter__(self):
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+    self.instance_definitions = []
+    self.instance_wrappers = []
+    self.operations = []
+    return self
+  def emit(self, operation):
+    emitter = self.instance_emitter[operation.rank_k_kind]()
+    self.operations.append(operation)
+    self.instance_definitions.append(emitter.emit(operation))
+    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.rank_k_kind], {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'rank_k_kind': self.rank_k_kind_wrappers[operation.rank_k_kind],
+      'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "",
+      'compile_guard_end': "#endif" \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else ""
+      }))
+  def __exit__(self, exception_type, exception_value, traceback):
+    # Write instance definitions in top-level namespace
+    for instance_definition in self.instance_definitions:
+      self.configuration_file.write(instance_definition)
+    # Add wrapper objects within initialize() function
+    self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+      'configuration_name': self.configuration_name
+      }))
+    for instance_wrapper in self.instance_wrappers:
+      self.configuration_file.write(instance_wrapper)
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+###################################################################################################

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/rank_k_operation.py ADDED Viewed

	@@ -0,0 +1,427 @@

+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for emitting RankK kernels
+"""
+import enum
+import functools
+import operator
+import os.path
+import shutil
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+###################################################################################################
+#
+# Data structure modeling a Rank K update operation
+#
+###################################################################################################
+#
+class RankKOperation:
+  ''' Data model of a single Rank K update kernel; derives the names used when emitting it. '''
+  #
+  def __init__(self, rank_k_kind, arch, tile_description, A, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+      blas_mode = BlasMode.symmetric):
+    ''' Record the kernel parameters. operation_kind is always OperationKind.RankK. '''
+    self.blas_mode = blas_mode
+    self.operation_kind = OperationKind.RankK
+    self.arch = arch
+    self.tile_description = tile_description
+    self.rank_k_kind = rank_k_kind
+    # A and C are operand descriptions; this class reads their element, layout, alignment,
+    # complex_transform (A only) and fill_mode (C only) attributes.
+    self.A = A
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.swizzling_functor = swizzling_functor
+  #
+  def is_complex(self):
+    ''' True when the tile's math operation is one of the complex multiply-add variants. '''
+    complex_operators = [
+      MathOperation.multiply_add_complex,
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+  #
+  def is_mixed_input(self):
+    ''' Always False for this operation. '''
+    return False
+  #
+  def is_planar_complex(self):
+    ''' Always False for this operation. '''
+    return False
+  #
+  def accumulator_type(self):
+    ''' Accumulator element type, mapped through get_complex_from_real() for complex operations. '''
+    accum = self.tile_description.math_instruction.element_accumulator
+    if self.is_complex():
+      return get_complex_from_real(accum)
+    return accum
+  #
+  def short_math_name(self):
+    ''' Short accumulator type name, prefixed with "g" for the Gaussian complex multiply-add. '''
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    math_instruction = self.tile_description.math_instruction
+    inst_shape = ''
+    intermediate_type = ''
+    # Suffix appended to the instruction shape for the popc math operations
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+      MathOperation.and_popc: 'and'
+    }
+    # The instruction shape and intermediate type are only part of the name for tensor-op classes
+    if math_instruction.opcode_class == OpcodeClass.TensorOp or \
+      math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
+      inst_shape = "%d%d%d" % tuple(math_instruction.instruction_shape)
+      inst_shape += math_operations_map.get(math_instruction.math_operation, '')
+      if math_instruction.element_a != self.A.element and \
+        math_instruction.element_a != math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[math_instruction.element_a]
+    operation_name = 'syrk' if self.blas_mode == BlasMode.symmetric else 'herk'
+    return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, operation_name)
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    accumulator = self.tile_description.math_instruction.element_accumulator
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.C.element != accumulator and \
+        self.A.element != accumulator:
+        extended_name = "${element_c}_${core_name}_${element_a}"
+      elif self.C.element == accumulator and  \
+        self.A.element != accumulator:
+        extended_name = "${core_name}_${element_a}"
+      else:
+        extended_name = "${core_name}"
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+    return extended_name
+  #
+  def layout_name(self):
+    ''' Short layout name of A; complex operations also key on A's complex transform. '''
+    if self.is_complex() or self.is_planar_complex():
+      return "%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)]
+      )
+    return "%s" % (ShortLayoutTypeNames[self.A.layout])
+  #
+  def fill_mode_name(self):
+    ''' Short name of C's fill mode. '''
+    return "%s" % (ShortFillModeNames[self.C.fill_mode])
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates opcode class, extended name, tile size, layout, fill mode, and alignment. '''
+    threadblock = self.tile_description.procedural_name()
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+    return SubstituteTemplate(
+      "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${fill_mode}_align${alignment}",
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'fill_mode': self.fill_mode_name(),
+        # Only A's alignment is encoded in the name
+        'alignment': "%d" % self.A.alignment,
+      }
+    )
+  #
+  def configuration_name(self):
+    ''' The configuration name is identical to the procedural name. '''
+    return self.procedural_name()
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+#
+class EmitRankKUniversalInstance:
+  ''' Renders the C++ `using Operation_<name> = ...` definition for one Rank K operation. '''
+  def __init__(self):
+    # Template used for non-complex operations
+    self.rank_k_template = """
+// Rank K operator ${operation_name}
+using Operation_${operation_name} =
+  typename cutlass::gemm::device::RankK<
+    ${element_a}, ${layout_a},
+    ${element_c}, ${layout_c}, ${fill_mode},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${split_k_serial},
+    ${math_operation}
+>;
+"""
+    # Template used for complex operations: adds the A transform and the BLAS mode
+    self.rank_k_complex_template = """
+// Rank K operator ${operation_name}
+using Operation_${operation_name} =
+  typename cutlass::gemm::device::RankK<
+    ${element_a}, ${layout_a},
+    ${element_c}, ${layout_c}, ${fill_mode},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${split_k_serial},
+    ${math_operation},
+    ${transform_a},
+    ${blas_mode}
+>;
+"""
+  def emit(self, operation):
+    ''' Fill in the matching template for `operation` and return the resulting text. '''
+    tile = operation.tile_description
+    math_inst = tile.math_instruction
+    # Per-warp tile: threadblock extent divided by the warp count along each of the three axes
+    warp_shape = [tile.threadblock_shape[axis] // tile.warp_count[axis] for axis in range(3)]
+    # Epilogue vector length in C elements, capped at 128 (DataTypeSize is presumably in bits; confirm)
+    c_size = DataTypeSize[operation.C.element]
+    epilogue_vector_length = int(min(operation.C.alignment * c_size, 128) / c_size)
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'fill_mode': FillModeTag[operation.C.fill_mode],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[math_inst.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+    }
+    # Expand the three GemmShape arguments into their _m / _n / _k placeholders
+    for prefix, extents in (
+        ('threadblock_shape', tile.threadblock_shape),
+        ('warp_shape', warp_shape),
+        ('instruction_shape', math_inst.instruction_shape)):
+      for idx, axis in enumerate('mnk'):
+        values['%s_%s' % (prefix, axis)] = str(extents[idx])
+    values.update({
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(tile.stages),
+      'align_a': str(operation.A.alignment),
+      'split_k_serial': 'false',
+      'math_operation': MathOperationTag[math_inst.math_operation],
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'blas_mode': BlasModeTag[operation.blas_mode]
+    })
+    if operation.is_complex():
+      chosen_template = self.rank_k_complex_template
+    else:
+      chosen_template = self.rank_k_template
+    return SubstituteTemplate(chosen_template, values)
+###################################################################################################
+###################################################################################################
+#
+# Emitter functions for all targets
+#
+###################################################################################################
+class EmitRankKConfigurationLibrary:
+  ''' Context manager that writes one `<configuration_name>.cu` file of Rank K instances.
+
+  __enter__ opens the file and writes the header; emit() collects one definition and one
+  manifest wrapper per operation; __exit__ writes everything collected and closes the file.
+  '''
+  def __init__(self, operation_path, configuration_name):
+    ''' Build the output path under `operation_path` and set up the text templates. '''
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+    self.instance_emitter = {
+      RankKKind.Universal: EmitRankKUniversalInstance,
+    }
+    self.rank_k_kind_wrappers = {
+      RankKKind.Universal: 'RankKOperation',
+    }
+    self.instance_template = {
+      RankKKind.Universal: """
+${compile_guard_start}
+  manifest.append(new ${rank_k_kind}<
+    Operation_${operation_name}
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+    }
+    # Opening preprocessor guard for WMMA instances; emit() closes it with "#endif".
+    # emit() previously read this attribute without it ever being defined.
+    self.wmma_guard_start = "#if defined(CUTLASS_ARCH_WMMA_SM${sm_number}_ENABLED)"
+    self.header_template = """
+/*
+  Generated by rank_k_operation.py - Do not edit.
+*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+#include "library_internal.h"
+#include "rank_k_operation.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.initialize_function_template = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace library {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+void initialize_${configuration_name}(Manifest &manifest) {
+"""
+    self.epilogue_template = """
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+  def __enter__(self):
+    ''' Open the configuration file, write the header, and reset the collected state. '''
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+    self.instance_definitions = []
+    self.instance_wrappers = []
+    self.operations = []
+    return self
+  def emit(self, operation):
+    ''' Collect the definition and the manifest wrapper for `operation`; nothing is written yet. '''
+    emitter = self.instance_emitter[operation.rank_k_kind]()
+    self.operations.append(operation)
+    self.instance_definitions.append(emitter.emit(operation))
+    # Only WMMA tensor-op instances are wrapped in a compile guard
+    is_wmma = operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp
+    if is_wmma:
+      compile_guard_start = SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)})
+      compile_guard_end = "#endif"
+    else:
+      compile_guard_start = ""
+      compile_guard_end = ""
+    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.rank_k_kind], {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'rank_k_kind': self.rank_k_kind_wrappers[operation.rank_k_kind],
+      'compile_guard_start': compile_guard_start,
+      'compile_guard_end': compile_guard_end
+      }))
+  def __exit__(self, exception_type, exception_value, traceback):
+    ''' Write definitions, the initialize() function and the epilogue, then close the file.
+
+    The exception arguments are not inspected and nothing is returned, so an exception
+    raised inside the `with` body still propagates.
+    '''
+    try:
+      # Write instance definitions in top-level namespace
+      for instance_definition in self.instance_definitions:
+        self.configuration_file.write(instance_definition)
+      # Add wrapper objects within initialize() function
+      self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+        'configuration_name': self.configuration_name
+        }))
+      for instance_wrapper in self.instance_wrappers:
+        self.configuration_file.write(instance_wrapper)
+      self.configuration_file.write(self.epilogue_template)
+    finally:
+      # Close the handle even when one of the writes above raises
+      self.configuration_file.close()
+###################################################################################################

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/sm100_shapes.py ADDED Viewed

	@@ -0,0 +1,342 @@

+#################################################################################################
+#
+# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Valid tcgen05 shapes and cluster sizes for SM100, associated with levels.
+These shape and level pairs are defined as dicts, where keys are shapes and values are their
+associated levels. If the user input level for that category (tcgen05 shape, cluster
+size) is smaller than a shape's associated level, it will be excluded, and otherwise, included.
+Shapes with higher levels are therefore emitted less often, while shapes with lower levels are emitted more frequently.
+Level 0 is always emitted.
+"""
+try:
+    from .library import DynamicClusterShape
+except ImportError:
+    # Relative import failed (e.g. the module was not loaded as part of a package): use the top-level module.
+    from library import DynamicClusterShape
+# Cluster shape -> level table for the 1SM configuration.
+# Per the module docstring, a shape is excluded when the requested cluster level is smaller
+# than its level here; the dynamic cluster shape is level 0 and therefore always included.
+SM100_CLUSTER_SHAPES_1SM = {
+    tuple(DynamicClusterShape) : 0,
+    # size 1 cluster
+    (1, 1, 1): 1,
+    # size 2 cluster
+    (1, 2, 1): 2,
+    (2, 1, 1): 5,
+    # size 4 clusters
+    (2, 2, 1): 6,
+    (1, 4, 1): 3,
+    (4, 1, 1): 6,
+    # size 8 clusters
+    (2, 4, 1): 7,
+    (4, 2, 1): 7,
+    (1, 8, 1): 8,
+    (8, 1, 1): 8,
+    # size 16 cluster
+    (4, 4, 1): 4,
+}
+# Cluster shape -> level table for the 2SM configuration.
+# Every fixed shape listed here has a first dimension that is a multiple of 2; the dynamic
+# cluster shape is level 0 and therefore always included.
+SM100_CLUSTER_SHAPES_2SM = {
+    tuple(DynamicClusterShape) : 0,
+    # size 2 cluster
+    (2, 1, 1): 1,
+    # size 4 clusters
+    (2, 2, 1): 2,
+    (4, 1, 1): 2,
+    # size 8 clusters
+    (2, 4, 1): 3,
+    (4, 2, 1): 3,
+    (8, 1, 1): 6,
+    # size 16 cluster
+    (4, 4, 1): 4,
+}
+# MMA shapes
+# 16b Dense
+# 1SM table: MMA instruction shape -> minimum tcgen05 level at which it is generated (0 = always).
+# The shape tuples are presumably (M, N, K); confirm against MathInstruction.
+SM100_MMA_SHAPES_16b_DENSE_1SM = {
+    (64,   8, 16): 5,
+    (64,  16, 16): 2,
+    (64,  24, 16): 5,
+    (64,  32, 16): 2,
+    (64,  40, 16): 5,
+    (64,  48, 16): 5,
+    (64,  56, 16): 5,
+    (64,  64, 16): 2,
+    (64,  72, 16): 5,
+    (64,  80, 16): 5,
+    (64,  88, 16): 5,
+    (64,  96, 16): 5,
+    (64, 104, 16): 5,
+    (64, 112, 16): 5,
+    (64, 120, 16): 5,
+    (64, 128, 16): 0,
+    (64, 136, 16): 5,
+    (64, 144, 16): 5,
+    (64, 152, 16): 5,
+    (64, 160, 16): 5,
+    (64, 168, 16): 5,
+    (64, 176, 16): 5,
+    (64, 184, 16): 5,
+    (64, 192, 16): 3,
+    (64, 200, 16): 5,
+    (64, 208, 16): 5,
+    (64, 216, 16): 5,
+    (64, 224, 16): 5,
+    (64, 232, 16): 5,
+    (64, 240, 16): 5,
+    (64, 248, 16): 5,
+    (64, 256, 16): 3,
+    (128,  16, 16): 2,
+    (128,  32, 16): 2,
+    (128,  48, 16): 5,
+    (128,  64, 16): 2,
+    (128,  80, 16): 5,
+    (128,  96, 16): 5,
+    (128, 112, 16): 5,
+    (128, 128, 16): 0,
+    (128, 144, 16): 5,
+    (128, 160, 16): 5,
+    (128, 176, 16): 5,
+    (128, 192, 16): 3,
+    (128, 208, 16): 5,
+    (128, 224, 16): 5,
+    (128, 240, 16): 5,
+    (128, 256, 16): 0,
+}
+# 2SM table: MMA instruction shape -> minimum tcgen05 level at which it is generated (0 = always).
+SM100_MMA_SHAPES_16b_DENSE_2SM = {
+    (128,  32, 16): 2,
+    (128,  64, 16): 2,
+    (128,  96, 16): 5,
+    (128, 128, 16): 0,
+    (128, 160, 16): 5,
+    (128, 192, 16): 5,
+    (128, 224, 16): 5,
+    (128, 256, 16): 0,
+    (256,  32, 16): 2,
+    (256,  64, 16): 2,
+    (256,  96, 16): 5,
+    (256, 128, 16): 0,
+    (256, 160, 16): 5,
+    (256, 192, 16): 3,
+    (256, 224, 16): 5,
+    (256, 256, 16): 0,
+}
+# TF32 Dense
+# 1SM table: MMA instruction shape -> minimum tcgen05 level at which it is generated (0 = always).
+# The last tuple entry is 8 for every TF32 shape.
+SM100_MMA_SHAPES_TF32_DENSE_1SM = {
+    (64,   8, 8): 5,
+    (64,  16, 8): 2,
+    (64,  24, 8): 5,
+    (64,  32, 8): 2,
+    (64,  40, 8): 5,
+    (64,  48, 8): 5,
+    (64,  56, 8): 5,
+    (64,  64, 8): 1,
+    (64,  72, 8): 5,
+    (64,  80, 8): 5,
+    (64,  88, 8): 5,
+    (64,  96, 8): 5,
+    (64, 104, 8): 5,
+    (64, 112, 8): 5,
+    (64, 120, 8): 5,
+    (64, 128, 8): 0,
+    (64, 136, 8): 5,
+    (64, 144, 8): 5,
+    (64, 152, 8): 5,
+    (64, 160, 8): 5,
+    (64, 168, 8): 5,
+    (64, 176, 8): 5,
+    (64, 184, 8): 5,
+    (64, 192, 8): 3,
+    (64, 200, 8): 5,
+    (64, 208, 8): 5,
+    (64, 216, 8): 5,
+    (64, 224, 8): 5,
+    (64, 232, 8): 5,
+    (64, 240, 8): 5,
+    (64, 248, 8): 5,
+    (64, 256, 8): 3,
+    (128,  16, 8): 2,
+    (128,  32, 8): 2,
+    (128,  48, 8): 5,
+    (128,  64, 8): 2,
+    (128,  80, 8): 5,
+    (128,  96, 8): 5,
+    (128, 112, 8): 5,
+    (128, 128, 8): 0,
+    (128, 144, 8): 5,
+    (128, 160, 8): 5,
+    (128, 176, 8): 5,
+    (128, 192, 8): 3,
+    (128, 208, 8): 5,
+    (128, 224, 8): 5,
+    (128, 240, 8): 5,
+    (128, 256, 8): 0,
+}
+# 2SM table: MMA instruction shape -> minimum tcgen05 level at which it is generated (0 = always).
+SM100_MMA_SHAPES_TF32_DENSE_2SM = {
+    (128,  32, 8): 2,
+    (128,  64, 8): 1,
+    (128,  96, 8): 5,
+    (128, 128, 8): 0,
+    (128, 160, 8): 5,
+    (128, 192, 8): 5,
+    (128, 224, 8): 5,
+    (128, 256, 8): 0,
+    (256,  32, 8): 2,
+    (256,  64, 8): 1,
+    (256,  96, 8): 5,
+    (256, 128, 8): 0,
+    (256, 160, 8): 5,
+    (256, 192, 8): 5,
+    (256, 224, 8): 5,
+    (256, 256, 8): 0,
+}
+# F8F6F4
+# 1SM table: MMA instruction shape -> minimum tcgen05 level at which it is generated (0 = always).
+# The last tuple entry is 32 for every F8F6F4 shape.
+SM100_MMA_SHAPES_F8F6F4_DENSE_1SM = {
+    (64,   8, 32): 4,
+    (64,  16, 32): 4,
+    (64,  24, 32): 5,
+    (64,  32, 32): 3,
+    (64,  40, 32): 5,
+    (64,  48, 32): 5,
+    (64,  56, 32): 5,
+    (64,  64, 32): 2,
+    (64,  72, 32): 5,
+    (64,  80, 32): 5,
+    (64,  88, 32): 5,
+    (64,  96, 32): 5,
+    (64, 104, 32): 5,
+    (64, 112, 32): 5,
+    (64, 120, 32): 5,
+    (64, 128, 32): 0,
+    (64, 136, 32): 5,
+    (64, 144, 32): 5,
+    (64, 152, 32): 5,
+    (64, 160, 32): 5,
+    (64, 168, 32): 5,
+    (64, 176, 32): 5,
+    (64, 184, 32): 5,
+    (64, 192, 32): 5,
+    (64, 200, 32): 5,
+    (64, 208, 32): 5,
+    (64, 216, 32): 5,
+    (64, 224, 32): 5,
+    (64, 232, 32): 5,
+    (64, 240, 32): 5,
+    (64, 248, 32): 5,
+    (64, 256, 32): 0,
+    (128,  16, 32): 4,
+    (128,  32, 32): 3,
+    (128,  48, 32): 5,
+    (128,  64, 32): 2,
+    (128,  80, 32): 5,
+    (128,  96, 32): 5,
+    (128, 112, 32): 5,
+    (128, 128, 32): 0,
+    (128, 144, 32): 5,
+    (128, 160, 32): 5,
+    (128, 176, 32): 5,
+    (128, 192, 32): 5,
+    (128, 208, 32): 5,
+    (128, 224, 32): 5,
+    (128, 240, 32): 5,
+    (128, 256, 32): 0,
+}
+# 2SM table: MMA instruction shape -> minimum tcgen05 level at which it is generated (0 = always).
+SM100_MMA_SHAPES_F8F6F4_DENSE_2SM = {
+    (128,  32, 32): 3,
+    (128,  64, 32): 2,
+    (128,  96, 32): 5,
+    (128, 128, 32): 1,
+    (128, 160, 32): 5,
+    (128, 192, 32): 5,
+    (128, 224, 32): 5,
+    (128, 256, 32): 1,
+    (256,  32, 32): 2,
+    (256,  64, 32): 2,
+    (256,  96, 32): 5,
+    (256, 128, 32): 0,
+    (256, 160, 32): 5,
+    (256, 192, 32): 5,
+    (256, 224, 32): 5,
+    (256, 256, 32): 0,
+}
+# MXF8F6F4
+# 1SM table: MMA instruction shape -> associated level (per the module docstring, level 0 is always emitted).
+SM100_MMA_SHAPES_MXF8F6F4_DENSE_1SM = {
+    (128,  64, 32): 1,
+    (128, 128, 32): 0,
+    (128, 192, 32): 1,
+    (128, 256, 32): 0,
+}
+# 2SM table: MMA instruction shape -> associated level (per the module docstring, level 0 is always emitted).
+SM100_MMA_SHAPES_MXF8F6F4_DENSE_2SM = {
+    (256,  64, 32): 1,
+    (256, 128, 32): 0,
+    (256, 192, 32): 1,
+    (256, 256, 32): 0,
+}
+# MXF4NVF4
+# 1SM table: MMA instruction shape -> associated level (per the module docstring, level 0 is always emitted).
+# The last tuple entry is 64 for every MXF4NVF4 shape.
+SM100_MMA_SHAPES_MXF4NVF4_DENSE_1SM = {
+    (128,  64, 64): 1,
+    (128, 128, 64): 0,
+    (128, 192, 64): 1,
+    (128, 256, 64): 0,
+}
+# 2SM table: MMA instruction shape -> associated level (per the module docstring, level 0 is always emitted).
+SM100_MMA_SHAPES_MXF4NVF4_DENSE_2SM = {
+    # Multiples of 16 for N
+    (256,  64, 64): 1,
+    (256, 128, 64): 0,
+    (256, 192, 64): 1,
+    (256, 256, 64): 0,
+}

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/sm100_utils.py ADDED Viewed

	@@ -0,0 +1,661 @@

+#################################################################################################
+#
+# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for enumerating CUTLASS library SM100 kernels
+"""
+import argparse
+import enum
+from itertools import product
+import math
+import logging
+import os.path
+import shutil
+import sys
+import copy
+from typing import Any, Optional, Sequence, Tuple, List, Union, Callable
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+#### Step 0: define levels
+# One integer level controls multiple "generators" and how many
+# combinations they generate. That is the "global" level.
+# "Generators" are WGMMA shapes, MMA multipliers, cluster sizes, and
+# anything that is eventually involved in the Cartesian product
+# which yields our kernel configurations.
+# For simplicity, each generator defines their own levels,
+# starting from 0. As a rule we assume 10 or fewer levels, making
+# their level a digit.
+# The "global" level simply stacks these digits and represents them
+# as a single integer.
+#
+# For example, level 500 indicates cluster sizes are at level 5, MMA
+# multipliers are at level 0, and WGMMA shapes are at level 0 as well.
+#
+# Here we define the global level to generator level mappings.
+def get_tcgen05_level_from_global_level(global_level: int):
+    """Return the ones digit of the global level, i.e. the tcgen05 shape level."""
+    _, ones_digit = divmod(global_level, 10)
+    return ones_digit
+def get_mma_level_from_global_level(global_level: int):
+    """Return the tens digit of the global level, i.e. the MMA level."""
+    without_ones = global_level // 10
+    _, tens_digit = divmod(without_ones, 10)
+    return tens_digit
+def get_cluster_level_from_global_level(global_level: int):
+    """Return the hundreds digit of the global level, i.e. the cluster level."""
+    without_two_digits = global_level // 100
+    _, hundreds_digit = divmod(without_two_digits, 10)
+    return hundreds_digit
+def get_pruning_level_from_global_level(global_level: int):
+    """Return the thousands digit of the global level, i.e. the pruning level."""
+    without_three_digits = global_level // 1000
+    _, thousands_digit = divmod(without_three_digits, 10)
+    return thousands_digit
+#### Step 1: generate MMA instruction shapes based on levels
+try:
+    from .sm100_shapes import *
+except ImportError:
+    # Relative import failed (e.g. the module was not loaded as part of a package): use the top-level module.
+    from sm100_shapes import *
+###########
+def generate_tf32_math_instructions_sm100(level: int):
+    """
+    Generate the TF32 TensorOp math instructions enabled for SM100 at the given level.
+    A shape is included when the tcgen05 level extracted from `level` is greater than or
+    equal to the level associated with that shape in the TF32 shape tables.
+    Args:
+        level: The global level to generate math instructions for.
+    Returns:
+        A tuple of two lists of MathInstruction objects.
+        The first list contains the math instructions for 1SM, and the second list contains the math instructions for 2SM.
+    """
+    tcgen05_level = get_tcgen05_level_from_global_level(level)
+    def instructions_for(shape_levels):
+        # One multiply-add instruction per enabled shape, in table order
+        return [
+            MathInstruction(
+                shape,
+                DataType.tf32, DataType.tf32, DataType.f32,
+                OpcodeClass.TensorOp,
+                MathOperation.multiply_add)
+            for shape, min_level in shape_levels.items() if tcgen05_level >= min_level
+        ]
+    math_instructions_1sm = instructions_for(SM100_MMA_SHAPES_TF32_DENSE_1SM)
+    math_instructions_2sm = instructions_for(SM100_MMA_SHAPES_TF32_DENSE_2SM)
+    return math_instructions_1sm, math_instructions_2sm
+def generate_16b_math_instructions_sm100(level: int):
+    """
+    Generate the 16b TensorOp math instructions enabled for SM100 at the given level.
+    A shape is included when the tcgen05 level extracted from `level` is greater than or
+    equal to the level associated with that shape in the 16b shape tables. Every enabled
+    shape yields three instructions, in this order: (f16, f16, f16), (f16, f16, f32),
+    (bf16, bf16, f32).
+    Args:
+        level: The global level to generate math instructions for.
+    Returns:
+        A tuple of two lists of MathInstruction objects.
+        The first list contains the math instructions for 1SM, and the second list contains the math instructions for 2SM.
+    """
+    tcgen05_level = get_tcgen05_level_from_global_level(level)
+    # Data type triples passed to MathInstruction after the shape, emitted per shape in this order
+    data_type_triples = (
+        (DataType.f16, DataType.f16, DataType.f16),
+        (DataType.f16, DataType.f16, DataType.f32),
+        (DataType.bf16, DataType.bf16, DataType.f32),
+    )
+    def instructions_for(shape_levels):
+        # Shapes stay in table order; the data type triples vary fastest
+        return [
+            MathInstruction(
+                shape,
+                first_type, second_type, third_type,
+                OpcodeClass.TensorOp,
+                MathOperation.multiply_add)
+            for shape, min_level in shape_levels.items() if tcgen05_level >= min_level
+            for first_type, second_type, third_type in data_type_triples
+        ]
+    math_instructions_1sm = instructions_for(SM100_MMA_SHAPES_16b_DENSE_1SM)
+    math_instructions_2sm = instructions_for(SM100_MMA_SHAPES_16b_DENSE_2SM)
+    return math_instructions_1sm, math_instructions_2sm
+def generate_fp8_math_instructions_sm100(level: int, enable_runtime_dtype = True, enable_compile_time_dtype = True):
+    """
+    Generate all TensorOp math instructions for FP8 MMA that are supported by SM100 at or above the given level.
+    Args:
+        level: The global level to generate math instructions for.
+        enable_runtime_dtype: Whether to generate runtime dtype math instructions.
+        enable_compile_time_dtype: Whether to generate compile time dtype math instructions.
+    Returns:
+        A tuple of two lists of MathInstruction objects.
+        The first list contains the math instructions for 1SM, and the second list contains the math instructions for 2SM.
+    """
+    tcgen05_level = get_tcgen05_level_from_global_level(level)
+    pruning_level = get_pruning_level_from_global_level(level)
+    math_instructions_1sm = []
+    math_instructions_2sm = []
+    shapes_1sm = [
+        shape for shape, min_level in SM100_MMA_SHAPES_F8F6F4_DENSE_1SM.items() if tcgen05_level >= min_level
+    ]
+    shapes_2sm = [
+        shape for shape, min_level in SM100_MMA_SHAPES_F8F6F4_DENSE_2SM.items() if tcgen05_level >= min_level
+    ]
+    for shape in shapes_1sm:
+        if enable_runtime_dtype:
+            math_instructions_1sm.append(
+              MathInstruction(
+                  shape,
+                  DataType.f8, DataType.f8, DataType.f32,
+                  OpcodeClass.TensorOp,
+                  MathOperation.multiply_add)
+            )
+        if enable_compile_time_dtype:
+            math_instructions_1sm.append(
+              MathInstruction(
+                  shape,
+                  DataType.e4m3, DataType.e4m3, DataType.f32,
+                  OpcodeClass.TensorOp,
+                  MathOperation.multiply_add)
+            )
+            math_instructions_1sm.append(
+              MathInstruction(
+                  shape,
+                  DataType.e5m2, DataType.e4m3, DataType.f32,
+                  OpcodeClass.TensorOp,
+                  MathOperation.multiply_add)
+            )
+            math_instructions_1sm.append(
+              MathInstruction(
+                  shape,
+                  DataType.e4m3, DataType.e5m2, DataType.f32,
+                  OpcodeClass.TensorOp,
+                  MathOperation.multiply_add)
+            )
+            if pruning_level >= 2:
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      DataType.e5m2, DataType.e5m2, DataType.f32,
+                      OpcodeClass.TensorOp,
+                      MathOperation.multiply_add)
+                )
+    for shape in shapes_2sm:
+        if enable_runtime_dtype:
+            math_instructions_2sm.append(
+              MathInstruction(
+                  shape,
+                  DataType.f8, DataType.f8, DataType.f32,
+                  OpcodeClass.TensorOp,
+                  MathOperation.multiply_add)
+            )
+        if enable_compile_time_dtype:
+            math_instructions_2sm.append(
+              MathInstruction(
+                  shape,
+                  DataType.e4m3, DataType.e4m3, DataType.f32,
+                  OpcodeClass.TensorOp,
+                  MathOperation.multiply_add)
+            )
+            math_instructions_2sm.append(
+              MathInstruction(
+                  shape,
+                  DataType.e5m2, DataType.e4m3, DataType.f32,
+                  OpcodeClass.TensorOp,
+                  MathOperation.multiply_add)
+            )
+            math_instructions_2sm.append(
+              MathInstruction(
+                  shape,
+                  DataType.e4m3, DataType.e5m2, DataType.f32,
+                  OpcodeClass.TensorOp,
+                  MathOperation.multiply_add)
+            )
+            if pruning_level >= 2:
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      DataType.e5m2, DataType.e5m2, DataType.f32,
+                      OpcodeClass.TensorOp,
+                      MathOperation.multiply_add)
+                )
+    return math_instructions_1sm, math_instructions_2sm
+def generate_f8f6f4_math_instructions_sm100(level: int, enable_runtime_dtype = True, enable_compile_time_dtype = True):
+    """
+    Generate all TensorOp math instructions for FP8 FP6 and FP4 MMA that are supported by SM100 at or above the given level.
+    Args:
+        level: The global level to generate math instructions for.
+        enable_runtime_dtype: Whether to generate runtime dtype math instructions.
+        enable_compile_time_dtype: Whether to generate compile time dtype math instructions.
+    Returns:
+        A tuple of two lists of MathInstruction objects.
+        The first list contains the math instructions for 1SM, and the second list contains the math instructions for 2SM.
+    """
+    tcgen05_level = get_tcgen05_level_from_global_level(level)
+    math_instructions_1sm = []
+    math_instructions_2sm = []
+    shapes_1sm = [
+        shape for shape, min_level in SM100_MMA_SHAPES_F8F6F4_DENSE_1SM.items() if tcgen05_level >= min_level
+    ]
+    shapes_2sm = [
+        shape for shape, min_level in SM100_MMA_SHAPES_F8F6F4_DENSE_2SM.items() if tcgen05_level >= min_level
+    ]
+    for shape in shapes_1sm:
+        if enable_runtime_dtype:
+            runtime_types = [ DataType.f8, DataType.f6, DataType.f4 ]
+            for a_type, b_type in product(runtime_types, repeat=2):
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.TensorOp,
+                      MathOperation.multiply_add)
+                )
+        if enable_compile_time_dtype:
+            compile_time_types = [ DataType.e4m3, DataType.e5m2, DataType.e3m2, DataType.e2m1 ]
+            for a_type, b_type in product(compile_time_types, repeat=2):
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.TensorOp,
+                      MathOperation.multiply_add)
+                )
+    for shape in shapes_2sm:
+        if enable_runtime_dtype:
+            runtime_types = [ DataType.f8, DataType.f6, DataType.f4 ]
+            for a_type, b_type in product(runtime_types, repeat=2):
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.TensorOp,
+                      MathOperation.multiply_add)
+                )
+        if enable_compile_time_dtype:
+            compile_time_types = [ DataType.e4m3, DataType.e5m2, DataType.e3m2, DataType.e2m1 ]
+            for a_type, b_type in product(compile_time_types, repeat=2):
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.TensorOp,
+                      MathOperation.multiply_add)
+                )
+    return math_instructions_1sm, math_instructions_2sm
+def generate_mxf8f6f4_math_instructions_sm100(level: int, enable_runtime_dtype = True, enable_compile_time_dtype = True):
+    """
+    Generate all BlockScaledTensorOp math instructions for MXFP8, MXFP6, and MXFP4 MMA that are supported by SM100 at or above the given level.
+    Args:
+        level: The global level to generate math instructions for.
+        enable_runtime_dtype: Whether to generate runtime dtype math instructions.
+        enable_compile_time_dtype: Whether to generate compile time dtype math instructions.
+    Returns:
+        A tuple of two lists of MathInstruction objects.
+        The first list contains the math instructions for 1SM, and the second list contains the math instructions for 2SM.
+    """
+    tcgen05_level = get_tcgen05_level_from_global_level(level)
+    pruning_level = get_pruning_level_from_global_level(level)
+    math_instructions_1sm = []
+    math_instructions_2sm = []
+    shapes_1sm = [
+        shape for shape, min_level in SM100_MMA_SHAPES_MXF8F6F4_DENSE_1SM.items() if tcgen05_level >= min_level
+    ]
+    shapes_2sm = [
+        shape for shape, min_level in SM100_MMA_SHAPES_MXF8F6F4_DENSE_2SM.items() if tcgen05_level >= min_level
+    ]
+    for shape in shapes_1sm:
+        if enable_runtime_dtype:
+            runtime_types = [ DataType.f8, DataType.f6, DataType.f4 ]
+            for a_type, b_type in product(runtime_types, repeat=2):
+                if pruning_level < 2 and ((a_type == DataType.f8 or b_type == DataType.f8)):
+                    continue
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue8m0)
+                )
+        if enable_compile_time_dtype:
+            compile_time_types = [ DataType.e4m3,
+                                   DataType.e5m2,
+                                   DataType.e3m2,
+                                   DataType.e2m3,
+                                   DataType.e2m1 ]
+            for a_type, b_type in product(compile_time_types, repeat=2):
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue8m0)
+                )
+    for shape in shapes_2sm:
+        if enable_runtime_dtype:
+            runtime_types = [ DataType.f8, DataType.f6, DataType.f4 ]
+            for a_type, b_type in product(runtime_types, repeat=2):
+                if pruning_level < 2 and ((a_type == DataType.f8 or b_type == DataType.f8)):
+                    continue
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue8m0)
+                )
+        if enable_compile_time_dtype:
+            compile_time_types = [ DataType.e4m3,
+                                   DataType.e5m2,
+                                   DataType.e3m2,
+                                   DataType.e2m3,
+                                   DataType.e2m1 ]
+            for a_type, b_type in product(compile_time_types, repeat=2):
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue8m0)
+                )
+    return math_instructions_1sm, math_instructions_2sm
+def generate_mxf4nvf4_math_instructions_sm100(level: int, enable_runtime_dtype = True, enable_compile_time_dtype = True):
+    """
+    Generate all BlockScaledTensorOp math instructions for MXFP4 and MXFP4 MMA that are supported by SM100 at or above the given level.
+    Args:
+        level: The global level to generate math instructions for.
+        enable_runtime_dtype: Whether to generate runtime dtype math instructions.
+        enable_compile_time_dtype: Whether to generate compile time dtype math instructions.
+    Returns:
+        A tuple of two lists of MathInstruction objects.
+        The first list contains the math instructions for 1SM, and the second list contains the math instructions for 2SM.
+    """
+    tcgen05_level = get_tcgen05_level_from_global_level(level)
+    math_instructions_1sm = []
+    math_instructions_2sm = []
+    shapes_1sm = [
+        shape for shape, min_level in SM100_MMA_SHAPES_MXF4NVF4_DENSE_1SM.items() if tcgen05_level >= min_level
+    ]
+    shapes_2sm = [
+        shape for shape, min_level in SM100_MMA_SHAPES_MXF4NVF4_DENSE_2SM.items() if tcgen05_level >= min_level
+    ]
+    for shape in shapes_1sm:
+        if enable_runtime_dtype:
+            runtime_types = [ DataType.f4 ]
+            for a_type, b_type in product(runtime_types, repeat=2):
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue8m0)
+                )
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue4m3)
+                )
+        if enable_compile_time_dtype:
+            compile_time_types = [ DataType.e2m1,
+                                 ]
+            for a_type, b_type in product(compile_time_types, repeat=2):
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue8m0)
+                )
+                math_instructions_1sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue4m3)
+                )
+    for shape in shapes_2sm:
+        if enable_runtime_dtype:
+            runtime_types = [ DataType.f4 ]
+            for a_type, b_type in product(runtime_types, repeat=2):
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue8m0)
+                )
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue4m3)
+                )
+        if enable_compile_time_dtype:
+            compile_time_types = [ DataType.e2m1,
+                                 ]
+            for a_type, b_type in product(compile_time_types, repeat=2):
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue8m0)
+                )
+                math_instructions_2sm.append(
+                  MathInstruction(
+                      shape,
+                      a_type, b_type, DataType.f32,
+                      OpcodeClass.BlockScaledTensorOp,
+                      MathOperation.multiply_add,
+                      DataType.ue4m3)
+                )
+    return math_instructions_1sm, math_instructions_2sm
+def generate_cluster_shapes_sm100(level: int, change_priority_func : Union[Callable, None] = None):
+    """
+    Generate all cluster shapes for SM100 at or above the given level.
+    Args:
+        level: The global level to generate cluster shapes for.
+    Returns:
+        A tuple of two lists of cluster shapes.
+        The first list contains the cluster shapes for 1SM, and the second list contains the cluster shapes for 2SM.
+    """
+    cluster_level = get_cluster_level_from_global_level(level)
+    assert cluster_level >= 4
+    if change_priority_func is not None:
+        SM100_CLUSTER_SHAPES_1SM_CPY = copy.deepcopy(SM100_CLUSTER_SHAPES_1SM)
+        SM100_CLUSTER_SHAPES_2SM_CPY = copy.deepcopy(SM100_CLUSTER_SHAPES_2SM)
+        change_priority_func(SM100_CLUSTER_SHAPES_1SM_CPY, SM100_CLUSTER_SHAPES_2SM_CPY)
+        shapes_1sm = [
+            list(shape) for shape, min_level in SM100_CLUSTER_SHAPES_1SM_CPY.items() if cluster_level >= min_level
+        ]
+        shapes_2sm = [
+            list(shape) for shape, min_level in SM100_CLUSTER_SHAPES_2SM_CPY.items() if cluster_level >= min_level
+        ]
+        return shapes_1sm, shapes_2sm
+    else:
+        shapes_1sm = [
+            list(shape) for shape, min_level in SM100_CLUSTER_SHAPES_1SM.items() if cluster_level >= min_level
+        ]
+        shapes_2sm = [
+            list(shape) for shape, min_level in SM100_CLUSTER_SHAPES_2SM.items() if cluster_level >= min_level
+        ]
+        return shapes_1sm, shapes_2sm

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/sm90_shapes.py ADDED Viewed

	@@ -0,0 +1,212 @@

+#################################################################################################
+#
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Valid WGMMA shapes, MMA multipliers, and cluster sizes for SM90, associated with levels.
+These shape and level pairs are defined as dicts, where keys are shapes and values are their
+associated levels. If the user input level for that category (MMA multiplier, WGMMA shape, cluster
+size) is smaller than a shape's associated level, it will be excluded, and otherwise, included.
+Higher levels are therefore less likely to be emitted, while lower levels are emitted more frequently.
+Level 0 is always emitted. The default behavior in `generator.py` is that level 1 is only emitted
+when the `--kernel` argument is non-empty.
+"""
+# NOTE: more combinations are possible here.
+# Levels [0, 3] exist in order to control exactly what configs are generated in different dtypes.
+# The rest are only used in the exhaustive mode (when the corresponding level digit is 9).
+# MMA multipliers are multiplied by MMA instruction shapes (WGMMA shapes) to get CTA shapes.
+SM90_MMA_MULTIPLIERS = {
+    (2, 1, 4): 0,
+    (1, 1, 4): 1,
+    (4, 1, 4): 2,
+    (2, 2, 4): 3,
+    (2, 1, 8): 4,
+    (4, 1, 8): 4,
+    (1, 1, 8): 4,
+    (2, 2, 8): 4,
+    (2, 1, 16): 5,
+    (4, 1, 16): 5,
+    (1, 1, 16): 5,
+    (2, 2, 16): 5,
+}
+# Level 0: only (1, 2, 1) -- fp8 dense gemms in pruned case
+# Level 1: clusters with 2 CTAs -- all but fp8 (s8, u8, f16, b16, f32, tf32) dense gemms in pruned case
+# Level 2: clusters with 1 or 2 CTAs
+# Level 3: clusters with 1, 2, or 4 CTAs
+# Level 4: clusters with 1, 2, 4, or 8 CTAs
+# Level 5: clusters with 1, 2, 4, 8, or 16 CTAs
+SM90_CLUSTER_SIZES = {
+    (1, 2, 1): 0,
+    (2, 1, 1): 1,
+    (1, 1, 1): 2,
+    (2, 2, 1): 3,
+    (1, 4, 1): 3,
+    (4, 1, 1): 3,
+    (2, 4, 1): 4,
+    (4, 2, 1): 4,
+    (1, 8, 1): 4,
+    (8, 1, 1): 4,
+    (4, 4, 1): 5,
+}
+# WGMMA shapes
+# Level 0: "default" shape only,
+# Level 1: additional shapes for the unpruned case (tf32 only)
+# Level 2: shapes that are all powers of 2
+# Level 3: all other shapes
+SM90_WGMMA_SHAPES_FP16_BF16_DENSE = {
+    (64, 8, 16): 2,
+    (64, 16, 16): 2,
+    (64, 24, 16): 3,
+    (64, 32, 16): 2,
+    (64, 40, 16): 3,
+    (64, 48, 16): 3,
+    (64, 56, 16): 3,
+    (64, 64, 16): 2,
+    (64, 72, 16): 3,
+    (64, 80, 16): 3,
+    (64, 88, 16): 3,
+    (64, 96, 16): 3,
+    (64, 104, 16): 3,
+    (64, 112, 16): 3,
+    (64, 120, 16): 3,
+    (64, 128, 16): 0,
+    (64, 136, 16): 3,
+    (64, 144, 16): 3,
+    (64, 152, 16): 3,
+    (64, 160, 16): 3,
+    (64, 168, 16): 3,
+    (64, 176, 16): 3,
+    (64, 184, 16): 3,
+    (64, 192, 16): 3,
+    (64, 200, 16): 3,
+    (64, 208, 16): 3,
+    (64, 216, 16): 3,
+    (64, 224, 16): 3,
+    (64, 232, 16): 3,
+    (64, 240, 16): 3,
+    (64, 248, 16): 3,
+    (64, 256, 16): 1,
+}
+SM90_WGMMA_SHAPES_TF32_DENSE = {
+    (64, 8, 8): 2,
+    (64, 16, 8): 2,
+    (64, 24, 8): 3,
+    (64, 32, 8): 2,
+    (64, 40, 8): 3,
+    (64, 48, 8): 3,
+    (64, 56, 8): 3,
+    (64, 64, 8): 2,
+    (64, 72, 8): 3,
+    (64, 80, 8): 3,
+    (64, 88, 8): 3,
+    (64, 96, 8): 3,
+    (64, 104, 8): 3,
+    (64, 112, 8): 3,
+    (64, 120, 8): 3,
+    (64, 128, 8): 0,
+    (64, 136, 8): 3,
+    (64, 144, 8): 3,
+    (64, 152, 8): 3,
+    (64, 160, 8): 3,
+    (64, 168, 8): 3,
+    (64, 176, 8): 3,
+    (64, 184, 8): 3,
+    (64, 192, 8): 3,
+    (64, 200, 8): 3,
+    (64, 208, 8): 3,
+    (64, 216, 8): 3,
+    (64, 224, 8): 3,
+    (64, 232, 8): 3,
+    (64, 240, 8): 3,
+    (64, 248, 8): 3,
+    (64, 256, 8): 1,
+}
+SM90_WGMMA_SHAPES_FP8_DENSE = {
+    (64, 8, 32): 2,
+    (64, 16, 32): 2,
+    (64, 24, 32): 3,
+    (64, 32, 32): 2,
+    (64, 40, 32): 3,
+    (64, 48, 32): 3,
+    (64, 56, 32): 3,
+    (64, 64, 32): 2,
+    (64, 72, 32): 3,
+    (64, 80, 32): 3,
+    (64, 88, 32): 3,
+    (64, 96, 32): 3,
+    (64, 104, 32): 3,
+    (64, 112, 32): 3,
+    (64, 120, 32): 3,
+    (64, 128, 32): 0,
+    (64, 136, 32): 3,
+    (64, 144, 32): 3,
+    (64, 152, 32): 3,
+    (64, 160, 32): 3,
+    (64, 168, 32): 3,
+    (64, 176, 32): 3,
+    (64, 184, 32): 3,
+    (64, 192, 32): 3,
+    (64, 200, 32): 3,
+    (64, 208, 32): 3,
+    (64, 216, 32): 3,
+    (64, 224, 32): 3,
+    (64, 232, 32): 3,
+    (64, 240, 32): 3,
+    (64, 248, 32): 3,
+    (64, 256, 32): 1,
+}
+SM90_WGMMA_SHAPES_INT8_DENSE = {
+    (64, 8, 32): 2,
+    (64, 16, 32): 2,
+    (64, 24, 32): 3,
+    (64, 32, 32): 2,
+    (64, 48, 32): 3,
+    (64, 64, 32): 2,
+    (64, 80, 32): 3,
+    (64, 96, 32): 3,
+    (64, 112, 32): 3,
+    (64, 128, 32): 0,
+    (64, 144, 32): 3,
+    (64, 160, 32): 3,
+    (64, 176, 32): 3,
+    (64, 192, 32): 3,
+    (64, 208, 32): 3,
+    (64, 224, 32): 3,
+    (64, 240, 32): 3,
+    (64, 256, 32): 1,
+}

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/sm90_utils.py ADDED Viewed

	@@ -0,0 +1,753 @@

+#################################################################################################
+#
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for enumerating CUTLASS library SM90 kernels
+"""
+import argparse
+import enum
+from itertools import product
+import math
+import logging
+import os.path
+import shutil
+import sys
+import copy
+from typing import Any, Optional, Sequence, Tuple, List
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+# NOTE: this is a duplicate of CudaToolkitVersionSatisfies in generator.py
+def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
+  # by default, use the latest CUDA Toolkit version
+  cuda_version = [11, 0, 132]
+  # Update cuda_version based on parsed string
+  if semantic_ver_string != '':
+    for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')[:3]]):
+      if i < len(cuda_version):
+        cuda_version[i] = x
+      else:
+        cuda_version.append(x)
+  return cuda_version >= [major, minor, patch]
+#### Step 0: define levels
+# One integer level controls multiple "generators" and how many
+# combinations they generate. That is the "global" level.
+# "Generators" are WGMMA shapes, MMA multipliers, cluster sizes, and
+# anything that is eventually involved in the Cartesian product
+# which yields our kernel configurations.
+# For simplicity, each generator defines their own levels,
+# starting from 0. As a rule we assume 10 or fewer levels, making
+# their level a digit.
+# The "global" level simply stacks these digits and represents them
+# as a single integer.
+#
+# For example, level 500 indicates cluster sizes are at level 5, MMA
+# multipliers are at level 0, and WGMMA shapes are at level 0 as well.
+#
+# Here we define the global level to generator level mappings.
+def get_wgmma_level_from_global_level(global_level: int):
+    return global_level % 10
+def get_mma_level_from_global_level(global_level: int):
+    return (global_level // 10) % 10
+def get_cluster_level_from_global_level(global_level: int):
+    return (global_level // 100) % 10
+def get_pruning_level_from_global_level(global_level: int):
+    return (global_level // 1000) % 10
+#### Step 1: generate MMA instruction shapes based on levels
+try:
+    from .sm90_shapes import (
+        SM90_MMA_MULTIPLIERS,
+        SM90_CLUSTER_SIZES,
+        SM90_WGMMA_SHAPES_TF32_DENSE,
+        SM90_WGMMA_SHAPES_FP16_BF16_DENSE,
+        SM90_WGMMA_SHAPES_FP8_DENSE,
+        SM90_WGMMA_SHAPES_INT8_DENSE,
+    )
+except:
+    from sm90_shapes import (
+        SM90_MMA_MULTIPLIERS,
+        SM90_CLUSTER_SIZES,
+        SM90_WGMMA_SHAPES_TF32_DENSE,
+        SM90_WGMMA_SHAPES_FP16_BF16_DENSE,
+        SM90_WGMMA_SHAPES_FP8_DENSE,
+        SM90_WGMMA_SHAPES_INT8_DENSE,
+    )
+def generate_tf32_math_instruction_shapes_sm90(level: int):
+    assert isinstance(level, int) and level >= 0
+    filtered_list_of_wgmma_shapes = [
+        wgmma_shape for wgmma_shape, min_level in SM90_WGMMA_SHAPES_TF32_DENSE.items() if level >= min_level
+    ]
+    return filtered_list_of_wgmma_shapes
+def generate_fp16_bf16_math_instruction_shapes_sm90(level: int):
+    assert isinstance(level, int) and level >= 0
+    filtered_list_of_wgmma_shapes = [
+        wgmma_shape for wgmma_shape, min_level in SM90_WGMMA_SHAPES_FP16_BF16_DENSE.items() if level >= min_level
+    ]
+    return filtered_list_of_wgmma_shapes
+def generate_fp8_math_instruction_shapes_sm90(level: int):
+    assert isinstance(level, int) and level >= 0
+    filtered_list_of_wgmma_shapes = [
+        wgmma_shape for wgmma_shape, min_level in SM90_WGMMA_SHAPES_FP8_DENSE.items() if level >= min_level
+    ]
+    return filtered_list_of_wgmma_shapes
+def generate_int8_math_instruction_shapes_sm90(level: int):
+    assert isinstance(level, int) and level >= 0
+    filtered_list_of_wgmma_shapes = [
+        wgmma_shape for wgmma_shape, min_level in SM90_WGMMA_SHAPES_INT8_DENSE.items() if level >= min_level
+    ]
+    return filtered_list_of_wgmma_shapes
+def generate_mixed_dtype_math_instructions_shapes_sm90(wgmma_level: int, a_type: DataType, b_type: DataType):
+    # DataTypeSize are in the unit of bits
+    a_bytes = DataTypeSize[a_type] // 8
+    b_bytes = DataTypeSize[b_type] // 8
+    if a_bytes == 4 or b_bytes == 4:
+        return generate_tf32_math_instruction_shapes_sm90(wgmma_level)
+    elif a_bytes == 2 or b_bytes == 2:
+        return generate_fp16_bf16_math_instruction_shapes_sm90(wgmma_level)
+    else:
+        return generate_fp8_math_instruction_shapes_sm90(wgmma_level)
+###########
+def generate_tf32_math_instructions_sm90(level: int):
+    wgmma_level = get_wgmma_level_from_global_level(level)
+    math_instructions = []
+    for math_instruction_shape in generate_tf32_math_instruction_shapes_sm90(wgmma_level):
+        math_instructions.append(
+          MathInstruction(
+              math_instruction_shape,
+              DataType.tf32, DataType.tf32, DataType.f32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add)
+        )
+    return math_instructions
+def generate_fp16_bf16_math_instructions_sm90(level: int):
+    wgmma_level = get_wgmma_level_from_global_level(level)
+    math_instructions = []
+    for math_instruction_shape in generate_fp16_bf16_math_instruction_shapes_sm90(wgmma_level):
+        math_instructions += [
+          MathInstruction(
+              math_instruction_shape,
+              DataType.f16, DataType.f16, DataType.f16,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+          MathInstruction(
+              math_instruction_shape,
+              DataType.f16, DataType.f16, DataType.f32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+          MathInstruction(
+              math_instruction_shape,
+              DataType.bf16, DataType.bf16, DataType.f32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+        ]
+    return math_instructions
+def generate_fp8_math_instructions_sm90(level: int):
+    wgmma_level = get_wgmma_level_from_global_level(level)
+    math_instructions = []
+    for math_instruction_shape in generate_fp8_math_instruction_shapes_sm90(wgmma_level):
+        math_instructions += [
+          MathInstruction(
+              math_instruction_shape,
+              DataType.e4m3, DataType.e4m3, DataType.f32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+          MathInstruction(
+              math_instruction_shape,
+              DataType.e4m3, DataType.e5m2, DataType.f32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+          MathInstruction(
+              math_instruction_shape,
+              DataType.e5m2, DataType.e4m3, DataType.f32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+          MathInstruction(
+              math_instruction_shape,
+              DataType.e5m2, DataType.e5m2, DataType.f32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+        ]
+    return math_instructions
+def generate_mixed_dtype_math_instructions_sm90(level: int, types_of_a_b_acc: List[Tuple[DataType, DataType, DataType]]):
+    wgmma_level = get_wgmma_level_from_global_level(level)
+    math_instructions = []
+    for a_type, b_type, acc_type in types_of_a_b_acc:
+        math_instruction_shapes = generate_mixed_dtype_math_instructions_shapes_sm90(wgmma_level, a_type, b_type)
+        for math_instruction_shape in math_instruction_shapes:
+            math_instructions += [
+                MathInstruction(
+                    math_instruction_shape,
+                    a_type, b_type, acc_type,
+                    OpcodeClass.TensorOp,
+                    MathOperation.multiply_add
+                ),
+            ]
+    return math_instructions
+def generate_int8_math_instructions_sm90(level: int):
+    wgmma_level = get_wgmma_level_from_global_level(level)
+    math_instructions = []
+    for math_instruction_shape in generate_int8_math_instruction_shapes_sm90(wgmma_level):
+        math_instructions += [
+          MathInstruction(
+              math_instruction_shape,
+              DataType.s8, DataType.s8, DataType.s32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+          MathInstruction(
+              math_instruction_shape,
+              DataType.u8, DataType.u8, DataType.s32,
+              OpcodeClass.TensorOp,
+              MathOperation.multiply_add),
+        ]
+    return math_instructions
+def make_sparse_math_instructions(math_instructions):
+    sparse_instructions = []
+    for inst in math_instructions:
+        if inst.opcode_class == OpcodeClass.TensorOp:
+            sparse_instructions.append(MathInstruction(
+                (inst.instruction_shape[0], inst.instruction_shape[1], inst.instruction_shape[2] * 2),
+                inst.element_a, inst.element_b, inst.element_accumulator,
+                OpcodeClass.SparseTensorOp,
+                inst.math_operation),)
+    return sparse_instructions
+#### Step 2: generate tile descriptions from math instruction shapes
+def is_tile_desc_valid(tile_description):
+    if tile_description.minimum_compute_capability != 90 or tile_description.maximum_compute_capability != 90:
+        return False
+    element_a, element_b, element_accum = (
+        tile_description.math_instruction.element_a,
+        tile_description.math_instruction.element_b,
+        tile_description.math_instruction.element_accumulator
+    )
+    cluster_size, cta_shape = (
+        tile_description.cluster_shape,
+        tile_description.threadblock_shape,
+    )
+    grid_size = (
+        cta_shape[0] * cluster_size[0] +
+        cta_shape[1] * cluster_size[1] +
+        cta_shape[2] * cluster_size[2]
+    )
+    num_ctas_in_cluster = cluster_size[0] * cluster_size[1] * cluster_size[2]
+    cluster_shape = (
+        cluster_size[0] * cta_shape[0],
+        cluster_size[1] * cta_shape[1],
+        cluster_size[2] * cta_shape[2]
+    )
+    FP32_TYPES = [DataType.f32, DataType.tf32]
+    FP16_TYPES = [DataType.f16, DataType.bf16]
+    is_fp32 = element_a in FP32_TYPES and element_b in FP32_TYPES
+    is_fp16 = element_a in FP16_TYPES and element_b in FP16_TYPES
+    # Maximum number of CTAs per cluster is 8 for Hopper, but up to 16 is
+    # allowed for non portable clusters.
+    if num_ctas_in_cluster > 16 or num_ctas_in_cluster < 1:
+        return False
+    if grid_size < 1:
+        return False
+    # SM90 WGMMA shapes are always 64 across M, therefore
+    # CTA shape across M must always be a multiple of 64.
+    if cta_shape[0] < 64 or cta_shape[0] % 64 != 0:
+        return False
+    # The minimum WGMMA shape across N is 8, and increments
+    # vary across different dtypes, but they're never smaller
+    # than 8. The minimum CTA shape allowed across N though is 16.
+    if cta_shape[1] < 16 or cta_shape[1] % 8 != 0:
+        return False
+    # SM90 WGMMA shapes across K are always 8 for 32 bit dense
+    # operations, 16 for 16 bit, and 32 for 8 bit. In any case,
+    # the CTA shape across K should be a multiple of 8 and at least
+    # twice the WGMMA shape across K.
+    if cta_shape[2] < 16 or cta_shape[2] % 8 != 0:
+        return False
+    # Minimum of 2 stages (very rough heuristic that may filter out valid kernel configs)
+    if (cluster_shape[0] >= 128 or cluster_shape[1] >= 128) and cluster_shape[2] >= 256:
+        return False
+    if is_fp32 and (cluster_shape[0] >= 128 or cluster_shape[1] >= 128) and cluster_shape[2] >= 128:
+        return False
+    if is_fp32 and cluster_shape[0] >= 256 and cluster_shape[1] >= 256 and cluster_shape[2] >= 64:
+        return False
+    if is_fp16 and cluster_shape[0] >= 256 and cluster_shape[1] >= 256 and cluster_shape[2] >= 128:
+        return False
+    # CTA shape upper bound: <256, 256, 256>
+    if cta_shape[0] > 256 or cta_shape[1] > 256 or cta_shape[2] > 256:
+        return False
+    return True
+def get_mma_multipliers(level: int):
+    assert isinstance(level, int) and level >= 0
+    mma_level = get_mma_level_from_global_level(level)
+    return [
+        mma_mul for mma_mul, mma_min_level in SM90_MMA_MULTIPLIERS.items() if mma_level >= mma_min_level
+    ]
+def get_cluster_sizes(level: int, is_aligned: bool):
+    if not is_aligned:
+        return [(1, 1, 1)]
+    assert isinstance(level, int) and level >= 0
+    cluster_level = get_cluster_level_from_global_level(level)
+    return [
+        cluster_size for cluster_size, cluster_min_level in SM90_CLUSTER_SIZES.items() if cluster_level >= cluster_min_level
+    ]
+def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level: int):
+    tile_descriptions = set()
+    mma_multipliers, cluster_sizes = get_mma_multipliers(level), get_cluster_sizes(level, is_aligned)
+    for math_inst, mma_mul, cluster_size in product(math_instructions, mma_multipliers, cluster_sizes):
+        # generator can stamp out duplicate kernels, because it doesn't explicitly set instruction
+        # shape for SM90 kernels, and the 3.X collective API doesn't directly expose them when using
+        # the auto kernel schedule.
+        math_inst_stub = copy.deepcopy(math_inst)
+        math_inst_stub.instruction_shape = [0, 0, 0]
+        tile_desc = TileDescription(
+            threadblock_shape=[
+                math_inst.instruction_shape[0] * mma_mul[0],
+                math_inst.instruction_shape[1] * mma_mul[1],
+                math_inst.instruction_shape[2] * mma_mul[2]
+            ],
+            stages=0,
+            warp_count=[4, 1, 1],
+            math_instruction=math_inst_stub,
+            min_compute=90,
+            max_compute=90,
+            cluster_shape=cluster_size)
+        # For sparse kernels K-tile is twice as large (due to 2x MMA-K size)
+        # Reduce it to same size as dense to afford more smem stages
+        if math_inst.opcode_class == OpcodeClass.SparseTensorOp:
+            tile_desc.threadblock_shape[2] = tile_desc.threadblock_shape[2] // 2
+        if is_tile_desc_valid(tile_desc):
+            tile_descriptions.add(tile_desc)
+    return tile_descriptions
+#### Step 3: map tile description to valid schedules
+def is_tile_desc_compatible_with_cooperative(tile_description):
+    # Cooperative kernels require a minimum CTA-M of 128
+    return tile_description.threadblock_shape[0] % 128 == 0
+def can_tile_desc_use_shmem_in_epilogue(tile_description, data_types):
+    dtype_a, dtype_b, dtype_c, dtype_d, dtype_acc, dtype_epi = (
+        data_types["a_type"],
+        data_types["b_type"],
+        data_types["c_type"],
+        data_types["d_type"],
+        data_types["acc_type"],
+        data_types["epi_type"]
+    )
+    mn = tile_description.threadblock_shape[0] * tile_description.threadblock_shape[1]
+    bitsize_c, bitsize_d = DataTypeSize[dtype_c], DataTypeSize[dtype_d]
+    shmem_bits_c, shmem_bits_d = bitsize_c * mn, bitsize_d * mn
+    shmem_bits_total = shmem_bits_c + shmem_bits_d
+    # Magic number: 2^20
+    # Existing logic suggested that tile shape 256x128 (or 128x256)
+    # would run out of shmem if D is FP32, and source is needed.
+    # That would be 256 * 128 * 32 == 2^21 (~262 KB), which is over the limit.
+    # Hopper's max shmem size is 228 KB, and 2^20 ~= 131 KB.
+    # Since epilogue can't possibly use ALL of the shmem available
+    # we can just settle on 2^20 bits (~ 131 KB) being the upper bound
+    # we would allow for epilogue.
+    # This can be different for non-persistent kernels where epilogue and
+    # mainloop shmem is shared.
+    if shmem_bits_total > 2 ** 20:
+        return False
+    return True
+def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types, layout,
+                        instantiation_level, enable_fp8_fast_acc=True, gemm_kind=GemmKind.Universal3x):
+    """
+    Select the kernel/epilogue schedule pairs to emit for one SM90 tile description.
+    Returns a pair `(schedules, stream_k_schedules)`. Every entry of either list is a
+    two-element list `[kernel_schedule, epilogue_schedule]`. Both lists are empty when
+    the configuration is rejected by `is_tile_desc_valid` or by one of the filters below.
+    Arguments, as used in this function:
+      tile_description:    its `threadblock_shape` supplies (cta_m, cta_n, cta_k).
+      cuda_version:        forwarded to `CudaToolkitVersionSatisfies` for the 12.0 / 12.1 gates.
+      is_aligned:          when False, TMA epilogues are never selected, and non-blockwise
+                           configurations that reach that point get the CpAsync schedules.
+      data_types:          dict; "a_type", "b_type", "c_type" and "d_type" are read here, and
+                           the dict is also passed to `can_tile_desc_use_shmem_in_epilogue`.
+      layout:              indexable; `layout[0][0]` and `layout[1][0]` are compared against
+                           `LayoutType` (presumably the A and B layouts, going by the
+                           is_nt / is_tn / is_nn names -- confirm against callers).
+      instantiation_level: global level, reduced to a pruning level through
+                           `get_pruning_level_from_global_level`.
+      enable_fp8_fast_acc: when True and both A and B are FP8, fast-accumulation schedules are added.
+      gemm_kind:           classified with `is_grouped` and `is_blockwise`.
+    """
+    # Level 0: prune according to existing generator.py behavior
+    # Level >= 1: no pruning
+    level = get_pruning_level_from_global_level(instantiation_level)
+    schedules = []
+    stream_k_schedules = []
+    if not is_tile_desc_valid(tile_description):
+        return schedules, stream_k_schedules
+    # Classify the A/B input types; each flag requires BOTH operands to be in the family
+    FP16_TYPES = [DataType.f16, DataType.bf16]
+    is_fp16 = data_types["a_type"] in FP16_TYPES and data_types["b_type"] in FP16_TYPES
+    FP8_TYPES = [DataType.e4m3, DataType.e5m2]
+    is_fp8 = data_types["a_type"] in FP8_TYPES and data_types["b_type"] in FP8_TYPES
+    can_do_fp8_fast_accum = is_fp8 and enable_fp8_fast_acc
+    FP32_TYPES = [DataType.f32, DataType.tf32]
+    is_fp32 = data_types["a_type"] in FP32_TYPES and data_types["b_type"] in FP32_TYPES
+    # FP32/TF32 inputs with layout[0] and layout[1] both row-major use the transposed epilogue
+    requires_transposed_epilogue = is_fp32 and layout[0][0] == LayoutType.RowMajor and layout[1][0] == LayoutType.RowMajor
+    can_do_cooperative = is_tile_desc_compatible_with_cooperative(tile_description)
+    can_do_tma_epilogue = is_aligned and not requires_transposed_epilogue and can_tile_desc_use_shmem_in_epilogue(tile_description, data_types)
+    default_epilogue = EpilogueScheduleType.NoSmemWarpSpecialized if not requires_transposed_epilogue else EpilogueScheduleType.EpilogueTransposed
+    auto_epilogue = EpilogueScheduleType.ScheduleAuto if not requires_transposed_epilogue else EpilogueScheduleType.EpilogueTransposed
+    cta_m, cta_n, cta_k = (
+        tile_description.threadblock_shape[0],
+        tile_description.threadblock_shape[1],
+        tile_description.threadblock_shape[2]
+    )
+    c_type = data_types["c_type"]
+    d_type = data_types["d_type"]
+    is_void_c = c_type == DataType.void
+    # Filter out invalid kernels
+    is_nt = layout[0][0] == LayoutType.ColumnMajor and layout[1][0] == LayoutType.RowMajor
+    is_tn = layout[0][0] == LayoutType.RowMajor and layout[1][0] == LayoutType.ColumnMajor
+    is_nn = layout[0][0] == LayoutType.ColumnMajor and layout[1][0] == LayoutType.ColumnMajor
+    # static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
+    #   "Copy size must evenly divide SMEM tile.");
+    if is_fp32 and is_nt and (cta_n % cta_k != 0):
+        return [], []
+    # static_assert(!TransposeB || (cutlass::bits_to_bytes((size<1>(SmemLayoutB{}) * sizeof_bits<InternalElementB>::value))) == 128,
+    # "SmemLayoutB K must be 128bytes to be transposed.")
+    if is_fp32 and is_nt and cta_k != 32:
+        return [], []
+    # Static assert failure when instantiating SmemLayoutB
+    if is_fp32 and (is_tn or is_nn) and (cta_n % cta_k != 0):
+        return [], []
+    grouped = is_grouped(gemm_kind)
+    if grouped:
+        # the following cases are unsupported by grouped GEMM
+        if not is_aligned:
+            return [], []
+        if requires_transposed_epilogue:
+            return [], []
+    # Early pruning
+    if level < 1:
+        # Don't stamp out FP16/BF16 kernels smaller than or equal to 64x128x64
+        if is_fp16 and cta_m <= 64 and cta_n <= 128 and cta_k <= 64:
+            return [], []
+        # FP8 configs with CTA tile larger than or equal to 256x128x128 limit data types and schedules
+        is_large_fp8_tile = is_fp8 and cta_m >= 256 and cta_n >= 128 and cta_k >= 128
+        if is_large_fp8_tile:
+            # Only void-C, and only FP8 outputs allowed
+            if not is_void_c or d_type not in FP8_TYPES:
+                return [], []
+            # Large FP8 tiles only get cooperative kernels with the TMA cooperative epilogue,
+            # and no stream-K variants; anything else yields nothing.
+            if CudaToolkitVersionSatisfies(cuda_version, 12, 1) and can_do_cooperative and can_do_tma_epilogue:
+                schedules = []
+                if is_blockwise(gemm_kind):
+                    schedules.append(
+                        [
+                            to_grouped_schedule(KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative, grouped),
+                            to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecializedCooperative, grouped)
+                        ])
+                else:
+                    schedules.append(
+                        [
+                            to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedCooperative, grouped),
+                            to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecializedCooperative, grouped)
+                        ])
+                    # NOTE(review): the fast-accum kernel is added here without consulting
+                    # can_do_fp8_fast_accum / enable_fp8_fast_acc -- confirm this is intended.
+                    schedules.append(
+                        [
+                            to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, grouped),
+                            to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecializedCooperative, grouped)
+                        ])
+                return schedules, []
+            return [], []
+        if is_fp8 and not is_large_fp8_tile:
+            valid_dtypes_for_c = [DataType.f32, DataType.bf16, DataType.f16, DataType.void]
+            # Prune all configs with fp8 source, and all configs with non-fp8 output
+            # that have different dtypes for source and output.
+            if c_type not in valid_dtypes_for_c or (d_type not in FP8_TYPES and c_type != d_type):
+                return [], []
+        # FP32/TF32 kernels don't stamp out void-C
+        if is_fp32 and is_void_c:
+            return [], []
+    # Void-c only makes a difference for TMA epilogues
+    if is_void_c and not can_do_tma_epilogue:
+        return [], []
+    # For mixed input data types
+    a_type_size = DataTypeSize[data_types["a_type"]]
+    b_type_size = DataTypeSize[data_types["b_type"]]
+    if a_type_size != b_type_size and CudaToolkitVersionSatisfies(cuda_version, 12, 1):
+        schedules = []
+        stream_k_schedules = []
+        # When A is wider than B the transposed epilogue is used for every schedule below
+        epilogue_schedule = EpilogueScheduleType.TmaWarpSpecialized
+        if a_type_size > b_type_size:
+            epilogue_schedule = EpilogueScheduleType.EpilogueTransposed
+        if not is_blockwise(gemm_kind):
+            schedules.append([
+                KernelScheduleType.TmaWarpSpecialized,
+                epilogue_schedule
+            ])
+            schedules.append([
+                KernelScheduleType.TmaWarpSpecializedPingpong,
+                epilogue_schedule
+            ])
+        # NOTE(review): cooperative is gated on cta_m >= 128 here, whereas the rest of this
+        # function uses can_do_cooperative (cta_m % 128 == 0) -- confirm the difference is intended.
+        if cta_m >= 128:
+            if a_type_size > b_type_size:
+                epilogue_schedule = EpilogueScheduleType.EpilogueTransposed
+            else:
+                epilogue_schedule = EpilogueScheduleType.TmaWarpSpecializedCooperative
+            if is_blockwise(gemm_kind):
+                schedules.append([
+                    KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative,
+                    epilogue_schedule
+                ])
+            else:
+                schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperative,
+                    epilogue_schedule
+                ])
+                stream_k_schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperative,
+                    epilogue_schedule
+                ])
+        return schedules, stream_k_schedules
+    # Unaligned, non-blockwise configurations: CpAsync kernels with the default epilogue only
+    if not is_aligned and not is_blockwise(gemm_kind):
+        schedules = [[KernelScheduleType.CpAsyncWarpSpecialized,
+                    default_epilogue]]
+        stream_k_schedules = []
+        if CudaToolkitVersionSatisfies(cuda_version, 12, 1) and can_do_cooperative:
+            schedules.append([
+                KernelScheduleType.CpAsyncWarpSpecializedCooperative,
+                default_epilogue
+            ])
+            stream_k_schedules.append([
+                KernelScheduleType.CpAsyncWarpSpecializedCooperative,
+                default_epilogue
+            ])
+        return schedules, stream_k_schedules
+    # General path: accumulate every schedule pair that the remaining gates allow
+    schedules = []
+    # Pruning: emit Void-C and Grouped kernels with persistent kernels only
+    if (level >= 1 or not is_void_c) and not grouped and not is_blockwise(gemm_kind):
+        # Pruning: don't stamp out fp8 kernels with auto schedule
+        if not is_fp8:
+            schedules.append([KernelScheduleType.ScheduleAuto, auto_epilogue])
+        schedules.append([KernelScheduleType.TmaWarpSpecialized, default_epilogue])
+    stream_k_schedules = []
+    if CudaToolkitVersionSatisfies(cuda_version, 12, 0):
+        if can_do_tma_epilogue:
+            assert not requires_transposed_epilogue
+            # Inconsistency: fp8 pingpong only gets stamped out with fast accum
+            if (not is_fp8 or level >= 1) and not is_blockwise(gemm_kind):
+                schedules.append([
+                    to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedPingpong, grouped),
+                    to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized, grouped)
+                ])
+            if can_do_fp8_fast_accum:
+                schedules.append([
+                    to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, grouped),
+                    to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecialized, grouped)
+                ])
+    if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
+        # Pruning: don't stamp out fp8 ping-pong kernel with non-tma epilogue
+        if not is_fp8 or level >= 1:
+            if not is_blockwise(gemm_kind):
+                schedules.append([to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedPingpong, grouped), to_grouped_schedule(default_epilogue, grouped)])
+            else:
+                schedules.append([to_grouped_schedule(KernelScheduleType.BlockwiseTmaWarpSpecializedPingpong, grouped), to_grouped_schedule(default_epilogue, grouped)])
+        if can_do_fp8_fast_accum:
+            if not grouped:
+                schedules.append([KernelScheduleType.TmaWarpSpecializedFP8FastAccum, default_epilogue])
+            schedules.append([to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, grouped), to_grouped_schedule(default_epilogue, grouped)])
+        # Cooperative kernels with the default epilogue; each one is also recorded
+        # (without the grouped conversion) as a stream-K candidate
+        if can_do_cooperative:
+            if is_blockwise(gemm_kind):
+                schedules.append([
+                    to_grouped_schedule(KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative, grouped),
+                    to_grouped_schedule(default_epilogue, grouped)
+                ])
+                stream_k_schedules.append([
+                    KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative,
+                    default_epilogue
+                ])
+            else:
+                schedules.append([
+                    to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedCooperative, grouped),
+                    to_grouped_schedule(default_epilogue, grouped)
+                ])
+                stream_k_schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperative,
+                    default_epilogue
+                ])
+            if can_do_fp8_fast_accum:
+                schedules.append([
+                    to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, grouped),
+                    to_grouped_schedule(default_epilogue, grouped)
+                ])
+                stream_k_schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
+                    default_epilogue
+                ])
+        # persistent kernels with TMA epilogues
+        if can_do_tma_epilogue:
+            assert not requires_transposed_epilogue
+            if can_do_cooperative:
+                if is_blockwise(gemm_kind):
+                    schedules.append([
+                        to_grouped_schedule(KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative, grouped),
+                        to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecializedCooperative, grouped)
+                    ])
+                    stream_k_schedules.append([
+                        KernelScheduleType.BlockwiseTmaWarpSpecializedCooperative,
+                        EpilogueScheduleType.TmaWarpSpecializedCooperative
+                    ])
+                else:
+                    schedules.append([
+                        to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedCooperative, grouped),
+                        to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecializedCooperative, grouped)
+                    ])
+                    stream_k_schedules.append([
+                        KernelScheduleType.TmaWarpSpecializedCooperative,
+                        EpilogueScheduleType.TmaWarpSpecializedCooperative
+                    ])
+                if can_do_fp8_fast_accum:
+                    schedules.append([
+                        to_grouped_schedule(KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum, grouped),
+                        to_grouped_schedule(EpilogueScheduleType.TmaWarpSpecializedCooperative, grouped)
+                    ])
+                    stream_k_schedules.append([
+                        KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
+                        EpilogueScheduleType.TmaWarpSpecializedCooperative
+                    ])
+    # Grouped GEMM do not support Stream-K scheduler
+    if grouped:
+        return schedules, []
+    return schedules, stream_k_schedules
+#### Misc: helpers
+def generate_data_types_from_math_instruction(math_instruction, element_source = None, element_dest = None, element_epilogue = None):
+    element_a, element_b = math_instruction.element_a, math_instruction.element_b
+    element_accumulator = math_instruction.element_accumulator
+    element_c = element_source or element_accumulator
+    element_d = element_dest or element_accumulator
+    element_epilogue = element_epilogue or element_accumulator
+    data_types = {
+        "a_type"   : element_a,
+        "b_type"   : element_b,
+        "c_type"   : element_c,
+        "d_type"   : element_d,
+        "acc_type" : element_accumulator,
+        "epi_type" : element_epilogue
+    }
+    return data_types
+def fix_alignments(data_types, layout, alignment_bits = 128):
+    operand_keys = ["a_type", "b_type", "c_type"]
+    operands_to_fix = ["c_type"]
+    new_layout = []
+    assert len(layout) == len(operand_keys)
+    for i, k in enumerate(operand_keys):
+        assert k in data_types and data_types[k] in DataTypeSize
+        dtype = data_types[k]
+        dtype_size_bits = DataTypeSize[dtype]
+        layout_type = layout[i][0]
+        layout_alignment = layout[i][1]
+        # Don't modify alignment if dtype's been changed to void
+        if k in operands_to_fix and dtype_size_bits >= 1:
+            layout_alignment = alignment_bits // dtype_size_bits
+        new_layout.append([layout_type, layout_alignment])
+    return new_layout

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/symm_operation.py ADDED Viewed

	@@ -0,0 +1,440 @@

+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for emitting Symm kernels
+"""
+import enum
+import functools
+import operator
+import os.path
+import shutil
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+###################################################################################################
+#
+# Data structure modeling a Symm update operation
+#
+###################################################################################################
+#
+class SymmOperation:
+  #
+  def __init__(self, symm_kind, arch, tile_description, A, B, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+      blas_mode = BlasMode.symmetric):
+    self.blas_mode = blas_mode
+    self.operation_kind = OperationKind.Symm
+    self.arch = arch
+    self.tile_description = tile_description
+    self.symm_kind = symm_kind
+    # tensor A and B have same data type and layout
+    self.A = A
+    self.B = B
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.swizzling_functor = swizzling_functor
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex,
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+    return False
+  #
+  def is_mixed_input(self):
+    return self.A.element != self.B.element
+  #
+  def is_planar_complex(self):
+    return False
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+    if self.is_complex():
+      return get_complex_from_real(accum)
+    return accum
+  #
+  def short_math_name(self):
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    inst_shape = ''
+    inst_operation = ''
+    intermediate_type = ''
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+      MathOperation.and_popc: 'and'
+    }
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \
+      self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
+      math_op = self.tile_description.math_instruction.math_operation
+      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      inst_shape += math_op_string
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+    operation_name = 'symm' if self.blas_mode == BlasMode.symmetric else 'hemm'
+    return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, operation_name)
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${element_c}_${core_name}_${element_a}"
+      elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${core_name}_${element_a}"
+      else:
+        extended_name = "${core_name}"
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+    return extended_name
+  #
+  def layout_name(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)]
+      )
+    return "%s" % (ShortLayoutTypeNames[self.A.layout])
+  #
+  def side_mode_name(self):
+    return "%s" % (ShortSideModeNames[self.A.side_mode])
+  #
+  def fill_mode_name(self):
+    return "%s" % (ShortFillModeNames[self.A.fill_mode])
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    threadblock = self.tile_description.procedural_name()
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+    alignment = self.C.alignment
+    return SubstituteTemplate(
+      "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${side_mode}_${fill_mode}_align${alignment}",
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'side_mode': self.side_mode_name(),
+        'fill_mode': self.fill_mode_name(),
+        'alignment': "%d" % alignment,
+      }
+    )
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.procedural_name()
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+#
+class EmitSymmUniversalInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self):
+    self.symm_template = """
+// Symm operator ${operation_name}
+using Operation_${operation_name} =
+  typename cutlass::gemm::device::Symm<
+    ${element_a}, ${layout_a}, ${side_mode}, ${fill_mode},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation}
+>;
+"""
+    self.symm_complex_template = """
+// Symm operator ${operation_name}
+using Operation_${operation_name} =
+  typename cutlass::gemm::device::Symm<
+    ${element_a}, ${layout_a}, ${side_mode}, ${fill_mode},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation},
+    ${blas_mode}
+>;
+"""
+  def emit(self, operation):
+    threadblock_shape = operation.tile_description.threadblock_shape
+    warp_count = operation.tile_description.warp_count
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'side_mode': SideModeTag[operation.A.side_mode],
+      'fill_mode': FillModeTag[operation.A.fill_mode],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'split_k_serial': 'false',
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'blas_mode': BlasModeTag[operation.blas_mode]
+    }
+    symm_template = self.symm_complex_template if operation.is_complex() else self.symm_template
+    return SubstituteTemplate(symm_template, values)
+###################################################################################################
+###################################################################################################
+#
+# Emitter functions for all targets
+#
+###################################################################################################
+class EmitSymmConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+    self.instance_emitter = {
+      SymmKind.Universal: EmitSymmUniversalInstance,
+    }
+    self.symm_kind_wrappers = {
+      SymmKind.Universal: 'SymmOperation',
+    }
+    self.instance_template = {
+      SymmKind.Universal: """
+${compile_guard_start}
+  manifest.append(new ${symm_kind}<
+    Operation_${operation_name}
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+    }
+    self.header_template = """
+/*
+  Generated by symm_operation.py - Do not edit.
+*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+#include "library_internal.h"
+#include "symm_operation.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.initialize_function_template = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace library {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+void initialize_${configuration_name}(Manifest &manifest) {
+"""
+    self.epilogue_template = """
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+  def __enter__(self):
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+    self.instance_definitions = []
+    self.instance_wrappers = []
+    self.operations = []
+    return self
+  def emit(self, operation):
+    emitter = self.instance_emitter[operation.symm_kind]()
+    self.operations.append(operation)
+    self.instance_definitions.append(emitter.emit(operation))
+    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.symm_kind], {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'symm_kind': self.symm_kind_wrappers[operation.symm_kind],
+      'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "",
+      'compile_guard_end': "#endif" \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else ""
+      }))
+  def __exit__(self, exception_type, exception_value, traceback):
+    # Write instance definitions in top-level namespace
+    for instance_definition in self.instance_definitions:
+      self.configuration_file.write(instance_definition)
+    # Add wrapper objects within initialize() function
+    self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+      'configuration_name': self.configuration_name
+      }))
+    for instance_wrapper in self.instance_wrappers:
+      self.configuration_file.write(instance_wrapper)
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+###################################################################################################

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/cutlass_library/trmm_operation.py ADDED Viewed

	@@ -0,0 +1,447 @@

+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for emitting Trmm kernels
+"""
+import enum
+import functools
+import operator
+import os.path
+import shutil
+try:
+  import builtins
+  if hasattr(builtins, "CUTLASS_IGNORE_PACKAGE") and CUTLASS_IGNORE_PACKAGE == True:
+    raise ImportError("Disabling attempt to import cutlass_library")
+  from cutlass_library.library import *
+except ImportError:
+  from library import *
+###################################################################################################
+#
+# Data structure modeling a TRMM operation
+#
+###################################################################################################
+#
+class TrmmOperation:
+  #
+  def __init__(self, trmm_kind, arch, tile_description, A, B, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8):
+    self.operation_kind = OperationKind.Trmm
+    self.arch = arch
+    self.tile_description = tile_description
+    self.trmm_kind = trmm_kind
+    self.A = A
+    self.B = B
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.swizzling_functor = swizzling_functor
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex,
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+    return False
+  #
+  def is_planar_complex(self):
+#   return self.trmm_kind in (TrmmKind.PlanarComplex, TrmmKind.PlanarComplexArray)
+    return False
+  #
+  def is_mixed_input(self):
+    return self.A.element != self.B.element
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+    if self.is_complex():
+      return get_complex_from_real(accum)
+    return accum
+  #
+  def short_math_name(self):
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    inst_shape = ''
+    inst_operation = ''
+    intermediate_type = ''
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+      MathOperation.and_popc: 'and'
+    }
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \
+      self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
+      math_op = self.tile_description.math_instruction.math_operation
+      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      inst_shape += math_op_string
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+    return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, TrmmKindNames[self.trmm_kind])
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${element_c}_${core_name}_${element_a}"
+      elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${core_name}_${element_a}"
+      else:
+        extended_name = "${core_name}"
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+    return extended_name
+  #
+  def layout_name(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "%s%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)],
+        ShortComplexLayoutNames[(self.B.layout, self.B.complex_transform)]
+      )
+    return "%s%s" % (ShortLayoutTypeNames[self.A.layout], ShortLayoutTypeNames[self.B.layout])
+  #
+  def side_mode_name(self):
+    return "%s" % (ShortSideModeNames[self.A.side_mode])
+  #
+  def fill_mode_name(self):
+    return "%s" % (ShortFillModeNames[self.A.fill_mode])
+  #
+  def diag_type_name(self):
+    return "%s" % (ShortDiagTypeNames[self.A.diag_type])
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    threadblock = self.tile_description.procedural_name()
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+    alignment = max([self.C.alignment])
+    return SubstituteTemplate(
+      "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${side_mode}_${fill_mode}_${diag_type}_align${alignment}",
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'side_mode': self.side_mode_name(),
+        'fill_mode': self.fill_mode_name(),
+        'diag_type': self.diag_type_name(),
+        'alignment': "%d" % self.C.alignment,
+      }
+    )
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.procedural_name()
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+#
+class EmitTrmmUniversalInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+  def __init__(self):
+    self.trmm_template = """
+// Trmm operator ${operation_name}
+using Operation_${operation_name} =
+  typename cutlass::gemm::device::Trmm<
+    ${element_a}, ${layout_a},
+    ${side_mode}, ${fill_mode}, ${diag_type},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue},
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation}
+>;
+"""
+    self.trmm_complex_template = """
+// Trmm operator ${operation_name}
+using Operation_${operation_name} =
+  typename cutlass::gemm::device::Trmm<
+    ${element_a}, ${layout_a},
+    ${side_mode}, ${fill_mode}, ${diag_type},
+    ${element_b}, ${layout_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue},
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation},
+    ${transform_a}
+>;
+"""
+  def emit(self, operation):
+    threadblock_shape = operation.tile_description.threadblock_shape
+    warp_count = operation.tile_description.warp_count
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'side_mode' : SideModeTag[operation.A.side_mode],
+      'fill_mode': FillModeTag[operation.A.fill_mode],
+      'diag_type' : DiagTypeTag[operation.A.diag_type],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(1),  # TRMM A's alignment is always 1 for no padding to work until we make zfill work with variable bytes
+      'align_b': str(operation.B.alignment),
+      'split_k_serial': 'false',
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'transform_a': ComplexTransformTag[operation.A.complex_transform]
+    }
+    trmm_template = self.trmm_complex_template if operation.is_complex() else self.trmm_template
+    return SubstituteTemplate(trmm_template, values)
+###################################################################################################
+###################################################################################################
+#
+# Emitter functions for all targets
+#
+###################################################################################################
+class EmitTrmmConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+    self.instance_emitter = {
+      TrmmKind.Universal: EmitTrmmUniversalInstance,
+    }
+    self.trmm_kind_wrappers = {
+      TrmmKind.Universal: 'TrmmOperation',
+    }
+    self.instance_template = {
+      TrmmKind.Universal: """
+${compile_guard_start}
+  manifest.append(new ${trmm_kind}<
+    Operation_${operation_name}
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+    }
+    self.header_template = """
+/*
+  Generated by trmm_operation.py - Do not edit.
+*/
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+#include "library_internal.h"
+#include "trmm_operation.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+    self.initialize_function_template = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+namespace cutlass {
+namespace library {
+///////////////////////////////////////////////////////////////////////////////////////////////////
+void initialize_${configuration_name}(Manifest &manifest) {
+"""
+    self.epilogue_template = """
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace library
+} // namespace cutlass
+///////////////////////////////////////////////////////////////////////////////////////////////////
+"""
+  def __enter__(self):
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+    self.instance_definitions = []
+    self.instance_wrappers = []
+    self.operations = []
+    return self
+  def emit(self, operation):
+    emitter = self.instance_emitter[operation.trmm_kind]()
+    self.operations.append(operation)
+    self.instance_definitions.append(emitter.emit(operation))
+    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.trmm_kind], {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'trmm_kind': self.trmm_kind_wrappers[operation.trmm_kind],
+      'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "",
+      'compile_guard_end': "#endif" \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else ""
+      }))
+  def __exit__(self, exception_type, exception_value, traceback):
+    # Write instance definitions in top-level namespace
+    for instance_definition in self.instance_definitions:
+      self.configuration_file.write(instance_definition)
+    # Add wrapper objects within initialize() function
+    self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+      'configuration_name': self.configuration_name
+      }))
+    for instance_wrapper in self.instance_wrappers:
+      self.configuration_file.write(instance_wrapper)
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+###################################################################################################

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/docs_src/source/conf.py ADDED Viewed

	@@ -0,0 +1,132 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+# -- Path setup --------------------------------------------------------------
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+# Put the parent directory, the grandparent directory and ../../media/docs at the
+# front of the import path. os.path.abspath resolves these against the current
+# working directory, so they are relative to wherever the build is started from.
+sys.path.insert(0, os.path.abspath('..'))
+sys.path.insert(0, os.path.abspath('../..'))
+sys.path.insert(0, os.path.abspath('../../media/docs'))
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+project = 'CUTLASS Python interface'
+# `copyright` is the name Sphinx reads for this setting.
+copyright = '2023, NVIDIA'
+author = 'NVIDIA'
+release = '3.1.0'
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+        'myst_parser',
+        'nbsphinx',
+        'nbsphinx_link',
+        'sphinx_copybutton',
+        'sphinx.ext.autodoc',
+        'sphinx.ext.autosectionlabel',
+        'sphinx.ext.autosummary',
+        'sphinx.ext.coverage',
+        'sphinx.ext.extlinks',
+        'sphinx.ext.ifconfig',
+        'sphinx.ext.intersphinx',
+        'sphinx.ext.mathjax',
+        'sphinx.ext.napoleon',
+        'sphinx.ext.viewcode',
+        'sphinx_inline_tabs',
+        ]
+# Map each source file suffix to the file type used to parse it.
+source_suffix = {
+    '.rst': 'restructuredtext',
+    '.md': 'markdown',
+}
+# Per the autodoc documentation, 'description' places type hints in the
+# parameter descriptions instead of the signature.
+autodoc_typehints = 'description'
+pygments_style = "sphinx"
+# NOTE(review): presumably read by the 'furo' theme for its dark mode — confirm.
+pygments_dark_style = "monokai"
+templates_path = ['_templates']
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+# Ignore errors when converting notebooks
+nbsphinx_allow_errors = True
+language = 'en'
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+# Directory of static files, given relative to this configuration directory.
+html_static_path = ['_static']
+html_title = "CUTLASS Python"
+# NOTE(review): Sphinx documents html_baseurl as the base URL pointing at the
+# root of the HTML output; 'docs' is not an absolute URL — confirm this is intended.
+html_baseurl = 'docs'
+html_theme = 'furo'
+# Options passed to the theme: logos, brand colours for the light and dark
+# variants, and the footer icon list.
+html_theme_options = {
+	"light_logo": "cutlass-logo-small.png",
+	"dark_logo": "cutlass-logo-small.png",
+    # The same brand colour is used for both the light and the dark variant.
+    "light_css_variables": {
+        "color-brand-primary": "#76B900",
+        "color-brand-content": "#76B900",
+    },
+    "dark_css_variables": {
+        "color-brand-primary": "#76B900",
+        "color-brand-content": "#76B900",
+    },
+    # One footer icon: an inline SVG linking to the GitHub repository.
+    "footer_icons": [
+        {
+            "name": "GitHub",
+            "url": "https://github.com/NVIDIA/cutlass",
+            "html": """
+                <svg stroke="currentColor" fill="currentColor" stroke-width="0" viewBox="0 0 16 16">
+                    <path fill-rule="evenodd" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"></path>
+                </svg>
+            """,
+            "class": "",
+        },
+    ],
+}

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/__init__.py ADDED Viewed

	@@ -0,0 +1,36 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+from .int_tuple import *
+from .layout import *
+from .swizzle import *
+from .typing import *

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/int_tuple.py ADDED Viewed

	@@ -0,0 +1,225 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Functions for manipulating IntTuples
+"""
+from functools import reduce
+from itertools import chain
+from typing import Union
+from .typing import Integer
+def is_int(x):
+  return isinstance(x, Integer)
+def is_tuple(x):
+  return isinstance(x, tuple)
+def flatten(t):
+  if is_tuple(t):
+    if len(t) == 0:
+      return ()
+    else:
+      return tuple(i for a in t for i in flatten(a))
+  else:
+    return (t,)
+def signum(a):
+  return bool(a > 0) - bool(a < 0)
+def product(a):
+  if is_tuple(a):
+    return reduce(lambda val,elem : val*product(elem), a, 1)
+  else:
+    return a
+def inner_product(a, b):
+  if is_tuple(a):                      # tuple tuple
+    assert len(a) == len(b)
+    return sum(inner_product(x,y) for x,y in zip(a,b))
+  else:                                # "int" "int"
+    assert not is_tuple(b)
+    return a * b
+def tuple_max(a):
+  if is_tuple(a):
+    return max(tuple_max(x) for x in a)
+  else:
+    return a
+def elem_scale(a, b):
+  if is_tuple(a):
+    if is_tuple(b):                     # tuple tuple
+      assert len(a) == len(b)
+      return tuple(elem_scale(x,y) for x,y in zip(a,b))
+    else:                               # tuple "int"
+      assert False           # Error
+  else:
+    if is_tuple(b):                     # "int" tuple
+      return elem_scale(a, product(b))
+    else:                               # "int" "int"
+      return a * b
+# Inclusive prefix ceil div with output congruent to input a
+def shape_div(a, b):
+  if is_tuple(a):
+    if is_tuple(b):                    # tuple tuple
+      assert len(a) == len(b)
+      return tuple(shape_div(x,y) for x,y in zip(a,b))
+    else:                              # tuple "int"
+      #r = [shape_div(a[0],b)] + [shape_div(a[i],b := shape_div(b, product(a[i-1]))) for i in range(1,len(a))]
+      r = []
+      for v in a:
+        r.append(shape_div(v,b))
+        b = shape_div(b,product(v))
+      return tuple(r)
+  else:
+    if is_tuple(b):                    # "int" tuple
+      return shape_div(a, product(b))
+    else:                              # "int" "int"
+      assert a % b == 0 or b % a == 0
+      return (a + b - 1) // b
+# Exclusive prefix product with output congruent to input a
+def prefix_product(a, init=1):
+  if is_tuple(a):
+    if is_tuple(init):                 # tuple tuple
+      assert len(a) == len(init)
+      return tuple(prefix_product(x,i) for x,i in zip(a,init))
+    else:                              # tuple "int"
+      #r = [prefix_product(a[0],init)] + [prefix_product(a[i],init := init * product(a[i-1])) for i in range(1,len(a))]
+      r = []
+      for v in a:
+        r.append(prefix_product(v,init))
+        init = init * product(v)
+      return tuple(r)
+  else:
+    if is_tuple(init):                 # "int" tuple
+      assert False           # Error
+    else:                              # "int" "int"
+      return init
+def idx2crd(idx, shape, stride=None):
+  if stride is None:
+    stride = prefix_product(shape)
+  if is_tuple(idx):
+    if is_tuple(shape):                # tuple tuple tuple
+      assert len(idx) == len(shape) and len(idx) == len(stride)
+      return tuple(idx2crd(i, s, d) for i, s, d in zip(idx,shape,stride))
+    else:                              # tuple "int" "int"
+      assert False           # Error
+  else:
+    if is_tuple(shape):                # "int" tuple tuple
+      assert len(shape) == len(stride)
+      return tuple(idx2crd(idx, s, d) for s,d in zip(shape,stride))
+    else:                              # "int" "int" "int"
+      return (idx // stride) % shape
+def crd2idx(crd, shape, stride=None):
+  if stride is None:
+    stride = prefix_product(shape)
+  if is_tuple(crd):
+    if is_tuple(shape):                # tuple tuple tuple
+      assert len(crd) == len(shape) and len(crd) == len(stride)
+      return sum(crd2idx(c, s, d) for c, s, d in zip(crd, shape, stride))
+    else:                              # tuple "int" "int"
+      assert False, f"crd={crd}, shape={shape}"           # Error
+  else:
+    if crd is None:
+      crd = 0
+    if is_tuple(shape):                # "int" tuple tuple
+      assert len(shape) == len(stride)
+      result = 0
+      for i in range(len(shape)-1):
+        result += crd2idx(crd % product(shape[i]), shape[i], stride[i])
+        crd = crd // product(shape[i])
+      return result + crd2idx(crd, shape[-1], stride[-1])
+    else:                              # "int" "int" "int"
+      return crd * stride
+# Transform crd into the dst_shape's iteration space
+def crd2crd(crd, dst_shape, src_shape=None):
+  if is_tuple(crd):
+    if is_tuple(dst_shape):            # tuple tuple
+      assert len(crd) == len(dst_shape)
+      return tuple(crd2crd(x, y) for x, y in zip(crd,dst_shape))
+    else:                              # tuple "int"
+      # Ambiguous unless we have src_shape
+      assert src_shape is not None
+      return crd2idx(crd, src_shape)
+  else:
+    if is_tuple(dst_shape):            # "int" tuple
+      return idx2crd(crd, dst_shape)
+    else:                              # "int" "int"
+      assert crd < dst_shape
+      return crd
+# Filter trg according to crd: keep only elements of trg that are paired with None
+def slice_(crd: Union[None, tuple, int],
+           trg: Union[tuple, int]):
+  if is_tuple(crd):
+    if is_tuple(trg):                  # tuple tuple
+      assert len(crd) == len(trg)
+      # match C++ behavior of `filter_tuple` using `tuple_cat(...)`
+      return tuple(chain(*filter(lambda x: x != (), [slice_(c, s) for c, s in zip(crd, trg)])))
+    else:
+      assert False                     # tuple "int" : Error
+  elif crd is None:
+    # match C++ behavior `return cute::tuple<B>{b};`
+    return (trg,)
+  else:
+    return ()
+# Determine if None appears at any of an int_tuple's terminals
+def has_none(a: Union[None, tuple, int]):
+  if is_tuple(a):
+    return any(has_none(v) for v in a)
+  else:
+    return a is None

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/layout.py ADDED Viewed

	@@ -0,0 +1,367 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Definition of CuTe Layouts and functions to manipulate them
+"""
+from itertools import chain
+from typing import Union
+from .int_tuple import *
+class LayoutBase:
+  pass
+def is_layout(x):
+  return isinstance(x, LayoutBase)
+class Layout(LayoutBase):
+  def __init__(self, _shape, _stride=None):
+    self.shape  = _shape
+    if _stride is None:
+      self.stride = prefix_product(self.shape)
+    else:
+      self.stride = _stride
+  # operator ==
+  def __eq__(self, other):
+    return self.shape == other.shape and self.stride == other.stride
+  # operator len(L)  (len [rank] like tuples)
+  def __len__(self):
+    if is_tuple(self.shape):
+      return len(self.shape)
+    else:
+      return 1
+  # operator ()    (map coord to idx)
+  def __call__(self, *args):
+    """
+    Map a logical coordinate to a linear index (Coord has no Underscore slice operators)
+    OR
+    Slice the layout and return the sublayout (Coord has an Underscore slice op)
+    Follow the same behavior of `Layout::operator(Coord const&)` in cute C++
+    """
+    if has_none(args):
+      if len(args) == 1:
+        return Layout(slice_(args[0], self.shape), slice_(args[0], self.stride))
+      else:
+        return Layout(slice_(args, self.shape), slice_(args, self.stride))
+    else:
+      if len(args) == 1:
+        return crd2idx(args[0], self.shape, self.stride)
+      else:
+        return crd2idx(args, self.shape, self.stride)
+  # operator []    (get-i like tuples)
+  def __getitem__(self, i):
+    if is_tuple(self.shape):
+      return Layout(self.shape[i], self.stride[i])
+    else:
+      assert i == 0
+      return Layout(self.shape, self.stride)
+  # size(layout)   Size of the domain
+  def size(self):
+    return product(self.shape)
+  # cosize(layout)   Size of the codomain
+  def cosize(self):
+    return self(self.size() - 1) + 1
+  # print and str
+  def __str__(self):
+    return f"{self.shape}:{self.stride}"
+  # error msgs and representation
+  def __repr__(self):
+    return f"Layout({self.shape},{self.stride})"
+# Make Layout from a list of layouts (each layout is its own mode in the result)
+def make_layout(*layouts):
+  if len(layouts) == 1 and not is_layout(layouts[0]):
+    layouts = layouts[0]
+  shape, stride = zip(*((a.shape,a.stride) for a in layouts))
+  return Layout(shape, stride)
+# Size of the domain
+def size(layout):
+  if is_layout(layout):
+    return layout.size()
+  return product(layout)
+# Size of the codomain
+def cosize(layout):
+  return layout.cosize()
+# Layout coalesce -- flatten and combine as many modes as possible while preserving the int-to-int function
+def coalesce(layout, profile=None):
+  if is_tuple(profile):
+    assert len(layout) >= len(profile)
+    return make_layout(chain((coalesce(layout[i], profile[i]) for i in range(           0,len(profile))),
+                             (layout[i]                       for i in range(len(profile),len(layout)))))
+  result_shape  = [1]
+  result_stride = [0]
+  for (shape,stride) in zip(flatten(layout.shape),flatten(layout.stride)):
+    # skip their shape-1s
+    if shape == 1:
+      continue
+    # replace our shape-1 with anything
+    elif result_shape[-1] == 1:
+      result_shape[-1]  = shape
+      result_stride[-1] = stride
+    # merge modes if the shape*stride match
+    elif result_shape[-1] * result_stride[-1] == stride:
+      result_shape[-1] = result_shape[-1] * shape
+    # append a new mode
+    else:
+      result_shape.append(shape)
+      result_stride.append(stride)
+  if len(result_shape) == 1:
+    return Layout(result_shape[0], result_stride[0])
+  else:
+    return Layout(tuple(result_shape), tuple(result_stride))
+# Layout filter -- replace all stride-0 modes with size-1 and then coalesce to remove them
+def filter(layout, profile=None):
+  if is_tuple(profile):
+    assert len(layout) >= len(profile)
+    return make_layout(chain((filter(layout[i], profile[i]) for i in range(           0,len(profile))),
+                             (layout[i]                     for i in range(len(profile),len(layout)))))
+  result_shape  = []
+  result_stride = []
+  for (shape,stride) in zip(flatten(layout.shape),flatten(layout.stride)):
+    # skip their shape-1s and stride-0s
+    if not (shape == 1 or stride == 0):
+      result_shape.append(shape)
+      result_stride.append(stride)
+  if len(result_shape) == 0:
+    return Layout(1,0)
+  else:
+    return coalesce(Layout(tuple(result_shape), tuple(result_stride)))
+# Layout composition
+# Use tuples-of-layouts to perform this operation by-mode and None as no-op
+def composition(layoutA, layoutB):
+  """
+  Compose `layoutA` with `layoutB`.
+  `layoutB` may be None (no-op, returns `layoutA`), an int (wrapped in a
+  Layout), a tuple (applied by-mode to the leading modes of `layoutA`, the
+  remaining modes passing through), or a Layout. A Layout with a tuple shape
+  is composed one mode at a time.
+  """
+  if layoutB is None:
+    return layoutA
+  elif is_int(layoutB):
+    return composition(layoutA, Layout(layoutB))
+  elif is_tuple(layoutB):
+    assert len(layoutA) >= len(layoutB)
+    return make_layout(chain((composition(layoutA[i], layoutB[i]) for i in range(           0,len(layoutB))),
+                             (layoutA[i]                          for i in range(len(layoutB),len(layoutA)))))
+  elif is_tuple(layoutB.shape):
+    # multi-mode B: compose A with each mode of B separately
+    return make_layout(composition(layoutA, layoutB_i) for layoutB_i in layoutB)
+  # From here on layoutB is a single mode with scalar shape and stride
+  if layoutB.stride == 0:
+    return Layout(layoutB.shape, 0)
+  else:
+    result_shape  = []
+    result_stride = []
+    # the part of B's shape / stride not yet consumed by a mode of A
+    rest_shape    = layoutB.shape
+    rest_stride   = layoutB.stride
+    flat_A = coalesce(layoutA)
+    # walk every mode of the coalesced A except the last, which is handled after the loop
+    for (curr_shape, curr_stride) in zip(flatten(flat_A.shape)[:-1], flatten(flat_A.stride)[:-1]):
+      # one of the two values must divide the other
+      assert curr_shape % rest_stride == 0 or rest_stride % curr_shape == 0
+      new_shape = min(max(1, curr_shape // rest_stride), rest_shape)
+      if new_shape != 1:
+        result_shape.append(new_shape)
+        result_stride.append(rest_stride * curr_stride)
+      rest_shape  = rest_shape // new_shape
+      rest_stride = -(-rest_stride // curr_shape)  # Python exclusive impl: "//" is always floor div so == ceil_div(abs(rest_stride), curr_shape) * signum(rest_stride)
+    # whatever is left goes into the last mode of A (always emit at least one mode)
+    if rest_shape != 1 or len(result_shape) == 0:
+      result_shape.append(rest_shape)
+      result_stride.append(rest_stride * flatten(flat_A.stride)[-1])
+    # a single resulting mode is returned with scalar shape/stride
+    if len(result_shape) == 1:
+      return Layout(result_shape[0], result_stride[0])
+    else:
+      return Layout(tuple(result_shape), tuple(result_stride))
+# Layout complement
+def complement(layout, max_idx=1):
+  if is_int(layout):
+    return complement(Layout(layout))
+  result_shape  = []
+  result_stride = []
+  current_idx = 1
+  sorted_DS = sorted(zip(flatten(layout.stride), flatten(layout.shape)))
+  for (stride, shape) in sorted_DS:
+    if stride == 0 or shape == 1:
+      continue
+    in_bound = current_idx <= shape * stride
+    # To support symbolic value which can't be evaluated now
+    assert (type(in_bound) is not bool) or in_bound
+    result_shape.append(stride // current_idx)
+    result_stride.append(current_idx)
+    current_idx = shape * stride
+  result_shape.append((max_idx + current_idx - 1) // current_idx)  # ceil_div
+  result_stride.append(current_idx)
+  return coalesce(Layout(tuple(result_shape), tuple(result_stride)))
+# Layout right inverse
+def right_inverse(layout):
+  if layout is None:
+    return None
+  elif is_int(layout):
+    return Layout(layout)
+  result_shape  = []
+  result_stride = []
+  current_idx = 1
+  flat_shape  = flatten(layout.shape)
+  flat_stride = flatten(layout.stride)
+  sorted_DSA = sorted(zip(flat_stride, flat_shape, prefix_product(flat_shape)))
+  for (stride,shape,rstride) in sorted_DSA:
+    if shape == 1:
+      continue
+    if current_idx != stride:
+      break
+    result_shape.append(shape)
+    result_stride.append(rstride)
+    current_idx = shape * stride
+  return coalesce(Layout(tuple(result_shape), tuple(result_stride)))
+# Layout left inverse
+def left_inverse(layout):
+  if layout is None:
+    return None
+  elif is_int(layout):
+    return Layout(layout)
+  return right_inverse(make_layout(layout, complement(layout)))
+# Split a layout by the composition of B and the "rest"
+# Use tuples-of-layouts to perform this operation by-mode and None as no-op
+def logical_divide(layoutA, layoutB):
+  if layoutB is None:
+    return layoutA
+  elif is_int(layoutB):
+    return logical_divide(layoutA, Layout(layoutB))
+  elif is_tuple(layoutB):
+    assert len(layoutA) >= len(layoutB)
+    return make_layout(chain((logical_divide(layoutA[i], layoutB[i]) for i in range(           0,len(layoutB))),
+                             (layoutA[i]                             for i in range(len(layoutB),len(layoutA)))))
+  return composition(layoutA, make_layout(layoutB, complement(layoutB, size(layoutA))))
+# Reproduce a layoutA over a layoutB
+# Use tuples-of-layouts to perform this operation by-mode and None as no-op
+def logical_product(layoutA, layoutB):
+  if layoutB is None:
+    return layoutA
+  elif is_int(layoutB):
+    return logical_divide(layoutA, Layout(layoutB))
+  elif is_tuple(layoutB):
+    assert len(layoutA) >= len(layoutB)
+    return make_layout(chain((logical_product(layoutA[i], layoutB[i]) for i in range(           0,len(layoutB))),
+                             (layoutA[i]                              for i in range(len(layoutB),len(layoutA)))))
+  return make_layout(layoutA, composition(complement(layoutA, size(layoutA)*cosize(layoutB)), layoutB));
+# Gather the modes from a hierarchical logical_divide or logical_product
+def hier_unzip(splitter, layoutA, layoutB):
+  if layoutB is None:
+    return make_layout(Layout(1,0), layoutA)
+  elif is_tuple(layoutB):
+    assert len(layoutA) >= len(layoutB)
+    # A layout with shape ((A,a),(B,b),(C,c))
+    split = make_layout(hier_unzip(splitter, layoutA[i], layoutB[i]) for i in range(0,len(layoutB)))
+    # Gather to shape ((A,B,C,...),(a,b,c,...,y,z))
+    return make_layout(make_layout(       split[i][0] for i in range(           0,len(layoutB))),
+                       make_layout(chain((split[i][1] for i in range(           0,len(layoutB))),
+                                         (layoutA[i]  for i in range(len(layoutB),len(layoutA))))))
+  # splitter must return a rank-2 layout
+  return splitter(layoutA, layoutB)
+# Apply logical divide hierarchically and gather the split modes into two modes
+def zipped_divide(layoutA, layoutB):
+  return hier_unzip(logical_divide, layoutA, layoutB)
+# Perform logical divide hierarchically and gather tiles (B-layouts) into a new mode
+def tiled_divide(layoutA, layoutB):
+  result = zipped_divide(layoutA, layoutB)
+  return make_layout([result[0]] + [result[1][i] for i in range(len(result[1]))])
+# Apply logical product hierarchically and gather the split modes into two modes
+def zipped_product(layoutA, layoutB):
+  return hier_unzip(logical_product, layoutA, layoutB)
+# Perform logical product hierarchically and gather tiles (B-layouts) into a new mode
+def tiled_product(layoutA, layoutB):
+  result = zipped_product(layoutA, layoutB)
+  return make_layout([result[0]] + [result[1][i] for i in range(len(result[1]))])
+def slice_and_offset(crd: tuple,
+                     layout: Layout):
+  return (Layout(slice_(crd, layout.shape), slice_(crd, layout.stride)),
+          crd2idx(crd, layout.shape, layout.stride))

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/swizzle.py ADDED Viewed

	@@ -0,0 +1,129 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Methods for layout swizzling
+"""
+from .layout import *
+def shiftr(a, s):
+  return a >> s if s > 0 else shiftl(a, -s)
+def shiftl(a, s):
+  return a << s if s > 0 else shiftr(a, -s)
+## A generic Swizzle functor
+ # 0bxxxxxxxxxxxxxxxYYYxxxxxxxZZZxxxx
+ #                               ^--^  Base is the number of least-sig bits to keep constant
+ #                  ^-^       ^-^      Bits is the number of bits in the mask
+ #                    ^---------^      Shift is the distance to shift the YYY mask
+ #                                       (pos shifts YYY to the right, neg shifts YYY to the left)
+ #
+ # e.g. Given
+ # 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxZZxxx
+ # the result is
+ # 0bxxxxxxxxxxxxxxxxYYxxxxxxxxxAAxxx where AA = ZZ xor YY
+ #
+class Swizzle:
+  def __init__(self, bits, base, shift):
+    assert bits >= 0
+    assert base >= 0
+    assert abs(shift) >= bits
+    self.bits = bits
+    self.base = base
+    self.shift = shift
+    bit_msk = (1 << bits) - 1
+    self.yyy_msk = bit_msk << (base + max(0,shift))
+    self.zzz_msk = bit_msk << (base - min(0,shift))
+  # operator ()    (transform integer)
+  def __call__(self, offset):
+    return offset ^ shiftr(offset & self.yyy_msk, self.shift)
+  # Size of the domain
+  def size(self):
+    return 1 << (self.bits + self.base + abs(self.shift))
+  # Size of the codomain
+  def cosize(self):
+    return self.size()
+  # print and str
+  def __str__(self):
+    return f"SW_{self.bits}_{self.base}_{self.shift}"
+  # error msgs and representation
+  def __repr__(self):
+    return f"Swizzle({self.bits},{self.base},{self.shift})"
+class ComposedLayout(LayoutBase):
+  def __init__(self, layoutB, offset, layoutA):
+    self.layoutB = layoutB
+    self.offset  = offset
+    self.layoutA = layoutA
+  # operator ==
+  def __eq__(self, other):
+    return self.layoutB == other.layoutB and self.offset == other.offset and self.layoutA == other.layoutA
+  # operator len(L)  (len [rank] like tuples)
+  def __len__(self):
+    return len(self.layoutA)
+  # operator ()    (map coord to idx)
+  def __call__(self, *args):
+    return self.layoutB(self.offset + self.layoutA(*args))
+  # operator []    (get-i like tuples)
+  def __getitem__(self, i):
+    return ComposedLayout(self.layoutB, self.offset, self.layoutA[i])
+  # size(layout)   Size of the domain
+  def size(self):
+    return size(self.layoutA)
+  # cosize(layout)   Size of the codomain
+  def cosize(self):
+    return cosize(self.layoutB)
+  # print and str
+  def __str__(self):
+    return f"{self.layoutB} o {self.offset} o {self.layoutA}"
+  # error msgs and representation
+  def __repr__(self):
+    return f"ComposedLayout({repr(self.layoutB)},{repr(self.offset)},{repr(self.layoutA)})"

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/pycute/typing.py ADDED Viewed

	@@ -0,0 +1,42 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+from abc import ABC
+class Integer(ABC):
+    @classmethod
+    def __subclasshook__(cls, c):
+        if c in [bool, float]:
+            return False
+        return issubclass(c, int)

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/setup_cutlass.py ADDED Viewed

	@@ -0,0 +1,74 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+import copy
+import os
+import setuptools
+from setuptools import setup
+from setuptools.command.build_ext import build_ext
+import setup_pycute
+import setup_library
+# Install cutlass_library package
+# (calls the perform_setup() function of the imported setup_library module)
+setup_library.perform_setup()
+# Install the PyCuTe package
+# (calls the perform_setup() function of the imported setup_pycute module)
+setup_pycute.perform_setup()
+# Package metadata for the cutlass_cppgen package and its sub-packages.
+# NOTE(review): the version below is '4.2.0' while setup_library.py and setup_pycute.py
+# both declare '4.2.1' -- confirm whether this mismatch is intentional.
+setup(
+    name='cutlass_cppgen',
+    version='4.2.0',
+    description='CUTLASS Pythonic Interface',
+    # Packages are looked up relative to the directory containing this script
+    package_dir={'': '.'},
+    packages=[
+        'cutlass_cppgen',
+        'cutlass_cppgen.emit',
+        'cutlass_cppgen.op',
+        'cutlass_cppgen.utils',
+        'cutlass_cppgen.backend',
+        'cutlass_cppgen.backend.utils'
+        ],
+    # pybind11 is listed both as a setup-time and as an install-time requirement
+    setup_requires=['pybind11'],
+    install_requires=[
+        'bfloat16',
+        'cuda-python>=11.8.0',
+        'pybind11',
+        'scikit-build',
+        'treelib',
+        'pydot'
+        ]
+)

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/setup_library.py ADDED Viewed

	@@ -0,0 +1,46 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+from setuptools import setup
+def perform_setup():
+    setup(
+        name='cutlass_library',
+        version='4.2.1',
+        description='CUTLASS library generation scripts',
+        packages=['cutlass_library']
+    )
+if __name__ == '__main__':
+    perform_setup()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/python/setup_pycute.py ADDED Viewed

	@@ -0,0 +1,46 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+from setuptools import setup
+def perform_setup():
+    setup(
+        name='pycute',
+        version='4.2.1',
+        description='Python implementation of CuTe',
+        packages=['pycute'],
+    )
+if __name__ == '__main__':
+    perform_setup()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/conv2d/conv2d_problem_sizes.py ADDED Viewed

	@@ -0,0 +1,661 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utilities for defining Conv2D problem sizes for testing.
+This file was ported from the C++ version in test/unit/conv/device/conv2d_problems.h
+"""
+from cutlass_library import ConvMode
+import cutlass_cppgen
+from cutlass_cppgen.shape import Conv2DProblemSize
+class TestbedConv2dProblemSizes:
+    def __init__(self, minimum_channel_size: int):
+        conv2d_default_sizes = self.initialize_conv2d_default_sizes(minimum_channel_size)
+        conv2d_rigorous_sizes = self.initialize_conv2d_rigorous_sizes(minimum_channel_size)
+        conv2d_resnet50_sizes = self.initialize_conv2d_resnet50_sizes(1)
+        conv2d_resnet50_sizes_perf = self.initialize_conv2d_resnet50_sizes(34)
+        grouped_sizes = self.initialize_conv2d_grouped_sizes()
+        # Filter all problems
+        self.all = []
+        for size_list in [conv2d_default_sizes, conv2d_rigorous_sizes, conv2d_resnet50_sizes, conv2d_resnet50_sizes_perf, grouped_sizes]:
+            for size in size_list:
+                if (size.C // size.groups) % minimum_channel_size == 0:
+                    self.all.append(size)
+    def initialize_conv2d_default_sizes(self, minimum_channel_size):
+        # Small input size x stride (1,1)
+        # C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
+        conv2d_default_sizes = []
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 1, 1, minimum_channel_size,
+          8, 1, 1, minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 1, 8, minimum_channel_size,
+          8, 1, 3, minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 7, 8, minimum_channel_size,
+          8, 3, 3, minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 7, 9, minimum_channel_size,
+          8, 4, 4, minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          2, 7, 9, minimum_channel_size,
+          8, 5, 5, minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          3, 7, 9, minimum_channel_size,
+          8, 6, 5, minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          3, 7, 9, minimum_channel_size,
+          8, 6, 6, minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          3, 7, 9, minimum_channel_size,
+          8, 7, 7, minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        ##############################################
+        # Small input size x stride (2,2)
+        # C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
+        ##############################################
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 11, 7, minimum_channel_size,
+          8, 1, 1, minimum_channel_size,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 11, 7, minimum_channel_size,
+          8, 3, 3, minimum_channel_size,
+          1, 1,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 13, 11, minimum_channel_size,
+          8, 1, 1, minimum_channel_size,
+          1, 1,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 17, 19, minimum_channel_size,
+          16, 2, 2, minimum_channel_size,
+          1, 1,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 23, 5, minimum_channel_size,
+          16, 3, 3, minimum_channel_size,
+          1, 1,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 13, 17, 8,
+          24, 3, 3, 8,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 23, 21, 8,
+          24, 3, 3, 8,
+          1, 1,
+          3, 3,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 20, 24, 8,
+          40, 3, 3, 8,
+          3, 3,
+          3, 3,
+          1, 1,
+        ))
+        ##########################################
+        # Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1)
+        ##########################################
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 15, 19, 160,
+          224, 1, 1, 160,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 19, 37, 160,
+          224, 3, 3, 160,
+          1, 1,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 16, 16, 160,
+          224, 2, 3, 160,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 23, 21, 128,
+          224, 3, 3, 128,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 29, 37, 160,
+          224, 5, 5, 160,
+          2, 2,
+          1, 1,
+          1, 1,
+        ))
+        ##########################################
+        # C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
+        ##########################################
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 15, 19, 32 + minimum_channel_size,
+          96, 3, 3, 32 + minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 16, 24, 64 + minimum_channel_size,
+          96, 3, 3, 64 + minimum_channel_size,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        ##########################################
+        # Medium input size, filter size (1x1, 3,x3, 5x5, 7x7), stride (2, 2)
+        ##########################################
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 13, 16, 288,
+          160, 5, 5, 288,
+          2, 2,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 55, 51, 256,
+          512, 1, 1, 256,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 71, 80, 32,
+          64, 5, 5, 32,
+          2, 2,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 224, 224, 8,
+          64, 7, 7, 8,
+          3, 3,
+          2, 2,
+          1, 1,
+        ))
+        ##########################################
+        # Medium input size stride (3, 3), filter (3, 3), non-default padding
+        ##########################################
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 27, 23, 256,
+          512, 3, 3, 256,
+          0, 0,
+          3, 3,
+          1, 1,
+        ))
+        ##########################################
+        # Medium input size padding > stride, asymmetric filter, padding and striding
+        ##########################################
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 27, 31, 256,
+          512, 3, 3, 256,
+          5, 7,
+          3, 4,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 27, 35, 256,
+          512, 7, 5, 256,
+          11, 7,
+          3, 5,
+          1, 1,
+        ))
+        ##########################################
+        # Medium input size *mixed* stride (1, 2) and (2, 1),
+        # filter (3, 3), default padding
+        ##########################################
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 27, 27, 256,
+          512, 3, 3, 256,
+          1, 1,
+          1, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          1, 27, 27, 256,
+          512, 3, 3, 256,
+          1, 1,
+          2, 1,
+          1, 1,
+        ))
+        ######################################/
+        # Additional input size
+        ######################################/
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          3, 28, 28, 256,
+          256, 2, 2, 256,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+           1, 32, 32, 16,
+           32, 3, 3, 16,
+           1, 1,
+           6, 2,
+           1, 1,
+         ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          32, 24, 32, 32,
+          32, 1, 2, 32,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_default_sizes.append(Conv2DProblemSize(
+          4, 2, 3, 256,
+          328, 3, 5, 256,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        return conv2d_default_sizes
+    # Add a few large and rigorous convolution problem sizes
+    def initialize_conv2d_rigorous_sizes(self, minimum_channel_size):
+        sizes = []
+        if False:
+            sizes.append(Conv2DProblemSize.from_sizes(
+              (1, 124, 224, 2 * minimum_channel_size),
+              (24, 7, 7, 2 * minimum_channel_size),
+            ))
+            sizes.append(Conv2DProblemSize.from_sizes(
+              (1, 233, 35, minimum_channel_size),
+              (24, 7, 5, minimum_channel_size),
+            ))
+        return sizes
+    # Add resent50 layers to unit testing sizes
+    def initialize_conv2d_resnet50_sizes(self, batch_size):
+        conv2d_problem_vector = []
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 64,
+          256, 1, 1, 64,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 64,
+          64, 1, 1, 64,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 64,
+          64, 3, 3, 64,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 256,
+          64, 1, 1, 256,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 256,
+          512, 1, 1, 256,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 56, 56, 256,
+          128, 1, 1, 256,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 128,
+          128, 3, 3, 128,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 128,
+          512, 1, 1, 128,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 512,
+          128, 1, 1, 512,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 512,
+          1024, 1, 1, 512,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 28, 28, 512,
+          256, 1, 1, 512,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 256,
+          256, 3, 3, 256,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 256,
+          1024, 1, 1, 256,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 1024,
+          256, 1, 1, 1024,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 1024,
+          2048, 1, 1, 1024,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 14, 14, 1024,
+          512, 1, 1, 1024,
+          0, 0,
+          2, 2,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 7, 7, 512,
+          512, 3, 3, 512,
+          1, 1,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 7, 7, 512,
+          2048, 1, 1, 512,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        conv2d_problem_vector.append(Conv2DProblemSize(
+          batch_size, 7, 7, 2048,
+          512, 1, 1, 2048,
+          0, 0,
+          1, 1,
+          1, 1,
+        ))
+        return conv2d_problem_vector
+    def initialize_conv2d_grouped_sizes(self):
+        threadblock_n = 128
+        threadblock_k = 32
+        sizes = []
+        ##########################################
+        # One group calculated by one or multiple CTAs: k_per_group % CTA::N = 0
+        # One CTA calculates a single group
+        ##########################################
+        for cta_per_group_k in range(1, 4):
+            for groups in range(2, 5):
+                conv_k = cta_per_group_k * threadblock_n * groups
+                sizes.append(Conv2DProblemSize(
+                  1, 8, 8, threadblock_k * 2 * groups,
+                  conv_k, 3, 3, threadblock_k * 2,
+                  1, 1,
+                  1, 1,
+                  1, 1,
+                  ConvMode.CrossCorrelation,
+                  1,
+                  groups
+                ))
+        # Partial gemm_k: k_per_group == CTA::N && channels_per_group < CTA::K
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k,
+          threadblock_n * 2, 3, 3, threadblock_k // 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          2
+        ))
+        sizes.append(Conv2DProblemSize(
+          1, 56, 56, 696,
+          768, 3, 3, 232,
+          1, 1,
+          2, 2,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          3
+        ))
+        sizes.append(Conv2DProblemSize(
+          1, 14, 14, 1392,
+          1536, 3, 3, 232,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          3
+        ))
+        ##########################################
+        # One CTA calculate multiple groups: CTA::N % k_per_group = 0
+        ##########################################
+        # 2 groups per CTA
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k * 4,
+          threadblock_n, 3, 3, threadblock_k * 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          2
+        ))
+        # 2 groups per CTA and partial gemm_k
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k,
+          threadblock_n, 3, 3, threadblock_k // 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          2
+        ))
+        # 4 groups per CTA
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k * 8,
+          threadblock_n // 2, 3, 3, threadblock_k * 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          4
+        ))
+        # 4 groups per CTA and partial gemm_k
+        sizes.append(Conv2DProblemSize(
+          1, 8, 8, threadblock_k * 2,
+          threadblock_n // 2, 3, 3, threadblock_k // 2,
+          1, 1,
+          1, 1,
+          1, 1,
+          ConvMode.CrossCorrelation,
+          1,
+          4
+        ))
+        return sizes

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/conv2d/conv2d_sm80.py ADDED Viewed

	@@ -0,0 +1,146 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Low-level functionality tests for Conv2d operations on SM80
+"""
+import logging
+import unittest
+import cutlass_cppgen
+from cutlass_cppgen.backend.utils.device import device_cc
+from conv2d_test_utils import *
+# Only emit warnings and above from cutlass_cppgen while the tests run
+cutlass_cppgen.set_log_level(logging.WARNING)
+# Compute capability targeted by every test in this file; also used as the skip threshold
+cc = 80
+@unittest.skipIf(device_cc() < cc, 'Device compute capability is invalid for SM80 tests.')
+class Conv2dSm80(unittest.TestCase):
+    """
+    Wrapper class passed to the module-level add_test(...) calls in this file.
+    The class body is intentionally empty; presumably add_test attaches the generated
+    test methods to it (add_test comes from conv2d_test_utils -- confirm there).
+    The whole class is skipped when device_cc() is below ``cc``.
+    """
+    pass
+# Problem sizes shared by most tests below.
+# get_conv_problems / add_test / conv2d_few_channel_problemsizes are not defined in this
+# file; presumably they come from the star import of conv2d_test_utils -- confirm there.
+conv_problems = get_conv_problems()
+# Tests for optimized & analytic
+# Every configuration in this loop is registered once per convolution kind
+for conv_kind in ["fprop", "wgrad", "dgrad"]:
+    # F16, simt
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+        opclass="simt", threadblock_shape=[128, 128, 8],
+        warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
+    # F16, tensor op
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
+    # F16, tensor op, analytic iterator
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
+    # F16, tensor op, f32 output
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
+    # F16, tensor op, different tile description
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 64, 32],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
+    # F32, simt
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32,
+        opclass="simt", threadblock_shape=[128, 128, 8],
+        warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
+    # Tf32, tensorop
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f32,
+        opclass="tensor_op", threadblock_shape=[128, 128, 16],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8]
+    )
+    # Split-K
+    # Registered twice: serial mode with 2 slices, then parallel mode with 5 slices
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
+        split_k_slices=2)
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
+        split_k_slices=5)
+    # Swizzling functor
+    add_test(
+        Conv2dSm80, cc, conv_kind, conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 64, 32],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)
+# Tests for few channels and fixed channels
+# F16, tensor op, few channels
+# zip pairs the i-th value of c with the i-th threadblock shape, stage count and
+# instruction shape, so two configurations are registered (fprop only)
+for c, tb, stage, inst in zip([2, 1],
+                                [[128, 128, 64], [128, 128, 32]],
+                                [3, 2],
+                                [[16, 8, 16], [16, 8, 8]]):
+    add_test(
+        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+        opclass="tensor_op", threadblock_shape=tb,
+        warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
+    )
+# F16, tensor op, fixed channels
+# Same tile configuration for each value of c; only the problem-size list changes
+for c in [8, 4, 2]:
+    add_test(
+        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+        opclass="tensor_op", threadblock_shape=[128, 128, 64],
+        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
+    )
+# Test activations
+# For each activation, zip yields the (mode, slices) pairs
+# ("parallel", 1), ("serial", 7) and ("parallel", 5)
+for activation in ["relu", "leaky_relu"]:
+    for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
+        add_test(
+            Conv2dSm80, cc, "fprop", conv_problems, cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.f32, cutlass_cppgen.DataType.f16,
+            opclass="tensor_op", threadblock_shape=[128, 128, 64],
+            warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
+            split_k_slices=split_k_slices, activation=activation)
+# Run the tests registered above when this file is executed directly
+if __name__ == '__main__':
+    unittest.main()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/conv2d/conv2d_test_utils.py ADDED Viewed

	@@ -0,0 +1,428 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Utility functions for Conv2d tests.
+"""
+from cutlass_library import SubstituteTemplate
+import torch
+import cutlass_cppgen
+from cutlass_library import (
+    ConvKind,
+    ConvMode,
+    DataType,
+    DataTypeNames,
+    EpilogueScheduleSuffixes,
+    KernelScheduleSuffixes,
+    LayoutType,
+    OpcodeClassNames,
+    ShortDataTypeNames,
+    ShortLayoutTypeNames,
+    SplitKMode,
+)
+from cutlass_cppgen.shape import Conv2DProblemSize
+from cutlass_cppgen.utils.datatypes import numpy_type, torch_type
+from conv2d_problem_sizes import TestbedConv2dProblemSizes
+def get_name_conv2d(
+    arch,
+    conv_kind,
+    element,
+    element_accumulator,
+    element_output,
+    opclass,
+    threadblock_shape,
+    warp_count,
+    instruction_shape,
+    stages,
+    iterator_algorithm,
+    swizzle,
+    split_k_mode,
+    split_k_slices,
+    activation
+):
+    """
+    Generates a procedural name for a test case for conv2d
+    :param arch: compute capability of kernel being generated
+    :type arch: int
+    :param conv_kind: the convolution type (i.e. fprop, dgrad, wgrad)
+    :type conv_kind: str
+    :param iterator_algorithm: the iterator algorithm applied
+    :type iterator_algorithm: cutlass_library.library.IteratorAlgorithm
+    :param element_a: data type of operand A
+    :param element_b: data type of operand B
+    :param element_c: data type of operand C
+    :param element_accumulator: data type used in accumulation
+    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
+    :type opclass: cutlass_cppgen.OpcodeClass
+    :param threadblock_shape: indexable container of dimensions of threadblock tiles
+    :param stages: number of pipeline stages to use in the kernel
+    :type stages: int
+    :param stride_support: stride support of dgrad
+    :param alignment: int
+    :type alignment: int
+    :return: str
+    """
+    if iterator_algorithm is None:
+        iterator_algorithm = "AUTO"
+    if swizzle is None:
+        swizzle = 1
+    name_format = "test_SM${arch}_Device_Conv2d_${conv_kind}_${iter_alg}_ImplicitGemm_${eA}nhwc_${eB}nhwc_${eC}nhwc_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${wM}x${wN}x${wK}_${IM}${IN}${IK}_stage${stages}_swizzle${swizzle}_${split_k_mode}${split_k_slices}_${activation}"
+    return SubstituteTemplate(
+        name_format,
+        {
+            "arch": str(arch),
+            "conv_kind": conv_kind,
+            "iter_alg": iterator_algorithm,
+            "eA": DataTypeNames[element],
+            "eB": DataTypeNames[element],
+            "eC": DataTypeNames[element_output],
+            "opclass": opclass,
+            "acc": DataTypeNames[element_accumulator],
+            "tbM": str(threadblock_shape[0]),
+            "tbN": str(threadblock_shape[1]),
+            "tbK": str(threadblock_shape[2]),
+            "wM": str(threadblock_shape[0] // warp_count[0]),
+            "wN": str(threadblock_shape[1] // warp_count[1]),
+            "wK": str(threadblock_shape[2] // warp_count[2]),
+            "IM": str(instruction_shape[0]),
+            "IN": str(instruction_shape[1]),
+            "IK": str(instruction_shape[2]),
+            "stages": str(stages),
+            "swizzle": str(swizzle),
+            "split_k_mode": split_k_mode,
+            "split_k_slices": str(split_k_slices),
+            "activation": activation
+        }
+    )
+def conv2d_few_channel_problemsizes(channels):
+    problem_sizes = [
+        Conv2DProblemSize(
+            1, 8, 8, channels,
+            16, 3, 3, channels,
+            1, 1,
+            2, 2,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 16, 16, channels,
+            16, 3, 3, channels,
+            1, 1,
+            2, 2,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 16, 16, channels,
+            16, 7, 7, channels,
+            1, 1,
+            1, 1,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 224, 224, channels,
+            32, 7, 7, channels,
+            1, 1,
+            1, 1,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 224, 224, channels,
+            64, 7, 7, channels,
+            1, 1,
+            2, 2,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 224, 224, channels,
+            64, 5, 5, channels,
+            1, 1,
+            1, 1,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 224, 224, channels,
+            64, 5, 5, channels,
+            1, 1,
+            2, 2,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+    ]
+    return problem_sizes
+def validate_problem_size(ps, conv_kind, split_k_slices):
+    P = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
+    Q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
+    if P != ps.P or Q != ps.Q:
+        return False
+    # Split-K (serial or parallel) is not supported for strided dgrad
+    if conv_kind == "dgrad" and split_k_slices > 1 and (ps.stride_h > 1 or ps.stride_w > 1):
+        return False
+    return True
+class Conv2dLauncherFrontend:
+    def __init__(self, plan: cutlass_cppgen.Conv2d, seed: int = 80, backend="numpy"):
+        self.operation = plan
+        self.conv_kind = plan.conv_kind
+        self.seed = seed
+        self.backend = backend
+        self.dtype_A = plan._element_a
+        self.dtype_B = plan._element_b
+        self.dtype_C = plan._element_c
+        self.dtype_acc = plan._element_accumulator
+        self.layout_A = LayoutType.TensorNHWC
+        self.layout_B = LayoutType.TensorNHWC
+        self.layout_C = LayoutType.TensorNHWC
+        self.layout_D = LayoutType.TensorNHWC
+        self.element_compute = DataType.f32
+        if self.dtype_A in [cutlass_cppgen.DataType.f16, cutlass_cppgen.DataType.bf16]:
+            self.rand_max = 1
+        else:
+            self.rand_max = 4
+        self.activation = plan.activation
+    def uniform_init(self, size, dtype):
+        tensor = torch.ceil(
+            torch.empty(size=size, dtype=torch_type(dtype), device="cuda").uniform_(-self.rand_max - 0.5, self.rand_max - 0.5)
+        ).to(memory_format=torch.channels_last)
+        return tensor
+    def reference(self, ps, A, B, C, alpha, beta, activation):
+        if self.conv_kind == ConvKind.Fprop:
+            torch_result = alpha * torch.ops.aten.conv2d(
+                A,
+                B,
+                stride=(ps.stride_h, ps.stride_w),
+                padding=(ps.pad_h, ps.pad_w),
+                dilation=(ps.dilation_h, ps.dilation_w)
+            ) + beta * C
+        elif self.conv_kind == ConvKind.Dgrad:
+            torch_result = alpha * torch.nn.grad.conv2d_input(
+                (ps.N, ps.C, ps.H, ps.W),
+                B,
+                A,
+                padding=(ps.pad_h, ps.pad_w),
+                stride=(ps.stride_h, ps.stride_w)
+            ) + beta * C
+        elif self.conv_kind == ConvKind.Wgrad:
+            torch_result = alpha * torch.nn.grad.conv2d_weight(
+                B,
+                (ps.K, ps.C, ps.R, ps.S),
+                A,
+                padding=(ps.pad_h, ps.pad_w),
+                stride=(ps.stride_h, ps.stride_w)
+            ) + beta * C
+        else:
+            raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")
+        if activation == cutlass_cppgen.backend.epilogue.relu:
+            torch_result = torch.nn.functional.relu(torch_result)
+        elif activation == cutlass_cppgen.backend.epilogue.leaky_relu:
+            torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
+        return torch_result
+    def run(self, ps, split_k_mode=SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
+        if self.conv_kind == ConvKind.Fprop:
+            tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
+            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+            tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
+        elif self.conv_kind == ConvKind.Dgrad:
+            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+            tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
+        elif self.conv_kind == ConvKind.Wgrad:
+            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+            tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
+            tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
+        else:
+            raise Exception(f"Conv kind {self.conv_kind} is not supported")
+        torch.manual_seed(self.seed)
+        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
+        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
+        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
+        tensor_D = torch.zeros_like(tensor_C).to(memory_format=torch.channels_last)
+        args = self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
+            stride=(ps.stride_h, ps.stride_w),
+            padding=(ps.pad_h, ps.pad_w),
+            dilation=(ps.dilation_h, ps.dilation_w),
+            alpha=alpha, beta=beta,
+            split_k=(split_k_mode, split_k_slices))
+        args.sync()
+        tensor_D_ref = self.reference(ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation)
+        torch.cuda.synchronize()
+        passed = torch.allclose(tensor_D, tensor_D_ref, atol=2e-06)
+        return passed
+def add_test(
+    cls,
+    cc,
+    conv_kind,
+    problem_sizes,
+    element,
+    element_accumulator,
+    element_output,
+    opclass,
+    threadblock_shape,
+    warp_count,
+    instruction_shape,
+    stages,
+    iterator_algorithm=None,
+    swizzle=None,
+    split_k_mode="serial",
+    split_k_slices=1,
+    activation = "identity"
+):
+    """Create a test-running function with the given specification"""
+    test_name = get_name_conv2d(
+        cc, conv_kind, element, element_accumulator,
+        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
+        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)
+    def run(self):
+        # Create the plan
+        plan = cutlass_cppgen.Conv2d(
+            kind=conv_kind,
+            element=element,
+            element_accumulator=element_accumulator,
+            element_C=element_output,
+            element_D=element_output
+        )
+        # Set the opclass
+        plan.opclass = opclass
+        # Set the tile description
+        td = {
+            "threadblock_shape": threadblock_shape,
+            "warp_count": warp_count,
+            "stages": stages,
+            "instruction_shape": instruction_shape,
+        }
+        plan.tile_description = td
+        # Set iterator algorithm
+        if iterator_algorithm is not None:
+            plan.iterator_algorithm = iterator_algorithm
+        # Set swizzling functor
+        if swizzle is not None:
+            plan.swizzling_stride = swizzle
+        if activation != "identity":
+            if activation == "leaky_relu":
+                plan.activation = (cutlass_cppgen.epilogue.leaky_relu, 0.5)
+            else:
+                plan.activation = getattr(cutlass_cppgen.epilogue, activation)
+        conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="torch")
+        for ps in problem_sizes:
+            if not validate_problem_size(ps, conv_kind, split_k_slices):
+                continue
+            self.assertTrue(conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 2.0))
+    setattr(cls, test_name, run)
+    return run
+def get_conv_problems():
+    # 64: minimum channel size
+    conv_problems = TestbedConv2dProblemSizes(64).all
+    # Insert alignment 4 & 2 tests
+    conv_problems += [
+        Conv2DProblemSize(
+            1, 4, 4, 12,
+            8, 3, 3, 12,
+            0, 0,
+            3, 3,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 4, 4, 14,
+            8, 3, 3, 14,
+            0, 0,
+            3, 3,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+        Conv2DProblemSize(
+            1, 23, 56, 98,
+            128, 3, 3, 98,
+            4, 5,
+            3, 3,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        ),
+    ]
+    return conv_problems

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/conv2d/run_all_tests.py ADDED Viewed

	@@ -0,0 +1,44 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+import pathlib
+import unittest
+if __name__ == '__main__':
+    loader = unittest.TestLoader()
+    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
+    tests = loader.discover(script_dir, 'conv2d_*.py')
+    testRunner = unittest.runner.TextTestRunner()
+    results = testRunner.run(tests)
+    if not results.wasSuccessful():
+        raise Exception('Test cases failed')

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/emit/pytorch.py ADDED Viewed

	@@ -0,0 +1,309 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+"""
+Tests emitting a CUTLASS kernel to a PyTorch CUDA extension
+"""
+import random
+import tempfile
+import unittest
+from cutlass_library import ConvMode
+import cutlass_cppgen
+if cutlass_cppgen.utils.datatypes.is_torch_available():
+    import torch
+def _initialize(dtype, M: int, N: int, K: int):
+    """
+    Utility function to initialize A, B, C, and D matrices corresponding to dimensions M, N, and K
+    :param dtype: data type of tensors
+    :param M: M dimension of GEMM problem
+    :type M: int
+    :param N: N dimension of GEMM problem
+    :type N: int
+    :param K: N dimension of GEMM problem
+    :type K: int
+    :return: initialized tensors A, B, C, and D
+    :rtype: list
+    """
+    sizes = [(M, K), (K, N), (M, N), (M, N)]
+    return [torch.randint(-3, 3, size, device='cuda').to(dtype) for size in sizes]
+def _generate_problems(dtype, num):
+    """
+    Utility function to generate `num` GEMMs of random sizes
+    :param dtype: data type of tensors
+    :param num: number of GEMMs to generate
+    :type num: int
+    :return: lists of A, B, C, and D tensors
+    :rtype: list
+    """
+    valid_sizes = [128, 256, 512, 1024]
+    As, Bs, Cs, Ds = [], [], [], []
+    for _ in range(num):
+        M, N, K = [random.choice(valid_sizes) for _ in range(3)]
+        A, B, C, D = _initialize(dtype, M, N, K)
+        As.append(A)
+        Bs.append(B)
+        Cs.append(C)
+        Ds.append(D)
+    return As, Bs, Cs, Ds
+def _generate_conv2d_problem(conv_kind, dtype, ps):
+    """
+    Utility function to generate conv2d inputs
+    :param conv_kind: kind of convolution
+    :type conv_kind: str
+    :param dtype: data type of tensors
+    :param problem_size: the conv2d problem size
+    :type problem_size: cutlass_cppgen.shape.Conv2DProblemSize
+    :return: initialized tensors A, B, C, and D
+    :rtype: list
+    """
+    if conv_kind == "fprop":
+        tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
+        tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+        tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
+    elif conv_kind == "dgrad":
+        tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+        tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
+        tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
+    else:
+        tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
+        tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
+        tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
+    sizes = [tensor_A_size, tensor_B_size, tensor_C_size]
+    return [torch.ceil(torch.empty(size, dtype=dtype, device='cuda').uniform_(-4.5, 3.5)).to(memory_format=torch.channels_last) for size in sizes]
+@unittest.skipIf(not cutlass_cppgen.utils.datatypes.is_torch_available(), 'PyTorch must be available to run PyTorch extension tests')
+class PyTorchExtensionTest(unittest.TestCase):
+    def test_gemm(self):
+        random.seed(2023)
+        dtype = torch.float16
+        plan = cutlass_cppgen.op.Gemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor)
+        op = plan.construct()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mod = cutlass_cppgen.emit.pytorch(op, name='gemm_mod', cc=plan.cc, sourcedir=tmpdir, jit=True)
+        A, B, C, _ = _initialize(dtype, 1024, 256, 512)
+        D_ref = A @ B
+        D = mod.run(A, B)
+        assert torch.allclose(D, D_ref)
+        D = mod.run(A, B, C)
+        assert torch.allclose(D, D_ref)
+        D = mod.run(A, B, C, 1.0)
+        assert torch.allclose(D, D_ref)
+        D = mod.run(A, B, C, 1.0, 0.0)
+        assert torch.allclose(D, D_ref)
+        alpha = 2.0
+        beta = -1.0
+        D_ref = (A @ B) * alpha + (beta * C)
+        D = mod.run(A, B, C, alpha, beta)
+        assert torch.allclose(D, D_ref)
+    def test_grouped_gemm(self):
+        random.seed(2023)
+        dtype = torch.float16
+        plan = cutlass_cppgen.op.GroupedGemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor)
+        op = plan.construct()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mod = cutlass_cppgen.emit.pytorch(op, name='grouped_gemm_mod', cc=plan.cc, sourcedir=tmpdir, jit=True)
+        As, Bs, Cs, _ = _generate_problems(dtype, 50)
+        def check_all(X, Y):
+            for x, y in zip(X, Y):
+                assert torch.allclose(x, y)
+        Ds_ref = [a @ b for a, b in zip(As, Bs)]
+        Ds = mod.run(As, Bs)
+        check_all(Ds, Ds_ref)
+        Ds = mod.run(As, Bs, Cs)
+        check_all(Ds, Ds_ref)
+        Ds = mod.run(As, Bs, Cs, 1.0)
+        check_all(Ds, Ds_ref)
+        Ds = mod.run(As, Bs, Cs, 1.0, 0.0)
+        check_all(Ds, Ds_ref)
+        alpha = 2.0
+        beta = -1.0
+        Ds_ref = [(a @ b) * alpha + (beta * c) for a, b, c in zip(As, Bs, Cs)]
+        Ds = mod.run(As, Bs, Cs, alpha, beta)
+        check_all(Ds, Ds_ref)
+    def test_conv2d_fprop(self):
+        torch.manual_seed(2023)
+        dtype = torch.float16
+        plan = cutlass_cppgen.op.Conv2d(kind="fprop", element=dtype, element_accumulator=torch.float32)
+        plan.activation = "relu"
+        op = plan.construct()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mod = cutlass_cppgen.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
+        problem_size = cutlass_cppgen.shape.Conv2DProblemSize(
+            1, 4, 4, 16,
+            8, 3, 3, 16,
+            0, 0,
+            3, 3,
+            1, 1
+        )
+        A, B, C = _generate_conv2d_problem("fprop", dtype, problem_size)
+        stride = (problem_size.stride_h, problem_size.stride_w)
+        padding = (problem_size.pad_h, problem_size.pad_w)
+        alpha = 1.0
+        beta = 0.5
+        D_ref = alpha * torch.ops.aten.conv2d(
+            A, B, stride=stride, padding=padding
+        ) + beta * C
+        D_ref = torch.nn.functional.relu(D_ref)
+        D = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta)
+        assert torch.allclose(D, D_ref)
+        # Test serial split-K
+        D_serial_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
+        assert torch.allclose(D, D_serial_split_k)
+        # Test parallel split-K
+        D_parallel_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
+        assert torch.allclose(D, D_parallel_split_k)
+    def test_conv2d_dgrad(self):
+        torch.manual_seed(2023)
+        dtype = torch.float16
+        plan = cutlass_cppgen.op.Conv2d(kind="dgrad", element=dtype, element_accumulator=torch.float32)
+        op = plan.construct()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mod = cutlass_cppgen.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
+        problem_size = cutlass_cppgen.shape.Conv2DProblemSize(
+            1, 4, 4, 16,
+            8, 3, 3, 16,
+            0, 0,
+            3, 3,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        )
+        A, B, C = _generate_conv2d_problem("dgrad", dtype, problem_size)
+        stride = (problem_size.stride_h, problem_size.stride_w)
+        padding = (problem_size.pad_h, problem_size.pad_w)
+        alpha = 1.0
+        beta = 0.5
+        input_size = (problem_size.N, problem_size.C, problem_size.H, problem_size.W)
+        D_ref = alpha * torch.nn.grad.conv2d_input(
+            input_size, B, A,
+            stride=stride, padding=padding
+        ) + beta * C
+        D = mod.run(input_size, A, B, C, stride, padding, alpha=alpha, beta=beta, )
+        assert torch.allclose(D, D_ref)
+    def test_conv2d_wgrad(self):
+        torch.manual_seed(2023)
+        dtype = torch.float16
+        plan = cutlass_cppgen.op.Conv2d(kind="wgrad", element=dtype, element_accumulator=torch.float32)
+        op = plan.construct()
+        with tempfile.TemporaryDirectory() as tmpdir:
+            mod = cutlass_cppgen.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
+        problem_size = cutlass_cppgen.shape.Conv2DProblemSize(
+            1, 4, 4, 16,
+            8, 3, 3, 16,
+            0, 0,
+            3, 3,
+            1, 1,
+            ConvMode.CrossCorrelation,
+            1, 1
+        )
+        A, B, C = _generate_conv2d_problem("wgrad", dtype, problem_size)
+        stride = (problem_size.stride_h, problem_size.stride_w)
+        padding = (problem_size.pad_h, problem_size.pad_w)
+        alpha = 1.0
+        beta = 0.5
+        weight_size = (problem_size.K, problem_size.C, problem_size.R, problem_size.S)
+        D_ref = alpha * torch.nn.grad.conv2d_weight(
+            B, weight_size, A,
+            stride=stride, padding=padding
+        ) + beta * C
+        D = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta)
+        assert torch.allclose(D, D_ref)
+        # Test serial split-K
+        D_serial_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
+        assert torch.allclose(D, D_serial_split_k)
+        # Test parallel split-K
+        D_parallel_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
+        assert torch.allclose(D, D_parallel_split_k)
+if __name__ == '__main__':
+    unittest.main()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_compute_sm80_90.py ADDED Viewed

	@@ -0,0 +1,198 @@

+################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+"""
+Unit tests for EVT compute nodes on SM80 through SM90 (CC 80, 86, 89, 90)
+"""
+import logging
+import unittest
+import cutlass_cppgen
+from cutlass_cppgen.backend import *
+from cutlass_cppgen.epilogue import *
+from cutlass_cppgen import swizzle
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+cutlass_cppgen.set_log_level(logging.WARNING)
+@unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
+class TestEVTCompute(EVTTestCaseBase):
+    def test_arith(self):
+        """
+        Test arithmetic ops: D = ((accum + C) * alpha - gamma) / beta
+        """
+        # NOTE(review): the nested function is handed to EVTTestBed as-is and is
+        # presumably traced from its source -- keep its body unchanged; confirm.
+        def evt_arith_compute(accum, C, alpha, beta, gamma):
+            D = ((accum + C) * alpha - gamma) / beta
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            # accum, C and D are fake tensors of shape (l, m, n); alpha, beta and
+            # gamma are Python float scalars.
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.5,
+                "beta": 0.5,
+                "gamma": 2.5,
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+            launcher = EVTTestBed(self.element, evt_arith_compute, example_inputs)
+            # "accum" is deliberately not listed among the input keys.
+            input_keys = ["C", "alpha", "beta", "gamma"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_func_call(self):
+        """
+        Test function calls: D = multiply_add(relu(accum + alpha) + C, beta, gamma)
+        """
+        # NOTE(review): the nested function is handed to EVTTestBed as-is and is
+        # presumably traced from its source -- keep its body unchanged; confirm.
+        def evt_func_call(accum, C, alpha, beta, gamma):
+            D = multiply_add(relu(accum + alpha) + C, beta, gamma)
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            # accum, C and D are fake tensors of shape (l, m, n); alpha, beta and
+            # gamma are Python float scalars.
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.5,
+                "beta": 0.5,
+                "gamma": 2.5,
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+            launcher = EVTTestBed(self.element, evt_func_call, example_inputs)
+            # "accum" is deliberately not listed among the input keys.
+            input_keys = ["C", "alpha", "beta", "gamma"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_func_call2(self):
+        """
+        Test Function call
+        """
+        def evt_func_call2(accum, C, alpha, beta):
+            D = maximum(alpha * accum + beta * C, 0.0)
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.5,
+                "beta": 0.5,
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+            launcher = EVTTestBed(self.element, evt_func_call2, example_inputs)
+            input_keys = ["C", "alpha", "beta"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_tanh(self):
+        """
+        Test Tanh op
+        """
+        def evt_tanh(accum):
+            D = tanh(accum)
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+            launcher = EVTTestBed(self.element, evt_tanh, example_inputs)
+            input_keys = []
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_sigmoid(self):
+        """
+        Test Sigmoid op
+        """
+        def evt_sigmoid(accum):
+            D = sigmoid(accum)
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+            launcher = EVTTestBed(self.element, evt_sigmoid, example_inputs)
+            input_keys = []
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_gelu(self):
+        """
+        Test GELU op
+        """
+        def evt_gelu(accum):
+            D = gelu(accum)
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+            launcher = EVTTestBed(self.element, evt_gelu, example_inputs)
+            input_keys = []
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_exp(self):
+        """
+        Test Exp op
+        """
+        def evt_exp(accum):
+            D = exp(accum)
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n))
+            }
+            launcher = EVTTestBed(self.element, evt_exp, example_inputs)
+            input_keys = []
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+# Run the compute-node tests when this module is executed as a script.
+if __name__ == '__main__':
+    unittest.main()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_layout_sm80_90.py ADDED Viewed

	@@ -0,0 +1,173 @@

+################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+"""
+Unit test for layout nodes (permute, reshape) on SM80-SM90
+"""
+import logging
+import unittest
+import cutlass_cppgen
+from cutlass_cppgen.backend import *
+from cutlass_cppgen.epilogue import *
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+cutlass_cppgen.set_log_level(logging.WARNING)
+@unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
+class TestEVTLayout(EVTTestCaseBase):
+    def test_permute_1(self):
+        """
+        Returning a tensor with shape [m, n]
+        """
+        def evt_permute(accum, alpha, C):
+            F = alpha * accum
+            F_permute = permute(F, indices=(0, 2, 1))
+            D_permute = F_permute + permute(C, indices=(0, 2, 1))
+            D = permute(D_permute, indices=(0, 2, 1))
+            return D, F
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    @unittest.skipIf(device_cc() != 90, "This unittest is for cc = Sm90 only")
+    def test_permute_2(self):
+        """
+        Returning a tensor with shape [m, n]
+        """
+        def evt_permute(accum, alpha, C):
+            F = alpha * accum
+            F_permute = permute(F, indices=(0, 2, 1))
+            D = F_permute + C
+            return D, F
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "C": self.fake_tensor(self.element, (l, n, m)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, n, m)),
+            }
+            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    @unittest.skipIf(device_cc() != 90, "This unittest is for cc = Sm90 only")
+    def test_permute_3(self):
+        """
+        Returning a tensor with shape [m, n]
+        """
+        def evt_permute(accum, alpha, C):
+            F = alpha * accum
+            F_permute = permute(F, indices=(1, 0, 2))
+            D = F_permute + C
+            return D, F
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "C": self.fake_tensor(self.element, (m, l, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (m, l, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_reshape(self):
+        """
+        Test reshape
+        """
+        def evt_reshape(accum, alpha, TensorE):
+            F = alpha * accum
+            E_reshape = reshape(TensorE, new_shape=(512, 1))
+            D = F + E_reshape
+            return D
+        example_inputs = {
+            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
+            "alpha": 0.5,
+            "TensorE": self.fake_tensor(self.element, (16, 32)),
+            "D": self.fake_tensor(self.element, (self.l, self.m, self.n)),
+        }
+        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
+        input_keys = ["alpha", "TensorE"]
+        result_keys = ["D"]
+        launcher.verify(self.problem_size, input_keys, result_keys, self.l)
+    def test_reshape2(self):
+        """
+        Test reshape
+        """
+        def evt_reshape(accum, alpha, TensorE):
+            F = alpha * accum
+            F_reshape = reshape(F, new_shape=(2, 3, 512, 256))
+            D = F_reshape + TensorE
+            return D
+        example_inputs = {
+            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
+            "alpha": 0.5,
+            "TensorE": self.fake_tensor(self.element, (2, 3, 1, self.n)),
+            "D": self.fake_tensor(self.element, (2, 3, self.m, self.n)),
+        }
+        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
+        input_keys = ["alpha", "TensorE"]
+        result_keys = ["D"]
+        launcher.verify(self.problem_size, input_keys, result_keys, self.l)
+# Run the layout-node tests when this module is executed as a script.
+if __name__ == '__main__':
+    unittest.main()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_load_sm80_90.py ADDED Viewed

	@@ -0,0 +1,142 @@

+################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+"""
+Unit test for load nodes in SM90
+"""
+import logging
+import unittest
+import cutlass_cppgen
+from cutlass_cppgen.backend import *
+from cutlass_cppgen.epilogue import *
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+cutlass_cppgen.set_log_level(logging.WARNING)
+@unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
+class TestEVTLoad(EVTTestCaseBase):
+    def test_tensor_load(self):
+        """
+        Load extra tensor with shape [m, n]
+        """
+        def evt_tensor_load(accum, C, aux, aux_batch):
+            D = accum + C + aux + aux_batch
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "aux": self.fake_tensor(self.element, (m, n)),
+                "aux_batch": self.fake_tensor(np.float32, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_tensor_load, example_inputs)
+            input_keys = ["C", "aux", "aux_batch"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_row_broadcast(self):
+        """
+        Load extra tensor with shape [1, n]
+        """
+        def evt_row_broadcast(accum, C, bias, bias_batch):
+            D = accum + C + bias + bias_batch
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "bias": self.fake_tensor(self.element, (n,)),
+                "bias_batch": self.fake_tensor(np.float32, (l, 1, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_row_broadcast, example_inputs)
+            input_keys = ["C", "bias", "bias_batch"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_column_broadcast(self):
+        """
+        Load extra tensor with shape [m, 1]
+        """
+        def evt_column_broadcast(accum, C, bias, bias_batch):
+            D = accum + C + bias + bias_batch
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "bias": self.fake_tensor(self.element, (m, 1)),
+                "bias_batch": self.fake_tensor(np.float32, (l, m, 1)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_column_broadcast, example_inputs)
+            input_keys = ["C", "bias", "bias_batch"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_scalar_broadcast(self):
+        """
+        Load extra tensor with shape [1, 1]
+        """
+        def evt_scalar_broadcast(accum, C, alpha, alpha_batch):
+            D = accum + C + alpha + alpha_batch
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "alpha_batch": self.fake_tensor(np.float32, (l, 1, 1)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_scalar_broadcast, example_inputs)
+            input_keys = ["C", "alpha", "alpha_batch"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+# Run the load-node tests when this module is executed as a script.
+if __name__ == '__main__':
+    unittest.main()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_mixed_sm80_90.py ADDED Viewed

	@@ -0,0 +1,319 @@

+################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+"""
+Unittest for mixed types of nodes in SM90
+"""
+import logging
+import unittest
+import cutlass_cppgen
+from cutlass_cppgen.backend import *
+from cutlass_cppgen.epilogue import *
+from cutlass_cppgen.swizzle import ThreadblockSwizzleStreamK
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+cutlass_cppgen.set_log_level(logging.WARNING)
+@unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
+class TestEVTMixed(EVTTestCaseBase):
+    def test_same_variable_used_multiple_times(self):
+        """
+        The same variable z0 is used multiple times
+        """
+        def evt_aux_store(accum):
+            z0 = relu(accum)
+            D = z0 + z0
+            return z0, D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+                "z0": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
+            input_keys = ["accum"]
+            result_keys = ["z0", "D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_no_lca(self):
+        """
+        The same variable z0 is used multiple times
+        """
+        def evt_no_lca(accum, bias):
+            E = relu(accum)
+            F = E + bias
+            tmp_2 = E + 2
+            D = tmp_2 + E
+            return D
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+                "bias": self.fake_tensor(self.element, (m,1), stride=(1,0)),
+            }
+            launcher = EVTTestBed(self.element, evt_no_lca, example_inputs)
+            input_keys = ["accum", "bias"]
+            result_keys = ["D"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_mixed_dag(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+        if device_cc() == 80:
+            alignments = [2, 4, 8]
+        else:
+            # Sm90 EVT currently only supports 128-bit alignment
+            alignments = [8,]
+        for align in alignments:
+            for m, n, k, l in self.get_problem_sizes(align):
+                example_inputs = {
+                    "accum": self.fake_tensor(self.element, (l, m, n)),
+                    "alpha": 1.0,
+                    "C": self.fake_tensor(self.element, (l, m, n)),
+                    "beta": 1.0,
+                    "aux": self.fake_tensor(self.element, (l, m, n)),
+                    "cbias": self.fake_tensor(self.element, (m, 1)),
+                    "rbias": self.fake_tensor(self.element, (n,)),
+                    "D": self.fake_tensor(self.element, (l, m, n)),
+                    "F": self.fake_tensor(self.element, (l, m, n)),
+                    "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                    "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+                }
+                launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs)
+                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+                result_keys = ["D", "F", "F_row_max", "E_col_max"]
+                launcher.verify((m, n, k), input_keys, result_keys, l)
+    @unittest.skipIf(device_cc() not in [80, 89], "This unittest is for cc 80 and 89 only")
+    def test_mixed_dag_float(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+        for align in [3, 2, 4]:
+            for m, n, k, l in self.get_problem_sizes(align):
+                example_inputs = {
+                    "accum": self.fake_tensor(np.float32, (l, m, n)),
+                    "alpha": 1.0,
+                    "C": self.fake_tensor(np.float32, (l, m, n)),
+                    "beta": 1.0,
+                    "aux": self.fake_tensor(np.float32, (l, m, n)),
+                    "cbias": self.fake_tensor(np.float32, (m, 1)),
+                    "rbias": self.fake_tensor(np.float32, (n,)),
+                    "D": self.fake_tensor(np.float32, (l, m, n)),
+                    "F": self.fake_tensor(np.float32, (l, m, n)),
+                    "F_row_max": self.fake_tensor(np.float32, (n,)),
+                    "E_col_max": self.fake_tensor(np.float32, (m, 1))
+                }
+                launcher = EVTTestBed(DataType.f32, evt_mixed_dag, example_inputs)
+                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+                result_keys = ["D", "F", "F_row_max", "E_col_max"]
+                launcher.verify((m, n, k), input_keys, result_keys, l)
+    @unittest.skipIf(device_cc() not in [80, 89], "This unittest is for cc 80 and 89 only")
+    def test_mixed_dag_stage2(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "beta": 1.0,
+                "aux": self.fake_tensor(self.element, (l, m, n)),
+                "cbias": self.fake_tensor(self.element, (m, 1)),
+                "rbias": self.fake_tensor(self.element, (n,)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+            }
+            launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, epilogue_stages=2)
+            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+            result_keys = ["D", "F", "F_row_max", "E_col_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    @unittest.skipIf(device_cc() not in [80, 89], "This unittest is for cc 80 and 89 only")
+    def test_mixed_dag_partition_k(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 1.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "beta": 1.0,
+                "aux": self.fake_tensor(self.element, (l, m, n)),
+                "cbias": self.fake_tensor(self.element, (m, 1)),
+                "rbias": self.fake_tensor(self.element, (n,)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+            }
+            tile_description = {
+                "threadblock_shape": [128, 128, 64],
+                "warp_count": [2, 2, 2]
+            }
+            launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, tile_description=tile_description, epilogue_stages=2)
+            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+            result_keys = ["D", "F", "F_row_max", "E_col_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    @unittest.skipIf(device_cc() not in [80, 89], "This unittest is for cc 80 and 89 only")
+    def test_mixed_dag_stream_k(self):
+        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+        # High per-sm occupancy tile_description
+        tile_description = {
+            "threadblock_shape": [128, 128, 32],
+            "warp_count": [2, 2, 1],
+            "stages": 3
+        }
+        tds = [None, tile_description]
+        for td in tds:
+            for m, n, k, l in self.get_problem_sizes(8, k=960, batch_count=[1, 3]):
+                if l == 1:
+                    example_inputs = {
+                        "accum": self.fake_tensor(self.element, (m, n)),
+                        "alpha": 1.0,
+                        "C": self.fake_tensor(self.element, (m, n)),
+                        "beta": 1.0,
+                        "aux": self.fake_tensor(self.element, (m, n)),
+                        "cbias": self.fake_tensor(self.element, (m, 1)),
+                        "rbias": self.fake_tensor(self.element, (n,)),
+                        "D": self.fake_tensor(self.element, (m, n)),
+                        "F": self.fake_tensor(self.element, (m, n)),
+                        "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                        "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+                    }
+                else:
+                    example_inputs = {
+                        "accum": self.fake_tensor(self.element, (l, m, n)),
+                        "alpha": 1.0,
+                        "C": self.fake_tensor(self.element, (l, m, n)),
+                        "beta": 1.0,
+                        "aux": self.fake_tensor(self.element, (l, m, n)),
+                        "cbias": self.fake_tensor(self.element, (m, 1)),
+                        "rbias": self.fake_tensor(self.element, (n,)),
+                        "D": self.fake_tensor(self.element, (l, m, n)),
+                        "F": self.fake_tensor(self.element, (l, m, n)),
+                        "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                        "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+                    }
+                if td is not None:
+                    launcher = EVTTestBed(
+                        self.element, evt_mixed_dag, example_inputs,
+                        tile_description=td,
+                        swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
+                else:
+                    launcher = EVTTestBed(
+                        self.element, evt_mixed_dag, example_inputs,
+                        swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
+                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+                result_keys = ["D", "F", "F_row_max", "E_col_max"]
+                launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_mixed_dag_no_batch(self):
+        def evt_mixed_dag_no_batch(accum, alpha, C, beta, aux, cbias, rbias):
+            F = alpha * accum + (beta * C + aux)
+            F_row_max = max(F, dim=[0, 1])
+            E = relu(F + 1) + cbias + rbias
+            E_col_max = max(E, dim=[0, 2])
+            D = E + F
+            return D, F, F_row_max, E_col_max
+        for m, n, k, _ in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (m, n)),
+                "alpha": 1.0,
+                "C": self.fake_tensor(self.element, (m, n)),
+                "beta": 1.0,
+                "aux": self.fake_tensor(self.element, (m, n)),
+                "cbias": self.fake_tensor(self.element, (m, 1)),
+                "rbias": self.fake_tensor(self.element, (n,)),
+                "D": self.fake_tensor(self.element, (m, n)),
+                "F": self.fake_tensor(self.element, (m, n)),
+                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
+                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
+            }
+            launcher = EVTTestBed(self.element, evt_mixed_dag_no_batch, example_inputs)
+            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
+            result_keys = ["D", "F", "F_row_max", "E_col_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, 1)
+# Run the mixed-node tests when this module is executed as a script.
+if __name__ == '__main__':
+    unittest.main()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/evt_store_sm80_90.py ADDED Viewed

	@@ -0,0 +1,180 @@

+################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+"""
+Unit tests for EVT store nodes on SM80/86/89/90
+"""
+import logging
+import unittest
+import cutlass_cppgen
+# NOTE: device_cc and np are used further down without an explicit import;
+# given the imports in this file they can only come from these star imports.
+from cutlass_cppgen.backend import *
+from cutlass_cppgen.epilogue import *
+from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
+# Runs at import time: pass logging.WARNING as the cutlass_cppgen log level.
+cutlass_cppgen.set_log_level(logging.WARNING)
+@unittest.skipIf(device_cc() not in [80, 86, 89, 90], "This unittest is only supported on CC [80, 86, 89, 90]")
+class TestEVTStore(EVTTestCaseBase):
+    """
+    EVT tests whose epilogue functions return outputs in addition to D:
+    a full-size auxiliary tensor and row / column / scalar max-reductions.
+
+    Every test builds an ``example_inputs`` dict keyed by the names used in
+    its nested epilogue function, constructs an EVTTestBed from it and, for
+    each problem size from ``get_problem_sizes(8)``, calls ``verify``.
+    "accum" appears in ``example_inputs`` but never in ``input_keys``;
+    presumably the testbed obtains it from the GEMM -- confirm in EVTTestBed.
+    """
+    @unittest.skipIf(device_cc() != 90, "This test is only for CC 90")
+    def test_invalid_store(self):
+        """
+        Constructing the testbed must raise RuntimeError when D is consumed
+        by another node of the epilogue (run on CC 90 only).
+        """
+        def evt_invalid_store(accum):
+            D = accum
+            F = D + 1 # D has users, which is not allowed on SM90 or higher
+            return D, F
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n))
+            }
+            # The error is expected from EVTTestBed construction itself;
+            # `launcher` is never used afterwards and verify() is never called.
+            with self.assertRaisesRegex(
+                    RuntimeError,
+                    r"On SM90 or higher, D is expected to be a output node with 0 users "
+                    r"to enable smem reuse between C and D, but got 1"
+                ):
+                launcher = EVTTestBed(self.element, evt_invalid_store, example_inputs)
+            break  # Only need to test once
+    def test_aux_store(self):
+        """
+        Return the intermediate F = alpha * accum as a second output next to
+        D = F + C; both are declared with the full (l, m, n) shape.
+        """
+        def evt_aux_store(accum, alpha, C):
+            F = alpha * accum
+            D = F + C
+            return D, F
+        for m, n, k, l in self.get_problem_sizes(8):
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 0.5,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "F": self.fake_tensor(self.element, (l, m, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_col_reduce(self):
+        """
+        Max-reduction [m, n] -> [m, 1]
+
+        acc_row_max reduces dim 2 only and is declared as (l, m, 1);
+        F_row_max reduces dims 0 and 2 and is declared as (m, 1).
+        """
+        # NOTE(review): the helper is named evt_row_reduce although the test is
+        # test_col_reduce (and the reverse holds in test_row_reduce).
+        def evt_row_reduce(accum, alpha, C):
+            acc_row_max = max(accum, dim=[2,])
+            F = alpha * accum
+            F_row_max = max(F, dim=[0, 2])
+            D = F + C
+            return D, F_row_max, acc_row_max
+        for m, n, k, l in self.get_problem_sizes(8):
+            # Reduction outputs are declared as np.float32; the remaining
+            # tensors use self.element.
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 2.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "F_row_max": self.fake_tensor(np.float32, (m, 1)),
+                "acc_row_max": self.fake_tensor(np.float32, (l, m, 1)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_row_reduce, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F_row_max", "acc_row_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_row_reduce(self):
+        """
+        Max-reduction [m, n] -> [n]
+
+        acc_col_max reduces dim 1 only and is declared as (l, 1, n);
+        F_col_max reduces dims 0 and 1 and is declared as (n,).
+        """
+        # NOTE(review): the helper is named evt_col_reduce although the test is
+        # test_row_reduce (and the reverse holds in test_col_reduce).
+        def evt_col_reduce(accum, alpha, C):
+            acc_col_max = max(accum, dim=[1,])
+            F = alpha * accum
+            F_col_max = max(F, dim=[0, 1])
+            D = F + C
+            return D, F_col_max, acc_col_max
+        for m, n, k, l in self.get_problem_sizes(8):
+            # Reduction outputs are declared as np.float32; the remaining
+            # tensors use self.element.
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 2.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "F_col_max": self.fake_tensor(np.float32, (n,)),
+                "acc_col_max": self.fake_tensor(np.float32, (l, 1, n)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_col_reduce, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F_col_max", "acc_col_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+    def test_scalar_reduce(self):
+        """
+        Max-reduction [m, n] -> [1,]
+
+        acc_max reduces dims 1 and 2 and is declared as (l, 1, 1);
+        F_max reduces dims 0, 1 and 2 and is declared as (1,).
+        """
+        def evt_scalar_reduce(accum, alpha, C):
+            acc_max = max(accum, dim=[1, 2])
+            F = alpha * accum
+            F_max = max(F, dim=[0, 1, 2])
+            D = F + C
+            return D, F_max, acc_max
+        for m, n, k, l in self.get_problem_sizes(8):
+            # Reduction outputs are declared as np.float32; the remaining
+            # tensors use self.element.
+            example_inputs = {
+                "accum": self.fake_tensor(self.element, (l, m, n)),
+                "alpha": 2.0,
+                "C": self.fake_tensor(self.element, (l, m, n)),
+                "acc_max": self.fake_tensor(np.float32, (l, 1, 1)),
+                "F_max": self.fake_tensor(np.float32, (1,)),
+                "D": self.fake_tensor(self.element, (l, m, n)),
+            }
+            launcher = EVTTestBed(self.element, evt_scalar_reduce, example_inputs)
+            input_keys = ["C", "alpha"]
+            result_keys = ["D", "F_max", "acc_max"]
+            launcher.verify((m, n, k), input_keys, result_keys, l)
+if __name__ == '__main__':
+    unittest.main()

build/torch212-cxx11-cu132-x86_64-linux/include/third-party/cutlass/test/python/cutlass/evt/run_all_tests.py ADDED Viewed

	@@ -0,0 +1,44 @@

+#################################################################################################
+#
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+import pathlib
+import unittest
+if __name__ == '__main__':
+    loader = unittest.TestLoader()
+    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
+    tests = loader.discover(script_dir, 'evt_*.py')
+    testRunner = unittest.runner.TextTestRunner()
+    results = testRunner.run(tests)
+    if not results.wasSuccessful():
+        raise Exception('Test cases failed')