Kernels
danieldk HF Staff commited on
Commit
3b163b3
·
verified ·
1 Parent(s): 71f0dbc

Build uploaded using `kernels` (batch 2/10).

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/_mlir_helpers/op.py +34 -0
  3. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/ast_helpers.py +616 -0
  4. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/ast_preprocessor.py +1958 -0
  5. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/cache_helpers.py +153 -0
  6. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/common.py +268 -0
  7. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/compiler.py +288 -0
  8. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/dsl.py +1686 -0
  9. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/env_manager.py +320 -0
  10. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/jit_executor.py +357 -0
  11. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/__init__.py +25 -0
  12. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/cuda.py +476 -0
  13. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/device_tensor.py +121 -0
  14. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/dlpack_types.py +76 -0
  15. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py +188 -0
  16. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py +201 -0
  17. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/typing.py +1962 -0
  18. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/utils/__init__.py +19 -0
  19. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/utils/logger.py +81 -0
  20. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/utils/stacktrace.py +165 -0
  21. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/utils/timer.py +56 -0
  22. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/__init__.py +59 -0
  23. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/__init__.py +319 -0
  24. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/__init__.py +101 -0
  25. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/elect.py +84 -0
  26. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/mbar.py +349 -0
  27. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py +681 -0
  28. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/smem.py +108 -0
  29. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/tmem.py +142 -0
  30. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/core.py +0 -0
  31. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/math.py +445 -0
  32. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/__init__.py +26 -0
  33. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/common.py +189 -0
  34. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py +39 -0
  35. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py +471 -0
  36. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py +341 -0
  37. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/helpers.py +249 -0
  38. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py +62 -0
  39. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py +663 -0
  40. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py +328 -0
  41. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py +1041 -0
  42. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py +25 -0
  43. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py +189 -0
  44. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py +83 -0
  45. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py +29 -0
  46. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py +109 -0
  47. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py +405 -0
  48. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/runtime.py +510 -0
  49. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/testing.py +610 -0
  50. build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/typing.py +207 -0
.gitattributes CHANGED
@@ -12,3 +12,4 @@ build/torch29-cxx11-cu128-x86_64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lf
12
  build/torch29-cxx11-cu129-x86_64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
13
  build/torch29-cxx11-cu130-x86_64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
14
  build/torch210-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
 
 
12
  build/torch29-cxx11-cu129-x86_64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
13
  build/torch29-cxx11-cu130-x86_64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
14
  build/torch210-cxx11-cu126-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
15
+ build/torch210-cxx11-cu128-aarch64-linux/_deep_gemm_cuda_a68a39f.abi3.so filter=lfs diff=lfs merge=lfs -text
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/_mlir_helpers/op.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides MLIR's OP helper functions
14
+ """
15
+
16
+
17
+ import inspect
18
+ from functools import wraps
19
+
20
+ from ..._mlir import ir
21
+
22
+
23
+ def dsl_user_op(opFunc):
24
+ @wraps(opFunc)
25
+ def wrapper(*args, **kwargs):
26
+ loc = kwargs.pop("loc", None)
27
+ if loc is None:
28
+ frame = inspect.currentframe().f_back
29
+ file_loc = ir.Location.file(frame.f_code.co_filename, frame.f_lineno, 0)
30
+ loc = ir.Location.name(frame.f_code.co_name, childLoc=file_loc)
31
+ res_or_list = opFunc(*args, **kwargs, loc=loc)
32
+ return res_or_list
33
+
34
+ return wrapper
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/ast_helpers.py ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides helper functions that are generated by the preprocessor.
14
+ The preprocessor read through python's ast and changes the input code.
15
+ """
16
+
17
+ from typing import Callable, Iterator, Optional, overload
18
+ from typing_extensions import deprecated
19
+ import warnings
20
+ import inspect
21
+ from types import BuiltinFunctionType
22
+ from functools import lru_cache
23
+ from inspect import getmembers
24
+
25
+ from .utils.logger import log
26
+ from .common import *
27
+
28
+ from ._mlir_helpers.arith import ArithValue
29
+
30
+
31
+ class Executor:
32
+ """
33
+ The Executor class handles dynamic and compile-time (constexpr) execution
34
+ of "for" loops and "if-else-elif" statements.
35
+
36
+ Methods:
37
+ set_functions: Assigns the functions for checking loop bounds and
38
+ conditional evaluation.
39
+
40
+ for_execute: Generates MLIR for OP
41
+ while_execute: Generates MLIR while OP
42
+ if_execute: generate MLIR if OP
43
+ """
44
+
45
+ def __init__(self):
46
+ self._is_dynamic_expression = None
47
+ self._loop_execute_range_dynamic = None
48
+ self._if_dynamic = None
49
+ self._while_dynamic = None
50
+ self._compare_executor = None
51
+ self._any_executor = None
52
+ self._all_executor = None
53
+ self._builtin_redirector = None
54
+
55
+ def set_functions(
56
+ self,
57
+ *,
58
+ is_dynamic_expression: Callable,
59
+ loop_execute_range_dynamic: Callable,
60
+ if_dynamic: Callable,
61
+ while_dynamic: Callable,
62
+ compare_executor: Callable,
63
+ any_executor: Callable = None,
64
+ all_executor: Callable = None,
65
+ builtin_redirector: Callable = None,
66
+ ):
67
+ self._is_dynamic_expression = is_dynamic_expression
68
+ self._loop_execute_range_dynamic = loop_execute_range_dynamic
69
+ self._if_dynamic = if_dynamic
70
+ self._while_dynamic = while_dynamic
71
+ self._compare_executor = compare_executor
72
+ self._any_executor = any_executor
73
+ self._all_executor = all_executor
74
+ self._builtin_redirector = builtin_redirector
75
+
76
+ @staticmethod
77
+ def convert_to_list(x):
78
+ """This function is used to convert x to a list.
79
+ If x is None, return an empty list.
80
+ If x is not a list, return a list containing x.
81
+ Otherwise, return x itself.
82
+ """
83
+ if x is None:
84
+ return []
85
+ if not isinstance(x, list):
86
+ return [x]
87
+ return x
88
+
89
+ @staticmethod
90
+ def converge_ret_val(res):
91
+ """This function is used to converge res (the return value) of the function.
92
+ If res is None, return None.
93
+ If res is a list and has only one element, return the element.
94
+ Otherwise, return res itself.
95
+ """
96
+ if res is None:
97
+ return res
98
+ elif isinstance(res, list) and len(res) == 1:
99
+ return res[0]
100
+ return res
101
+
102
+ def for_execute(
103
+ self,
104
+ func,
105
+ start,
106
+ stop,
107
+ step,
108
+ write_args=[],
109
+ full_write_args_count=0,
110
+ write_args_names=[],
111
+ unroll=-1,
112
+ unroll_full=False,
113
+ prefetch_stages=None,
114
+ ):
115
+ assert (
116
+ self._loop_execute_range_dynamic
117
+ ), "Functions must be set before execution."
118
+ log().debug("start [%s] stop [%s] step [%s]", start, stop, step)
119
+
120
+ return self._loop_execute_range_dynamic(
121
+ func,
122
+ start,
123
+ stop,
124
+ step,
125
+ write_args,
126
+ full_write_args_count,
127
+ write_args_names,
128
+ unroll,
129
+ unroll_full,
130
+ prefetch_stages,
131
+ )
132
+
133
+ def if_execute(
134
+ self,
135
+ pred,
136
+ then_block: Callable,
137
+ else_block: Optional[Callable] = None,
138
+ write_args=[],
139
+ full_write_args_count=0,
140
+ write_args_names=[],
141
+ ):
142
+ assert self._if_dynamic, "Functions must be set before execution."
143
+
144
+ # MLIR generation
145
+ return self._if_dynamic(
146
+ pred,
147
+ then_block,
148
+ else_block,
149
+ write_args,
150
+ full_write_args_count,
151
+ write_args_names,
152
+ )
153
+
154
+ def while_execute(
155
+ self,
156
+ pred,
157
+ while_before_block: Callable,
158
+ while_after_block: Callable,
159
+ write_args=[],
160
+ full_write_args_count=0,
161
+ write_args_names=[],
162
+ ):
163
+ assert self._while_dynamic, "Functions must be set before execution."
164
+
165
+ # MLIR generation
166
+ return self._while_dynamic(
167
+ while_before_block,
168
+ while_after_block,
169
+ write_args,
170
+ full_write_args_count,
171
+ write_args_names,
172
+ )
173
+
174
+
175
+ # =============================================================================
176
+ # Decorator
177
+ # =============================================================================
178
+
179
+ executor = Executor()
180
+
181
+
182
+ def loop_selector(
183
+ start,
184
+ stop,
185
+ step,
186
+ *,
187
+ write_args=[],
188
+ full_write_args_count=0,
189
+ write_args_names=[],
190
+ unroll=-1,
191
+ unroll_full=False,
192
+ prefetch_stages=None,
193
+ ):
194
+ log().debug(
195
+ "start [%s] stop [%s] step [%s] write_args [%s] full_write_args_count [%s] write_args_names [%s] unroll [%s] unroll_full [%s] prefetch_stages [%s]",
196
+ start,
197
+ stop,
198
+ step,
199
+ write_args,
200
+ full_write_args_count,
201
+ write_args_names,
202
+ unroll,
203
+ unroll_full,
204
+ prefetch_stages,
205
+ )
206
+ from .typing import Integer, Numeric
207
+
208
+ def _maybe_upcast(value):
209
+ if isinstance(value, Integer):
210
+ value = value.ir_value()
211
+
212
+ return value
213
+
214
+ start = _maybe_upcast(start)
215
+ stop = _maybe_upcast(stop)
216
+ step = _maybe_upcast(step)
217
+
218
+ def ir_loop(func):
219
+ return executor.for_execute(
220
+ func,
221
+ start,
222
+ stop,
223
+ step,
224
+ write_args,
225
+ full_write_args_count,
226
+ write_args_names,
227
+ unroll,
228
+ unroll_full,
229
+ prefetch_stages,
230
+ )
231
+
232
+ return ir_loop
233
+
234
+
235
+ def if_selector(pred, write_args=[]):
236
+ log().debug("pred [%s] write_args [%s]", pred, write_args)
237
+ # Handle Numeric types here?
238
+
239
+ from .typing import Numeric
240
+
241
+ if isinstance(pred, Numeric):
242
+ pred = pred.value
243
+
244
+ def ir_loop(func):
245
+ return func(pred, *write_args)
246
+
247
+ return ir_loop
248
+
249
+
250
+ def while_selector(pred, write_args=[]):
251
+ def ir_while_loop(func):
252
+ return func(pred, *write_args)
253
+
254
+ return ir_while_loop
255
+
256
+
257
+ def while_executor(
258
+ pred,
259
+ while_before_block: Callable,
260
+ while_after_block: Callable,
261
+ write_args=[],
262
+ full_write_args_count=0,
263
+ write_args_names=[],
264
+ ):
265
+ return executor.while_execute(
266
+ pred,
267
+ while_before_block,
268
+ while_after_block,
269
+ write_args,
270
+ full_write_args_count,
271
+ write_args_names,
272
+ )
273
+
274
+
275
+ def if_executor(
276
+ pred,
277
+ then_block: Callable,
278
+ else_block: Optional[Callable] = None,
279
+ write_args=[],
280
+ full_write_args_count=0,
281
+ write_args_names=[],
282
+ ):
283
+ return executor.if_execute(
284
+ pred,
285
+ then_block,
286
+ else_block,
287
+ write_args,
288
+ full_write_args_count,
289
+ write_args_names,
290
+ )
291
+
292
+
293
+ # =============================================================================
294
+ # Range
295
+ # =============================================================================
296
+
297
+
298
+ class range:
299
+ """
300
+ A range-like object for dynamic loop iteration in the DSL.
301
+
302
+ This class provides a range interface similar to Python's built-in range,
303
+ but is designed to be preprocessed into constructs for dynamic
304
+ loop execution.
305
+
306
+ The class supports both single-argument (stop) and three-argument
307
+ (start, stop, step) constructors with additional parameters for loop
308
+ optimization:
309
+
310
+ - unroll: Number of iterations to unroll (0 or 1 = no unrolling)
311
+ - unroll_full: Whether to fully unroll the loop
312
+ - prefetch_stages: Number of prefetch stages to generate
313
+ """
314
+
315
+ @overload
316
+ def __new__(cls, stop, unroll=0, unroll_full=False, prefetch_stages=None):
317
+ pass
318
+
319
+ @overload
320
+ def __new__(
321
+ cls, start, stop, step, unroll=0, unroll_full=False, prefetch_stages=None
322
+ ):
323
+ pass
324
+
325
+ def __new__(cls, *args, **kwargs):
326
+ raise DSLRuntimeError("dynamic range should be always preprocessed to IR")
327
+
328
+ def __iter__(self) -> Iterator[int]:
329
+ raise DSLRuntimeError("dynamic range should be always preprocessed to IR")
330
+
331
+
332
+ @deprecated(
333
+ "range_dynamic is deprecated and will be removed in the future, please remove it."
334
+ )
335
+ def range_dynamic(*args, **kwargs):
336
+ raise DSLRuntimeError("range_dynamic should be always preprocessed to IR")
337
+
338
+
339
+ def range_constexpr(*args):
340
+ raise DSLRuntimeError("range_constexpr should be preprocessed by preprocessor.")
341
+
342
+
343
+ # =============================================================================
344
+ # If expressions
345
+ # =============================================================================
346
+
347
+
348
+ def const_expr(expression):
349
+ """
350
+ This function is used to check if the expression is a python value.
351
+ If the expression is a python value, return the boolean value of the expression.
352
+ If the expression is a dynamic expression, raise an error.
353
+ """
354
+ from .typing import Numeric
355
+
356
+ failed = False
357
+
358
+ if isinstance(expression, Numeric):
359
+ if isinstance(expression.value, (int, float, bool)):
360
+ return expression.value
361
+ else:
362
+ failed = True
363
+ elif executor._is_dynamic_expression(expression):
364
+ failed = True
365
+
366
+ if failed:
367
+ raise DSLRuntimeError(
368
+ f"The function `const_expr({expression})` received a dynamic expression (non compile-time constant).",
369
+ context={
370
+ "If your expression depends on dynamic values": "Remove `const_expr()`",
371
+ },
372
+ )
373
+ return expression
374
+
375
+
376
+ @deprecated(
377
+ "dynamic_expr is deprecated and will be removed in the future, please remove it."
378
+ )
379
+ def dynamic_expr(expression):
380
+ return expression
381
+
382
+
383
+ # =============================================================================
384
+ # Assertion & casting
385
+ # =============================================================================
386
+
387
+
388
+ def assert_executor(test, msg=None):
389
+ from .typing import Numeric
390
+
391
+ fail = False
392
+ # Implicit convert dynamic expression to bool is not allowed
393
+ # So here explicitly do a None check
394
+ if test is not None and executor._is_dynamic_expression(test):
395
+ if isinstance(test, Numeric):
396
+ try:
397
+ test = test.to(bool)
398
+ except:
399
+ fail = True
400
+ else:
401
+ fail = True
402
+
403
+ if not fail:
404
+ assert test, msg
405
+ else:
406
+ raise DSLRuntimeError(
407
+ "Only constexpr (Python Value) is allowed here, but got non-constexpr (IR Values) expression.",
408
+ suggestion="Please replace with runtime assert.",
409
+ )
410
+
411
+
412
+ def bool_cast(value):
413
+ if executor._is_dynamic_expression(value):
414
+ raise DSLRuntimeError(
415
+ "Only constexpr (Python Value) is allowed here, but got non-constexpr (IR Values) expression.",
416
+ suggestion="Please explicitly convert to boolean with expressions like comparision.",
417
+ )
418
+ return bool(value)
419
+
420
+
421
+ def compare_executor(left, comparators, ops):
422
+ """
423
+ Executes comparison operations with a left operand and a list of comparators.
424
+
425
+ Args:
426
+ left: The leftmost value in the comparison chain
427
+ comparators: A list of values to compare against
428
+ ops: A list of comparison operators to apply
429
+
430
+ Returns:
431
+ The result of the comparison chain
432
+
433
+ Raises:
434
+ AssertionError: If the executor function is not set before execution
435
+ """
436
+ assert (
437
+ executor._compare_executor is not None
438
+ ), "Function must be set before execution."
439
+ return executor._compare_executor(left, comparators, ops)
440
+
441
+
442
+ def any_executor(iterable):
443
+ """Executes the 'any' operation on an iterable, handling both dynamic and static expressions.
444
+
445
+ :param iterable: An iterable to check if any elements evaluate to True
446
+ :type iterable: Iterable
447
+ :return: boolean of Python value or IR value
448
+ :rtype: bool or cutlass.Boolean
449
+
450
+ """
451
+ if executor._any_executor and executor._is_dynamic_expression(iterable):
452
+ return executor._any_executor(iterable)
453
+ else:
454
+ return any(iterable)
455
+
456
+
457
+ def all_executor(iterable):
458
+ """Executes the 'all' operation on an iterable, handling both dynamic and static expressions.
459
+
460
+ :param iterable: An iterable to check if all elements evaluate to True
461
+ :type iterable: Iterable
462
+ :return: boolean of Python value or IR value
463
+ :rtype: bool or cutlass.Boolean
464
+ """
465
+ if executor._all_executor and executor._is_dynamic_expression(iterable):
466
+ return executor._all_executor(iterable)
467
+ else:
468
+ return all(iterable)
469
+
470
+
471
+ # =============================================================================
472
+ # Control flow checks
473
+ # =============================================================================
474
+ class DSLOptimizationWarning(Warning):
475
+ """
476
+ This warning is used to warn the user about the optimization related issues in DSL.
477
+ """
478
+
479
+ def __init__(self, message):
480
+ self.message = message
481
+ super().__init__()
482
+
483
+ def __str__(self):
484
+ return self.message
485
+
486
+
487
+ def range_value_check(*args):
488
+ """
489
+ Ensure all `range_constexpr` bounds are compile-time constants (Python ints).
490
+ """
491
+ try:
492
+ args = tuple(arg.__index__() for arg in args)
493
+
494
+ # Compute range size and warn if it's too large
495
+ start = 0
496
+ end = 0
497
+ step = 1
498
+ if len(args) == 1:
499
+ end = args[0]
500
+ elif len(args) == 2:
501
+ start = args[0]
502
+ end = args[1]
503
+ elif len(args) == 3:
504
+ start = args[0]
505
+ end = args[1]
506
+ step = args[2]
507
+
508
+ range_length = (abs(end - start) - 1) // abs(step) + 1
509
+ if range_length >= 64:
510
+ warnings.warn(
511
+ f"This static loop has {range_length} iterations, which may be very slow to compile, consider using `cutlass.range(..., unroll_full=True)` instead.",
512
+ category=DSLOptimizationWarning,
513
+ stacklevel=2,
514
+ )
515
+
516
+ return (start, end, step)
517
+ except:
518
+ raise DSLRuntimeError(
519
+ "`range_constexpr` requires constexpr (compile-time constant) for all arguments.",
520
+ suggestion="Use `range` instead of `range_constexpr`.",
521
+ )
522
+
523
+
524
+ def range_perf_warning(filename, lineno, *args):
525
+ has_dynamic_expr = False
526
+ for arg in args:
527
+ if executor._is_dynamic_expression(arg):
528
+ has_dynamic_expr = True
529
+ break
530
+ if not has_dynamic_expr:
531
+ warnings.warn_explicit(
532
+ (
533
+ "This loop is no longer unrolled and may cause performance regression. "
534
+ "Use `range(..., unroll_full=True)` for full unrolling, or switch to `range_constexpr` when bounds are compile-time constants."
535
+ ),
536
+ category=DSLOptimizationWarning,
537
+ filename=filename,
538
+ lineno=lineno,
539
+ )
540
+
541
+
542
+ @lru_cache(maxsize=1)
543
+ def _get_self_module():
544
+ """
545
+ This function is used to get the owning module of this function.
546
+ """
547
+ return inspect.getmodule(_get_self_module)
548
+
549
+
550
+ def cf_symbol_check(symbol):
551
+ """
552
+ Check if the symbol is control flow symbol from current module.
553
+ """
554
+
555
+ failed = False
556
+ name = symbol.__name__
557
+ self_module = _get_self_module()
558
+ if inspect.ismodule(symbol):
559
+ name = "range"
560
+ if not self_module.__name__.startswith(symbol.__name__):
561
+ failed = True
562
+ else:
563
+ owning_module = inspect.getmodule(symbol)
564
+ if owning_module != self_module:
565
+ failed = True
566
+
567
+ if failed:
568
+ raise DSLRuntimeError(
569
+ f"Incorrect {symbol.__name__} is used.",
570
+ suggestion=f"Please avoid overriding `{symbol.__name__}` from DSL package.",
571
+ )
572
+
573
+
574
+ def redirect_builtin_function(fcn):
575
+ """
576
+ This function is used to redirect built-in function call
577
+ to the function defined in DSL package.
578
+ """
579
+ # Only redirect if it's a built-in
580
+ if isinstance(fcn, BuiltinFunctionType) and executor._builtin_redirector:
581
+ return executor._builtin_redirector(fcn)
582
+ return fcn
583
+
584
+
585
+ def copy_members(dest, src):
586
+ """
587
+ Copies all non-callable, non-dunder members from src to dest if they exist in src.
588
+ Skips members that are callables or have names starting with double underscores.
589
+ """
590
+ if id(dest) == id(src):
591
+ return
592
+
593
+ members = getmembers(dest)
594
+ for name, value in members:
595
+ if (
596
+ name.startswith("__")
597
+ or isinstance(value, Callable)
598
+ or not hasattr(src, name)
599
+ ):
600
+ continue
601
+ setattr(dest, name, getattr(src, name))
602
+
603
+
604
+ def get_locals_or_none(locals, symbols):
605
+ """
606
+ Given a locals() dictionary and a list of symbol names, return a list of their values
607
+ in the same order as the symbols list. If a symbol is not present in locals, None is returned
608
+ for that symbol.
609
+ """
610
+ variables = []
611
+ for symbol in symbols:
612
+ if symbol in locals:
613
+ variables.append(locals[symbol])
614
+ else:
615
+ variables.append(None)
616
+ return variables
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/ast_preprocessor.py ADDED
@@ -0,0 +1,1958 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module defines the `DSLPreprocessor` class, which acts as a Python preprocessor.
14
+ It uses Python's AST and rewrites specific Python statements such as `for` and `if-else`.
15
+
16
+ The preprocessor operates on the following constructs:
17
+ - `for` loops:
18
+ - Rewrites `for` loops with the `@loop_selector` decorator.
19
+ - Supports `range`, `range_dynamic` for loop iteration.
20
+ - `if-elif-else` statements:
21
+ - Rewrites conditional statements with the `@if_selector` decorator.
22
+ - Supports `dynamic_expr` and `const_expr` in the condition expressions.
23
+
24
+ Additionally, both `for` loops and `if-else` statements require `yield`
25
+ operation generation. The preprocessor handles this by:
26
+ - Using a `ScopeManager` to track symbols across different scopes during AST traversal.
27
+ - Identifying read-only, read-write, and active variables for DSL constructs.
28
+ - Generating `yield` operations for symbols that are classified as read-write or write.
29
+
30
+ It is designed to be generic and can handle `for` and `if` constructs from other dialects.
31
+ In such cases, the user's DSL should implement `@loop_selector` and `@if_selector`
32
+ to generate dialect-specific operations for `for` and `if` statements.
33
+ """
34
+
35
+ import ast
36
+ import importlib
37
+ import inspect
38
+ import textwrap
39
+ import warnings
40
+ from dataclasses import dataclass
41
+ from typing import List, Set, Dict, Any, Callable, Optional
42
+ from types import ModuleType
43
+ from collections import OrderedDict
44
+ from copy import deepcopy
45
+
46
+ from .common import *
47
+ from .utils.logger import log
48
+
49
+
50
class OrderedSet:
    """
    A set with deterministic (insertion-ordered) iteration.

    Backed by a ``dict`` (insertion-ordered since Python 3.7) whose keys are
    the members and whose values are unused.  Determinism matters because the
    symbol sets collected here drive IR generation, which must be
    reproducible run to run.
    """

    def __init__(self, iterable=None):
        # dict.fromkeys keeps first-seen order and drops duplicates.
        self._dict = dict.fromkeys(iterable or [])

    def add(self, item):
        self._dict[item] = None

    def __contains__(self, item):
        # O(1) membership; without this, `in` falls back to a linear scan
        # through __iter__ (which __and__/__sub__/intersections rely on when
        # `other` is itself an OrderedSet).
        return item in self._dict

    def __len__(self):
        return len(self._dict)

    def __iter__(self):
        return iter(self._dict)

    def __and__(self, other):
        # Intersection, preserving this set's insertion order.
        return OrderedSet(key for key in self._dict if key in other)

    def __or__(self, other):
        # Union: self's elements first, then other's previously-unseen ones.
        new_dict = self._dict.copy()
        new_dict.update(dict.fromkeys(other))
        return OrderedSet(new_dict)

    def __sub__(self, other):
        # Difference, preserving this set's insertion order.
        return OrderedSet(key for key in self._dict if key not in other)

    def intersections(self, others):
        """Compute the intersection of this set with multiple other sets.

        :param others: A list of sets to compute intersections with
        :type others: List[Set[str]]
        :return: A new ordered set containing elements that appear in this set
            and at least one of the other sets
        """
        result = OrderedSet()
        for key in self._dict:
            for other in reversed(others):
                if key in other:
                    result.add(key)
                    break
        return result
90
+
91
+
92
@dataclass
class ImportInfo:
    """
    Information about an import expression.
    """

    # Dotted module path being imported, e.g. "package.sub".
    module_path: str
    # Attribute pulled from the module for `from X import Y` (may be "*");
    # None for a plain `import X`.
    attr_name: Optional[str]
    # Name the import binds in the namespace (the asname if given,
    # otherwise the imported name itself).
    alias_name: str
100
+
101
+
102
@dataclass
class ScopeManager:
    """
    Tracks symbol scopes during AST traversal.

    Used as a re-entrant context manager: entering pushes a fresh scope,
    exiting pops it.  Symbols recorded via :meth:`add_to_scope` land in the
    innermost (most recently entered) scope.
    """

    scopes: List[Set[str]]

    @classmethod
    def create(cls) -> "ScopeManager":
        """Build a manager with no open scopes."""
        return cls(scopes=[])

    def add_to_scope(self, name: str) -> None:
        """Record *name* in the innermost scope; `_` placeholders are ignored."""
        if name != "_":
            self.scopes[-1].add(name)

    def get_active_symbols(self) -> List[Set[str]]:
        """Return a shallow copy of the current scope stack."""
        return list(self.scopes)

    def __enter__(self) -> "ScopeManager":
        # Open a fresh, empty scope for the region about to be visited.
        self.scopes.append(set())
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Close the innermost scope.
        self.scopes.pop()
129
+
130
+
131
+ class DSLPreprocessor(ast.NodeTransformer):
132
+ """
133
+ A preprocessor for transforming Python ASTs. It supports:
134
+
135
+ - Rewriting `for` loops with the `@loop_selector` decorator.
136
+ - Rewriting `if-elif-else` statements with the `@if_selector` decorator.
137
+ - Generating `yield` operations for read-write or write symbols.
138
+ """
139
+
140
+ DECORATOR_FOR_STATEMENT = "loop_selector"
141
+ DECORATOR_IF_STATEMENT = "if_selector"
142
+ DECORATOR_WHILE_STATEMENT = "while_selector"
143
+ IF_EXECUTOR = "if_executor"
144
+ WHILE_EXECUTOR = "while_executor"
145
+ ASSERT_EXECUTOR = "assert_executor"
146
+ BOOL_CAST = "bool_cast"
147
+ IMPLICIT_DOWNCAST_NUMERIC_TYPE = "implicitDowncastNumericType"
148
+ SUPPORTED_FOR_RANGE_STATEMENTS = {"range", "range_dynamic", "range_constexpr"}
149
+ COMPARE_EXECUTOR = "compare_executor"
150
+ ANY_EXECUTOR = "any_executor"
151
+ ALL_EXECUTOR = "all_executor"
152
+
153
+ def __init__(self, client_module_name):
154
+ super().__init__()
155
+ self.counter = 0 # Unique function names for multiple loops
156
+ self.scope_manager = ScopeManager.create()
157
+ self.processed_functions = set()
158
+ self.function_counter = 0
159
+ self.function_name = "<unknown function>"
160
+ self.class_name = None
161
+ self.file_name = "<unknown filename>"
162
+ self.function_depth = 0
163
+ self.local_closures = set()
164
+ self.function_globals = None
165
+ self.client_module_name = client_module_name
166
+ self.import_top_module = False
167
+
168
+ def _create_module_attribute(
169
+ self,
170
+ func_name,
171
+ *,
172
+ top_module_name="_dsl_",
173
+ submodule_name="ast_helpers",
174
+ lineno=None,
175
+ col_offset=None,
176
+ ):
177
+ # If we simply copy location from origin node, it contains a way to wide range, which cause location in traceback to be wrong.
178
+ def set_location(node, lineno, col_offset):
179
+ if lineno and col_offset:
180
+ node.lineno = lineno
181
+ node.end_lineno = lineno
182
+ node.col_offset = col_offset
183
+ node.end_col_offset = col_offset
184
+
185
+ base = ast.Name(id=top_module_name, ctx=ast.Load())
186
+ set_location(base, lineno, col_offset)
187
+ if submodule_name:
188
+ base = ast.Attribute(value=base, attr=submodule_name, ctx=ast.Load())
189
+ set_location(base, lineno, col_offset)
190
+ node = ast.Attribute(value=base, attr=func_name, ctx=ast.Load())
191
+ set_location(node, lineno, col_offset)
192
+ return node
193
+
194
    def _get_module_imports(self, decorated_func):
        """Extract imports from the module containing the decorated function.

        :param decorated_func: function whose defining module is scanned
        :return: list of :class:`ImportInfo`; empty when the module or its
            source cannot be retrieved (e.g. interactive sessions).
        """
        imports = []

        # Get the module containing the decorated function
        if module := inspect.getmodule(decorated_func):
            try:
                # Get the module source code
                source = inspect.getsource(module)
                module_ast = ast.parse(source)

                # Extract imports from the full module
                # Effective bound name: asname when present, else the name.
                alias = lambda n: n.asname if n.asname else n.name
                for node in ast.walk(module_ast):
                    if isinstance(node, ast.Import):
                        for name in node.names:
                            imports.append(
                                ImportInfo(
                                    module_path=name.name,
                                    attr_name=None,
                                    alias_name=alias(name),
                                )
                            )
                    elif isinstance(node, ast.ImportFrom):
                        module_name = node.module
                        if node.level > 0:
                            # Handle relative imports: resolve against the
                            # defining module's package, stripping one package
                            # component per extra leading dot.
                            package_name = module.__package__.rsplit(
                                ".", node.level - 1
                            )[0]
                            module_name = f"{package_name}.{module_name}"
                        for name in node.names:
                            imports.append(
                                ImportInfo(
                                    module_path=module_name,
                                    attr_name=name.name,
                                    alias_name=alias(name),
                                )
                            )
            except (IOError, TypeError):
                # Source unavailable; return whatever was collected so far.
                pass

        return imports
237
+
238
+ def exec(self, function_name, original_function, code_object, exec_globals):
239
+ # Get imports from the original module
240
+ module_imports = self._get_module_imports(original_function)
241
+
242
+ # Import all required modules
243
+ for import_info in module_imports:
244
+ module_path, attr_name, alias_name = (
245
+ import_info.module_path,
246
+ import_info.attr_name,
247
+ import_info.alias_name,
248
+ )
249
+ try:
250
+ module = importlib.import_module(module_path)
251
+ if attr_name:
252
+ if attr_name == "*":
253
+ if hasattr(module, "__all__"):
254
+ attrs = module.__all__
255
+ else:
256
+ attrs = [
257
+ name for name in dir(module) if not name.startswith("_")
258
+ ]
259
+ else:
260
+ attrs = [attr_name]
261
+
262
+ for attr in attrs:
263
+ alias = attr if attr_name == "*" else alias_name
264
+ exec_globals[alias] = getattr(module, attr)
265
+ else:
266
+ exec_globals[alias_name] = module
267
+ except (ImportError, AttributeError) as e:
268
+ raise ImportError(f"Failed to import {module_path}: {str(e)}")
269
+
270
+ # Execute the transformed code
271
+ log().info(
272
+ "ASTPreprocessor Executing transformed code for function [%s]",
273
+ function_name,
274
+ )
275
+ exec(code_object, exec_globals)
276
+ return exec_globals.get(function_name)
277
+
278
+ @staticmethod
279
+ def print_ast(transformed_tree=None):
280
+ print("#", "-" * 40, "Transformed AST", "-" * 40)
281
+ unparsed_code = ast.unparse(transformed_tree)
282
+ print(unparsed_code)
283
+ print("#", "-" * 40, "End Transformed AST", "-" * 40)
284
+
285
+ def make_func_param_name(self, base_name, used_names):
286
+ """Generate a unique parameter name that doesn't collide with existing names."""
287
+ if base_name not in used_names:
288
+ return base_name
289
+
290
+ i = 0
291
+ while f"{base_name}_{i}" in used_names:
292
+ i += 1
293
+ return f"{base_name}_{i}"
294
+
295
    def transform_function(self, func_name, function_pointer):
        """
        Transforms a function.

        Parses *function_pointer*'s source, verifies it carries the expected
        decorator, rewrites its AST, and prepends the imports the rewritten
        code needs.

        :return: list of top-level statements of the transformed module
            (empty when the function is skipped).
        """
        # Skip if the function has already been processed
        if function_pointer in self.processed_functions:
            log().info(
                "ASTPreprocessor Skipping already processed function [%s]", func_name
            )
            return []

        # Step 1. Parse the given function
        file_name = inspect.getsourcefile(function_pointer)
        lines, start_line = inspect.getsourcelines(function_pointer)
        # Dedent so nested/indented definitions parse as top-level code.
        dedented_source = textwrap.dedent("".join(lines))
        tree = ast.parse(dedented_source, filename=file_name)
        # Bump the line numbers so they match the real source file
        ast.increment_lineno(tree, start_line - 1)

        # Step 1.2 Check the decorator
        if not self.check_decorator(tree.body[0]):
            log().info(
                "[%s] - Skipping function due to missing decorator",
                func_name,
            )
            return []

        self.processed_functions.add(function_pointer)
        log().info("ASTPreprocessor Transforming function [%s]", func_name)

        # Step 2. Transform the function
        transformed_tree = self.visit(tree)

        # Step 3. Import cutlass and base_dsl
        top_module_name = ".".join(self.client_module_name)
        import_stmts = []
        # The top-level module import is only needed when a rewrite
        # referenced it (see import_top_module).
        if self.import_top_module:
            import_stmts.append(ast.Import(names=[ast.alias(name=top_module_name)]))
        import_stmts.append(
            ast.Import(
                names=[ast.alias(name=f"{top_module_name}.base_dsl", asname="_dsl_")]
            )
        )
        transformed_tree.body = import_stmts + transformed_tree.body

        # Step 4. Import cutlass and base_dsl
        ast.fix_missing_locations(transformed_tree)
        combined_body = transformed_tree.body

        # Step 5. Return the transformed tree
        return combined_body
346
+
347
    def check_early_exit(self, tree, kind):
        """
        Checks if a given region or scope in the provided Python code has early exits.

        :param tree: AST node of the dynamic control-flow region being rewritten
        :param kind: region kind (e.g. "if"); break/continue are tolerated in
            "if" regions and inside nested loops
        :raises DSLAstPreprocessorError: when a return/raise (or a region-level
            break/continue in a loop region) is found
        """

        class EarlyExitChecker(ast.NodeVisitor):
            def __init__(self, kind):
                self.has_early_exit = False
                self.early_exit_node = None
                self.early_exit_type = None
                self.kind = kind
                # Depth of loops nested below the region root; break/continue
                # at depth > 0 belong to an inner loop and are allowed.
                self.loop_nest_level = 0

            # Early exit is not allowed in any level of dynamic control flow
            def visit_Return(self, node):
                self.has_early_exit = True
                self.early_exit_node = node
                self.early_exit_type = "return"

            def visit_Raise(self, node):
                self.has_early_exit = True
                self.early_exit_node = node
                self.early_exit_type = "raise"

            def visit_Break(self, node):
                # For break/continue in inner loops, we don't consider it as early exit
                if self.loop_nest_level == 0 and self.kind != "if":
                    self.has_early_exit = True
                    self.early_exit_node = node
                    self.early_exit_type = "break"

            def visit_Continue(self, node):
                if self.loop_nest_level == 0 and self.kind != "if":
                    self.has_early_exit = True
                    self.early_exit_node = node
                    self.early_exit_type = "continue"

            def visit_For(self, node):
                self.loop_nest_level += 1
                self.generic_visit(node)
                self.loop_nest_level -= 1

            def visit_While(self, node):
                self.loop_nest_level += 1
                self.generic_visit(node)
                self.loop_nest_level -= 1

        checker = EarlyExitChecker(kind)
        # generic_visit scans only the children, so the region's own node is
        # not itself counted as a loop level.
        checker.generic_visit(tree)
        if not checker.has_early_exit:
            return
        raise DSLAstPreprocessorError(
            message=f"Early exit ({checker.early_exit_type}) is not allowed in `{self.function_name}`"
            + (f" in `{self.class_name}`" if self.class_name else ""),
            filename=self.file_name,
            snippet=ast.unparse(tree),
            suggestion=(
                "If predicates are constant expression, write like "
                "`if const_expr(...)` or `for ... in range_constexpr(...)`. "
                "In that case, early exit will be executed by Python "
                "interpreter, so it's supported."
            ),
        )
410
+
411
+ def is_node_constexpr(self, node) -> bool:
412
+ """
413
+ Determines if the node is a constexpr.
414
+ Supported nodes are if, while statements.
415
+ """
416
+ if isinstance(node, ast.If) or isinstance(node, ast.While):
417
+ if isinstance(node.test, ast.Call):
418
+ func = node.test.func
419
+
420
+ if isinstance(func, ast.Attribute) and func.attr == "const_expr":
421
+ return True
422
+
423
+ elif isinstance(func, ast.Name) and func.id == "const_expr":
424
+ return True
425
+ return False
426
+
427
+ def _get_range_kind(self, iter_node):
428
+ """
429
+ Return "range", "range_dynamic", "range_constexpr" or None for the iterable
430
+ """
431
+ if isinstance(iter_node, ast.Call):
432
+ func = iter_node.func
433
+ if (
434
+ isinstance(func, ast.Name)
435
+ and func.id in self.SUPPORTED_FOR_RANGE_STATEMENTS
436
+ ):
437
+ return func.id, True, len(iter_node.keywords) != 0
438
+ if (
439
+ isinstance(func, ast.Attribute)
440
+ and func.attr in self.SUPPORTED_FOR_RANGE_STATEMENTS
441
+ ):
442
+ return func.attr, False, len(iter_node.keywords) != 0
443
+ return None, None, None
444
+
445
    def transform(self, original_function, exec_globals):
        """
        Transforms the provided function using the preprocessor.

        :param original_function: function whose source is rewritten
        :param exec_globals: globals used while transforming; stashed on the
            instance only for the duration of the call
        :return: an ``ast.Module`` wrapping the transformed statements, with
            locations filled in
        """
        self.file_name = inspect.getsourcefile(original_function)
        self.function_globals = exec_globals
        transformed_tree = self.transform_function(
            original_function.__name__, original_function
        )
        # Drop the reference so state does not leak across transforms.
        self.function_globals = None
        unified_tree = ast.Module(body=transformed_tree, type_ignores=[])
        unified_tree = ast.fix_missing_locations(unified_tree)

        return unified_tree
459
+
460
+ def analyze_region_variables(
461
+ self, node: Union[ast.For, ast.If], active_symbols: List[Set[str]]
462
+ ):
463
+ """
464
+ Analyze variables in different code regions to identify read-only, write-only,
465
+ and active variables for DSL constructs.
466
+ """
467
+
468
+ # we need orderedset to keep the insertion order the same. otherwise generated IR is different each time
469
+ write_args = OrderedSet()
470
+ invoked_args = OrderedSet()
471
+ local_closure = self.local_closures
472
+ file_name = self.file_name
473
+ region_node = node
474
+
475
+ class RegionAnalyzer(ast.NodeVisitor):
476
+ force_store = False
477
+
478
+ def visit_Name(self, node):
479
+ """
480
+ Mark every store as write.
481
+ """
482
+ if isinstance(node.ctx, ast.Store) or self.force_store:
483
+ write_args.add(node.id)
484
+
485
+ def visit_Subscript(self, node):
486
+ # When subscript occurs on the lhs of an assignment, the `Name` is still a load, but `Subscript` is marked as `Store`.
487
+ # We need to force the store for the `Name` to be marked as write.
488
+ if isinstance(node.ctx, ast.Store):
489
+ self.force_store = True
490
+ self.visit(node.value)
491
+ self.force_store = False
492
+ self.visit(node.slice)
493
+ else:
494
+ self.generic_visit(node)
495
+
496
+ def visit_Assign(self, node):
497
+ self.force_store = True
498
+ [self.visit(target) for target in node.targets]
499
+ self.force_store = False
500
+ self.visit(node.value)
501
+
502
+ def visit_AugAssign(self, node):
503
+ self.force_store = True
504
+ self.visit(node.target)
505
+ self.force_store = False
506
+ self.visit(node.value)
507
+
508
+ @staticmethod
509
+ def get_call_base(func_node):
510
+ if isinstance(func_node, ast.Attribute):
511
+ # If the .value is another Attribute, keep digging
512
+ if isinstance(func_node.value, ast.Attribute):
513
+ return RegionAnalyzer.get_call_base(func_node.value)
514
+ # If the .value is a Name, that's our base
515
+ elif isinstance(func_node.value, ast.Name):
516
+ return func_node.value.id
517
+ else:
518
+ # Could be something else (lambda, call, etc.)
519
+ return None
520
+ elif isinstance(func_node, ast.Name):
521
+ return None
522
+ return None
523
+
524
+ @staticmethod
525
+ def get_function_name(func_node: ast.Call):
526
+ if isinstance(func_node.func, ast.Name):
527
+ function_name = func_node.func.id
528
+ # Check if it's a method or attribute call
529
+ elif isinstance(func_node.func, ast.Attribute):
530
+ function_name = func_node.func.attr
531
+ else:
532
+ function_name = None
533
+ return function_name
534
+
535
+ def visit_Call(self, node):
536
+ base_name = RegionAnalyzer.get_call_base(node.func)
537
+
538
+ if isinstance(node.func, ast.Name):
539
+ func_name = node.func.id
540
+ if func_name in local_closure:
541
+ raise DSLAstPreprocessorError(
542
+ f"Function `{func_name}` is a closure and is not supported in for/if statements",
543
+ filename=file_name,
544
+ snippet=ast.unparse(region_node),
545
+ )
546
+
547
+ # Classes are mutable by default. Mark them as write. If they are
548
+ # dataclass(frozen=True), treat them as read in runtime.
549
+ if base_name is not None and base_name not in ("self"):
550
+ invoked_args.add(base_name)
551
+
552
+ self.generic_visit(node)
553
+
554
+ analyzer = RegionAnalyzer()
555
+ analyzer.visit(ast.Module(body=node))
556
+
557
+ # If arg is both write and invoke, remove from invoked_args
558
+ invoked_args = invoked_args - write_args
559
+
560
+ write_args = list(write_args.intersections(active_symbols))
561
+ invoked_args = list(invoked_args.intersections(active_symbols))
562
+
563
+ return write_args + invoked_args, len(write_args)
564
+
565
+ def extract_range_args(self, iter_node):
566
+ args = iter_node.args
567
+ if len(args) == 1:
568
+ return (
569
+ self.visit(ast.Constant(value=0)),
570
+ self.visit(args[0]),
571
+ self.visit(ast.Constant(value=1)),
572
+ False,
573
+ )
574
+ elif len(args) == 2:
575
+ return (
576
+ self.visit(args[0]),
577
+ self.visit(args[1]),
578
+ self.visit(ast.Constant(value=1)),
579
+ False,
580
+ )
581
+ elif len(args) == 3:
582
+ return self.visit(args[0]), self.visit(args[1]), self.visit(args[2]), True
583
+ else:
584
+ raise DSLAstPreprocessorError(
585
+ "Unsupported number of arguments in range", filename=self.file_name
586
+ )
587
+
588
+ def extract_unroll_args(self, iter_node):
589
+ keywords = {kw.arg: kw.value for kw in iter_node.keywords}
590
+ return (
591
+ keywords.get("unroll", ast.Constant(value=-1)),
592
+ keywords.get("unroll_full", ast.Constant(value=False)),
593
+ )
594
+
595
+ def issue_deprecation_warning(self, *, message, category, filename, lineno):
596
+ warnings.simplefilter("always", category) # turn off filter
597
+ warnings.warn_explicit(
598
+ message, category=category, filename=filename, lineno=lineno
599
+ )
600
+ warnings.simplefilter("default", category) # reset filter
601
+
602
+ def extract_prefetch_stages_args(self, iter_node):
603
+ keywords = {kw.arg: kw.value for kw in iter_node.keywords}
604
+ if "pipelining" in keywords:
605
+ self.issue_deprecation_warning(
606
+ message="pipelining is deprecated, use prefetch_stages instead",
607
+ category=DeprecationWarning,
608
+ filename=self.file_name,
609
+ lineno=iter_node.lineno,
610
+ )
611
+ return keywords.get("pipelining", ast.Constant(value=None))
612
+ return keywords.get("prefetch_stages", ast.Constant(value=None))
613
+
614
    def create_loop_function(
        self,
        func_name,
        node,
        start,
        stop,
        step,
        unroll,
        unroll_full,
        prefetch_stages,
        write_args,
        full_write_args_count,
    ):
        """
        Creates a loop body function with the `loop_selector` decorator.

        The original ``for`` body becomes a function whose first parameter is
        the induction variable and whose remaining parameters are the symbols
        written inside the loop; the function returns those symbols so the
        selector can thread them through as loop-carried values.

        :param node: the original ``ast.For`` node
        :param start, stop, step: already-visited bound expressions
        :param unroll, unroll_full, prefetch_stages: keyword value nodes
        :param write_args: names written in (and live after) the loop body
        :param full_write_args_count: how many of *write_args* are true
            writes (the rest are invoked/mutated objects)
        :return: the decorated ``ast.FunctionDef``
        """

        func_args = [ast.arg(arg=node.target.id, annotation=None)]
        func_args += [ast.arg(arg=var, annotation=None) for var in write_args]

        # Create the loop body
        transformed_body = []
        for stmt in node.body:
            transformed_stmt = self.visit(stmt)  # Recursively visit inner statements
            if isinstance(transformed_stmt, list):
                transformed_body.extend(transformed_stmt)
            else:
                transformed_body.append(transformed_stmt)

        # Handle the return for a single iterated argument correctly
        if len(write_args) == 0:
            transformed_body.append(ast.Return())
        else:
            transformed_body.append(
                ast.Return(
                    value=ast.List(
                        elts=[ast.Name(id=var, ctx=ast.Load()) for var in write_args],
                        ctx=ast.Load(),
                    )
                )
            )

        # Define the decorator with parameters
        decorator = ast.copy_location(
            ast.Call(
                func=self._create_module_attribute(
                    self.DECORATOR_FOR_STATEMENT,
                    lineno=node.lineno,
                    col_offset=node.col_offset,
                ),
                args=[start, stop, step],
                keywords=[
                    ast.keyword(arg="unroll", value=unroll),
                    ast.keyword(arg="unroll_full", value=unroll_full),
                    ast.keyword(arg="prefetch_stages", value=prefetch_stages),
                    ast.keyword(
                        arg="write_args",
                        value=self.generate_get_locals_or_none_call(write_args),
                    ),
                    ast.keyword(
                        arg="full_write_args_count",
                        value=ast.Constant(value=full_write_args_count),
                    ),
                    ast.keyword(
                        arg="write_args_names",
                        value=ast.List(
                            elts=[ast.Constant(value=arg) for arg in write_args],
                            ctx=ast.Load(),
                        ),
                    ),
                ],
            ),
            node,
        )

        return ast.copy_location(
            ast.FunctionDef(
                name=func_name,
                args=ast.arguments(
                    posonlyargs=[],
                    args=func_args,
                    kwonlyargs=[],
                    kw_defaults=[],
                    defaults=[],
                ),
                body=transformed_body,
                decorator_list=[decorator],
            ),
            node,
        )
704
+
705
    def visit_BoolOp(self, node):
        """
        Rewrite ``and``/``or`` chains into calls to the client module's
        ``and_``/``or_`` helpers, wrapped in ``IfExp`` nodes that reproduce
        Python's short-circuit behavior for plain-bool operands.
        """
        # Visit child nodes first
        self.generic_visit(node)

        # It is necessary to expand short circuit evaluation explicit here
        # Although we do not support inline if-else for IR generation, this is actually evaluated in Python
        # So it's fine here
        # Transform "and" to "and_"
        if isinstance(node.op, ast.And):
            # Create an if-else statement in AST form
            # if type(lhs) == bool and lhs == False:
            #     return lhs
            # else
            #     return and_(lhs, rhs)
            short_circuit_value = ast.Constant(value=False)
            helper_func = self._create_module_attribute(
                "and_",
                top_module_name="cutlass",
                submodule_name=None,
                lineno=node.lineno,
                col_offset=node.col_offset,
            )
            self.import_top_module = True
        # Transform "or" to "or_"
        elif isinstance(node.op, ast.Or):
            # Create an if-else statement in AST form
            # if type(lhs) == bool and lhs == True:
            #     return lhs
            # else
            #     return or_(lhs, rhs)
            short_circuit_value = ast.Constant(value=True)
            helper_func = self._create_module_attribute(
                "or_",
                top_module_name="cutlass",
                submodule_name=None,
                lineno=node.lineno,
                col_offset=node.col_offset,
            )
            self.import_top_module = True
        else:
            # BoolOp should be either And or Or
            raise DSLAstPreprocessorError(
                f"Unsupported boolean operation: {node.op}",
                filename=self.file_name,
                snippet=ast.unparse(node),
            )

        def short_circuit_eval(value, short_circuit_value):
            # Builds `type(value) == bool and value == short_circuit_value`:
            # only a genuine Python bool may short-circuit; DSL values must
            # always reach the helper call.
            return ast.BoolOp(
                op=ast.And(),
                values=[
                    ast.Compare(
                        left=ast.Call(
                            func=ast.Name(id="type", ctx=ast.Load()),
                            args=[value],
                            keywords=[],
                        ),
                        ops=[ast.Eq()],
                        comparators=[ast.Name(id="bool", ctx=ast.Load())],
                    ),
                    ast.Compare(
                        left=value,
                        ops=[ast.Eq()],
                        comparators=[short_circuit_value],
                    ),
                ],
            )

        lhs = node.values[0]

        # Left-fold the operand chain: each step wraps the accumulated lhs in
        # an IfExp that short-circuits or calls the helper with the next rhs.
        for i in range(1, len(node.values)):
            test = short_circuit_eval(lhs, short_circuit_value)
            lhs = ast.IfExp(
                test=test,
                body=lhs,
                orelse=ast.Call(
                    func=helper_func,
                    args=[lhs, node.values[i]],
                    keywords=[],
                ),
            )

        return ast.copy_location(lhs, node)
788
+
789
+ def visit_UnaryOp(self, node):
790
+ # Visit child nodes first
791
+ self.generic_visit(node)
792
+
793
+ # Transform "not" to "~" as we overload __invert__
794
+ if isinstance(node.op, ast.Not):
795
+ func_name = self._create_module_attribute(
796
+ "not_",
797
+ top_module_name="cutlass",
798
+ submodule_name=None,
799
+ lineno=node.lineno,
800
+ col_offset=node.col_offset,
801
+ )
802
+ self.import_top_module = True
803
+ return ast.copy_location(
804
+ ast.Call(func=func_name, args=[node.operand], keywords=[]), node
805
+ )
806
+
807
+ return node
808
+
809
    def _insert_range_value_check(self, node):
        """
        Insert a check for range arguments.

        Rewrites ``node.iter`` in place into
        ``range(*_dsl_.ast_helpers.range_value_check(<original args>))`` so
        the arguments are validated at runtime before the Python loop runs.
        """
        range_inputs = node.iter.args
        check_call = ast.copy_location(
            ast.Call(
                func=self._create_module_attribute(
                    "range_value_check", lineno=node.lineno, col_offset=node.col_offset
                ),
                args=range_inputs,
                keywords=[],
            ),
            node.iter,
        )
        # Splat the checker's result back into a builtin range() call.
        node.iter = ast.copy_location(
            ast.Call(
                func=ast.Name(id="range", ctx=ast.Load()),
                args=[ast.Starred(value=check_call, ctx=ast.Load())],
                keywords=[],
            ),
            node.iter,
        )
832
+
833
    def _insert_cf_symbol_check(self, func):
        """
        Insert a check for range symbol.

        Builds an expression statement
        ``_dsl_.ast_helpers.cf_symbol_check(<copy of func>)`` so the runtime
        can verify the control-flow symbol.  *func* is deep-copied because
        the original node stays in (and is later mutated within) the tree.
        """
        check_call = ast.copy_location(
            ast.Call(
                func=self._create_module_attribute(
                    "cf_symbol_check", lineno=func.lineno, col_offset=func.col_offset
                ),
                args=[deepcopy(func)],
                keywords=[],
            ),
            func,
        )
        return ast.Expr(check_call)
848
+
849
+ def visit_For(self, node):
850
+ # For static for loop (for with range_constexpr or not range based for), preprocessor keeps the loop.
851
+ range_kind, is_builtin_range, has_keyword = self._get_range_kind(node.iter)
852
+ if range_kind == "range_constexpr" or range_kind == None:
853
+ self.generic_visit(node)
854
+ if range_kind == "range_constexpr":
855
+ check_call = self._insert_cf_symbol_check(node.iter.func)
856
+ # Rewrite range_constexpr to range
857
+ node.iter.func = ast.Name(id="range", ctx=ast.Load())
858
+ self._insert_range_value_check(node)
859
+ return [check_call, node]
860
+ return node
861
+
862
+ active_symbols = self.scope_manager.get_active_symbols()
863
+
864
+ with self.scope_manager:
865
+ if isinstance(node.target, ast.Name):
866
+ self.scope_manager.add_to_scope(node.target.id)
867
+
868
+ if range_kind == "range_dynamic":
869
+ # Generate a warning
870
+ self.issue_deprecation_warning(
871
+ message="range_dynamic is deprecated and will be removed in the future, please remove it.",
872
+ category=DeprecationWarning,
873
+ filename=self.file_name,
874
+ lineno=node.iter.lineno,
875
+ )
876
+
877
+ warning_call = None
878
+ if range_kind == "range" and is_builtin_range and not has_keyword:
879
+ # Warn about possible performance regression due to behavior change
880
+ warning_call = ast.Expr(
881
+ ast.Call(
882
+ func=self._create_module_attribute(
883
+ "range_perf_warning",
884
+ lineno=node.lineno,
885
+ col_offset=node.col_offset,
886
+ ),
887
+ args=[
888
+ ast.Constant(value=self.file_name),
889
+ ast.Constant(value=node.iter.lineno),
890
+ ]
891
+ + node.iter.args,
892
+ keywords=[],
893
+ )
894
+ )
895
+ ast.copy_location(warning_call, node.iter)
896
+
897
+ is_prefixed_range = range_kind == "range" and not is_builtin_range
898
+ check_call = None
899
+ if range_kind == "range_dynamic" or is_prefixed_range:
900
+ # Insert a check for range symbol
901
+ if not is_prefixed_range:
902
+ check_call = self._insert_cf_symbol_check(node.iter.func)
903
+ else:
904
+ # Get toplevel module
905
+ check_call = self._insert_cf_symbol_check(node.iter.func.value)
906
+
907
+ new_for_node = self.transform_for_loop(node, active_symbols)
908
+ if check_call is not None:
909
+ new_for_node = [check_call] + new_for_node
910
+
911
+ return new_for_node if warning_call is None else [warning_call] + new_for_node
912
+
913
+ @staticmethod
914
+ def _hoist_expr_to_assignments(expr, name):
915
+ return ast.copy_location(
916
+ ast.Assign(targets=[ast.Name(id=name, ctx=ast.Store())], value=expr), expr
917
+ )
918
+
919
+ def _build_select_and_assign(self, *, name, test, body, orelse, location):
920
+ node = ast.copy_location(
921
+ ast.Assign(
922
+ targets=[ast.Name(id=name, ctx=ast.Store())],
923
+ value=ast.IfExp(
924
+ test=test,
925
+ body=body,
926
+ orelse=orelse,
927
+ ),
928
+ ),
929
+ location,
930
+ )
931
+ self.generic_visit(node)
932
+ return node
933
+
934
+ def _handle_negative_step(self, node, start_expr, stop_expr, step_expr):
935
+ # hoist start, stop, step to assignments
936
+ start_ori_name = f"start_ori_{self.counter}"
937
+ start = self._hoist_expr_to_assignments(start_expr, start_ori_name)
938
+ stop_ori_name = f"stop_ori_{self.counter}"
939
+ stop = self._hoist_expr_to_assignments(stop_expr, stop_ori_name)
940
+ step_ori_name = f"step_ori_{self.counter}"
941
+ step = self._hoist_expr_to_assignments(step_expr, step_ori_name)
942
+
943
+ extra_exprs = [start, stop, step]
944
+
945
+ # Handle possible negative step, generates the following code in Python:
946
+ # isNegative = step < 0
947
+ isNegative_name = f"isNegative_{self.counter}"
948
+ isNegative = ast.copy_location(
949
+ ast.Assign(
950
+ targets=[ast.Name(id=isNegative_name, ctx=ast.Store())],
951
+ value=ast.Compare(
952
+ left=ast.Name(id=step_ori_name, ctx=ast.Load()),
953
+ ops=[ast.Lt()],
954
+ comparators=[ast.Constant(value=0)],
955
+ ),
956
+ ),
957
+ step,
958
+ )
959
+
960
+ # start = stop if isNegative else start
961
+ start_name = f"start_{self.counter}"
962
+ start = self._build_select_and_assign(
963
+ name=start_name,
964
+ test=ast.Name(id=isNegative_name, ctx=ast.Load()),
965
+ body=ast.Name(id=stop_ori_name, ctx=ast.Load()),
966
+ orelse=ast.Name(id=start_ori_name, ctx=ast.Load()),
967
+ location=start,
968
+ )
969
+
970
+ # stop = start if isNegative else stop
971
+ stop_name = f"stop_{self.counter}"
972
+ stop = self._build_select_and_assign(
973
+ name=stop_name,
974
+ test=ast.Name(id=isNegative_name, ctx=ast.Load()),
975
+ body=ast.Name(id=start_ori_name, ctx=ast.Load()),
976
+ orelse=ast.Name(id=stop_ori_name, ctx=ast.Load()),
977
+ location=stop,
978
+ )
979
+
980
+ # step = -step if isNegative else step
981
+ step_name = f"step_{self.counter}"
982
+ step = self._build_select_and_assign(
983
+ name=step_name,
984
+ test=ast.Name(id=isNegative_name, ctx=ast.Load()),
985
+ body=ast.UnaryOp(
986
+ op=ast.USub(), operand=ast.Name(id=step_ori_name, ctx=ast.Load())
987
+ ),
988
+ orelse=ast.Name(id=step_ori_name, ctx=ast.Load()),
989
+ location=step,
990
+ )
991
+
992
+ # offset = start + stop if isNegative else 0
993
+ offset_name = f"offset_{self.counter}"
994
+ offset = self._build_select_and_assign(
995
+ name=offset_name,
996
+ test=ast.Name(id=isNegative_name, ctx=ast.Load()),
997
+ body=ast.BinOp(
998
+ op=ast.Add(),
999
+ left=ast.Name(id=start_name, ctx=ast.Load()),
1000
+ right=ast.Name(id=stop_name, ctx=ast.Load()),
1001
+ ),
1002
+ orelse=ast.Constant(value=0),
1003
+ location=node,
1004
+ )
1005
+
1006
+ extra_exprs.append(isNegative)
1007
+ extra_exprs.append(start)
1008
+ extra_exprs.append(stop)
1009
+ extra_exprs.append(step)
1010
+ extra_exprs.append(offset)
1011
+
1012
+ # Add this to begining of loop body
1013
+ # for i in range(start, stop, step):
1014
+ # i = offset - i if isNegative else i
1015
+ assert isinstance(node.target, ast.Name)
1016
+
1017
+ target_name = node.target.id
1018
+ target = self._build_select_and_assign(
1019
+ name=target_name,
1020
+ test=ast.Name(id=isNegative_name, ctx=ast.Load()),
1021
+ body=ast.BinOp(
1022
+ op=ast.Sub(),
1023
+ left=ast.Name(id=offset_name, ctx=ast.Load()),
1024
+ right=ast.Name(id=target_name, ctx=ast.Load()),
1025
+ ),
1026
+ orelse=ast.Name(id=target_name, ctx=ast.Load()),
1027
+ location=node.target,
1028
+ )
1029
+
1030
+ node.body.insert(0, target)
1031
+
1032
+ return (
1033
+ ast.Name(id=start_name, ctx=ast.Load()),
1034
+ ast.Name(id=stop_name, ctx=ast.Load()),
1035
+ ast.Name(id=step_name, ctx=ast.Load()),
1036
+ extra_exprs,
1037
+ )
1038
+
1039
+ def transform_for_loop(self, node, active_symbols):
1040
+ # Check for early exit and raise exception
1041
+ self.check_early_exit(node, "for")
1042
+ if node.orelse:
1043
+ raise DSLAstPreprocessorError(
1044
+ "dynamic for loop with else is not supported",
1045
+ filename=self.file_name,
1046
+ snippet=ast.unparse(node),
1047
+ )
1048
+
1049
+ # Get loop target variable name
1050
+ target_var_name = None
1051
+ target_var_is_active_before_loop = False
1052
+ if isinstance(node.target, ast.Name):
1053
+ target_var_name = node.target.id
1054
+ for active_symbol in active_symbols:
1055
+ if target_var_name in active_symbol:
1056
+ target_var_is_active_before_loop = True
1057
+ active_symbols.remove(active_symbol)
1058
+ break
1059
+
1060
+ # Add necessary exprs to handle this
1061
+ if target_var_is_active_before_loop:
1062
+ # Initialize an extra loop carried variable
1063
+ loop_carried_var_name = f"loop_carried_var_{self.counter}"
1064
+ pre_loop_expr = ast.copy_location(
1065
+ ast.Assign(
1066
+ targets=[ast.Name(id=loop_carried_var_name, ctx=ast.Store())],
1067
+ value=ast.Name(id=target_var_name, ctx=ast.Load()),
1068
+ ),
1069
+ node,
1070
+ )
1071
+ # append an extra assignment to the loop carried variable
1072
+ node.body.append(
1073
+ ast.copy_location(
1074
+ ast.Assign(
1075
+ targets=[ast.Name(id=loop_carried_var_name, ctx=ast.Store())],
1076
+ value=ast.Name(id=target_var_name, ctx=ast.Load()),
1077
+ ),
1078
+ node,
1079
+ )
1080
+ )
1081
+ active_symbols.append({loop_carried_var_name})
1082
+
1083
+ start_expr, stop_expr, step_expr, has_step = self.extract_range_args(node.iter)
1084
+ unroll, unroll_full = self.extract_unroll_args(node.iter)
1085
+ prefetch_stages = self.extract_prefetch_stages_args(node.iter)
1086
+ write_args, full_write_args_count = self.analyze_region_variables(
1087
+ node, active_symbols
1088
+ )
1089
+
1090
+ if has_step and self.client_module_name[0] == "cutlass":
1091
+ start, stop, step, exprs = self._handle_negative_step(
1092
+ node, start_expr, stop_expr, step_expr
1093
+ )
1094
+ else:
1095
+ start, stop, step, exprs = start_expr, stop_expr, step_expr, []
1096
+
1097
+ if target_var_is_active_before_loop:
1098
+ exprs.append(pre_loop_expr)
1099
+
1100
+ func_name = f"loop_body_{self.counter}"
1101
+ self.counter += 1
1102
+
1103
+ func_def = self.create_loop_function(
1104
+ func_name,
1105
+ node,
1106
+ start,
1107
+ stop,
1108
+ step,
1109
+ unroll,
1110
+ unroll_full,
1111
+ prefetch_stages,
1112
+ write_args,
1113
+ full_write_args_count,
1114
+ )
1115
+
1116
+ assign = self.create_cf_call(func_name, write_args, node)
1117
+
1118
+ # This should work fine as it modifies the AST structure
1119
+ exprs = exprs + [func_def] + assign
1120
+
1121
+ if target_var_is_active_before_loop:
1122
+ # Create a new assignment to the target variable
1123
+ exprs.append(
1124
+ ast.copy_location(
1125
+ ast.Assign(
1126
+ targets=[ast.Name(id=target_var_name, ctx=ast.Store())],
1127
+ value=ast.Name(id=loop_carried_var_name, ctx=ast.Load()),
1128
+ ),
1129
+ node,
1130
+ )
1131
+ )
1132
+
1133
+ return exprs
1134
+
1135
+ def visit_Assert(self, node):
1136
+ test = self.visit(node.test)
1137
+
1138
+ args = [ast.keyword(arg="test", value=test)]
1139
+ if node.msg:
1140
+ msg = self.visit(node.msg)
1141
+ args.append(ast.keyword(arg="msg", value=msg))
1142
+
1143
+ # Rewrite to assert_executor(test, msg)
1144
+ new_node = ast.Expr(
1145
+ ast.Call(
1146
+ func=self._create_module_attribute(
1147
+ self.ASSERT_EXECUTOR, lineno=node.lineno, col_offset=node.col_offset
1148
+ ),
1149
+ args=[],
1150
+ keywords=args,
1151
+ )
1152
+ )
1153
+
1154
+ # Propagate line number from original node to new node
1155
+ ast.copy_location(new_node, node)
1156
+ return new_node
1157
+
1158
+ def visit_Call(self, node):
1159
+ func = node.func
1160
+ # Visit args and kwargs
1161
+ node.args = [self.visit(arg) for arg in node.args]
1162
+ node.keywords = [self.visit(kwarg) for kwarg in node.keywords]
1163
+
1164
+ # Rewrite call to some built-in functions
1165
+ if isinstance(func, ast.Name):
1166
+ # Check if the function is 'bool'
1167
+ if func.id == "bool":
1168
+ return ast.copy_location(
1169
+ ast.Call(
1170
+ func=self._create_module_attribute(
1171
+ self.BOOL_CAST,
1172
+ lineno=node.lineno,
1173
+ col_offset=node.col_offset,
1174
+ ),
1175
+ args=[node.args[0]],
1176
+ keywords=[],
1177
+ ),
1178
+ node,
1179
+ )
1180
+ elif func.id in ["any", "all"]:
1181
+ helper_func = (
1182
+ self.ANY_EXECUTOR if func.id == "any" else self.ALL_EXECUTOR
1183
+ )
1184
+ return ast.copy_location(
1185
+ ast.Call(
1186
+ func=self._create_module_attribute(
1187
+ helper_func, lineno=node.lineno, col_offset=node.col_offset
1188
+ ),
1189
+ args=[node.args[0]],
1190
+ keywords=[],
1191
+ ),
1192
+ node,
1193
+ )
1194
+ elif func.id in ["min", "max"]:
1195
+ return ast.copy_location(
1196
+ ast.Call(
1197
+ func=self._create_module_attribute(
1198
+ func.id,
1199
+ top_module_name="cutlass",
1200
+ submodule_name=None,
1201
+ lineno=node.lineno,
1202
+ col_offset=node.col_offset,
1203
+ ),
1204
+ args=[node.args[0], node.args[1]],
1205
+ keywords=[],
1206
+ ),
1207
+ node,
1208
+ )
1209
+ elif isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name):
1210
+ def create_downcast_call(arg):
1211
+ return ast.copy_location(
1212
+ ast.Call(
1213
+ func=self._create_module_attribute(
1214
+ self.IMPLICIT_DOWNCAST_NUMERIC_TYPE,
1215
+ submodule_name="typing",
1216
+ lineno=node.lineno,
1217
+ col_offset=node.col_offset,
1218
+ ),
1219
+ args=[arg],
1220
+ keywords=[],
1221
+ ),
1222
+ arg,
1223
+ )
1224
+ module = self.function_globals.get(func.value.id)
1225
+ if isinstance(module, ModuleType) and module.__package__.endswith(
1226
+ "._mlir.dialects"
1227
+ ):
1228
+ # Check if argument is Numeric, if so, call ir_value()
1229
+ args = []
1230
+ for arg in node.args:
1231
+ args.append(create_downcast_call(arg))
1232
+ kwargs = []
1233
+ for kwarg in node.keywords:
1234
+ kwargs.append(
1235
+ ast.copy_location(
1236
+ ast.keyword(
1237
+ arg=kwarg.arg,
1238
+ value=create_downcast_call(kwarg.value),
1239
+ ),
1240
+ kwarg,
1241
+ )
1242
+ )
1243
+ return ast.copy_location(
1244
+ ast.Call(func=func, args=args, keywords=kwargs), node
1245
+ )
1246
+ else:
1247
+ node.func = self.visit(node.func)
1248
+
1249
+ return node
1250
+
1251
+ def visit_ClassDef(self, node):
1252
+ self.class_name = node.name
1253
+ self.generic_visit(node)
1254
+ self.class_name = None
1255
+ return node
1256
+
1257
+ def _visit_target(self, target):
1258
+ if isinstance(target, ast.Name):
1259
+ self.scope_manager.add_to_scope(target.id)
1260
+ elif isinstance(target, ast.Tuple):
1261
+ for t in target.elts:
1262
+ if isinstance(t, ast.Name):
1263
+ self.scope_manager.add_to_scope(t.id)
1264
+
1265
+ def visit_Assign(self, node):
1266
+ for target in node.targets:
1267
+ self._visit_target(target)
1268
+ self.generic_visit(node)
1269
+ return node
1270
+
1271
+ def visit_AugAssign(self, node):
1272
+ self._visit_target(node.target)
1273
+ self.generic_visit(node)
1274
+ return node
1275
+
1276
+ def visit_Name(self, node):
1277
+ isLoad = isinstance(node.ctx, ast.Load)
1278
+ if node.id in ["max", "min", "any", "all"] and isLoad:
1279
+ return ast.copy_location(
1280
+ ast.Call(
1281
+ func=self._create_module_attribute(
1282
+ "redirect_builtin_function",
1283
+ lineno=node.lineno,
1284
+ col_offset=node.col_offset,
1285
+ ),
1286
+ args=[node],
1287
+ keywords=[],
1288
+ ),
1289
+ node,
1290
+ )
1291
+ elif node.id == "_" and isLoad:
1292
+ raise DSLAstPreprocessorError("Read '_' is not allowed")
1293
+ else:
1294
+ self.generic_visit(node)
1295
+ return node
1296
+
1297
+ def check_decorator(self, node: ast.AST) -> bool:
1298
+ """
1299
+ Check if the function has the correct decorator for preprocessing.
1300
+ """
1301
+ if not isinstance(node, ast.FunctionDef):
1302
+ return False
1303
+ decorator_list = node.decorator_list
1304
+ if len(decorator_list) == 0:
1305
+ return False
1306
+
1307
+ for d in decorator_list:
1308
+ if isinstance(d, ast.Call):
1309
+ if isinstance(d.func, ast.Attribute):
1310
+ if d.func.attr in ["jit", "kernel"]:
1311
+ if d.keywords == []:
1312
+ return True
1313
+ for keyword in d.keywords:
1314
+ if keyword.arg == "preprocess":
1315
+ try:
1316
+ if isinstance(keyword.value, ast.Constant):
1317
+ return keyword.value.value
1318
+ else:
1319
+ return ast.literal_eval(keyword.value)
1320
+ except:
1321
+ pass
1322
+
1323
+ elif isinstance(d, ast.Attribute):
1324
+ if d.attr in ["jit", "kernel"]:
1325
+ return True
1326
+
1327
+ return False
1328
+
1329
+ def remove_dsl_decorator(self, decorator_list):
1330
+ """
1331
+ Remove .jit and .kernel decorators
1332
+ The decorator can be in two forms:
1333
+ - @jit(...)
1334
+ - @jit
1335
+ """
1336
+ new_decorator_list = []
1337
+ decorator_names = ["jit", "kernel"]
1338
+ for d in decorator_list:
1339
+ is_jit_or_kernel = False
1340
+ if isinstance(d, ast.Call):
1341
+ if isinstance(d.func, ast.Attribute):
1342
+ if d.func.attr in decorator_names:
1343
+ is_jit_or_kernel = True
1344
+ elif isinstance(d, ast.Attribute):
1345
+ if d.attr in decorator_names:
1346
+ is_jit_or_kernel = True
1347
+
1348
+ if not is_jit_or_kernel:
1349
+ new_decorator_list.append(d)
1350
+ return new_decorator_list
1351
+
1352
+ def visit_FunctionDef(self, node):
1353
+ with self.scope_manager:
1354
+ self.function_counter += 1
1355
+ self.function_name = node.name
1356
+ if self.function_depth > 0:
1357
+ self.local_closures.add(node.name)
1358
+
1359
+ self.function_depth += 1
1360
+
1361
+ # Add function name and arguments
1362
+ self.scope_manager.add_to_scope(node.name)
1363
+ for arg in node.args.args:
1364
+ self.scope_manager.add_to_scope(arg.arg)
1365
+
1366
+ self.generic_visit(node)
1367
+
1368
+ self.function_depth -= 1
1369
+
1370
+ # Remove .jit and .kernel decorators
1371
+ node.decorator_list = self.remove_dsl_decorator(node.decorator_list)
1372
+ return node
1373
+
1374
+ def visit_With(self, node):
1375
+ with self.scope_manager:
1376
+ for item in node.items:
1377
+ if isinstance(item.optional_vars, ast.Name):
1378
+ self.scope_manager.add_to_scope(item.optional_vars.id)
1379
+ self.generic_visit(node)
1380
+
1381
+ return node
1382
+
1383
+ def visit_While(self, node):
1384
+ # Constexpr doesn't get preprocessed
1385
+ if self.is_node_constexpr(node):
1386
+ self.generic_visit(node)
1387
+ check = self._insert_cf_symbol_check(node.test.func)
1388
+ return [check, node]
1389
+
1390
+ active_symbols = self.scope_manager.get_active_symbols()
1391
+
1392
+ with self.scope_manager:
1393
+ # Check for early exit and raise exception
1394
+ self.check_early_exit(node, "while")
1395
+
1396
+ write_args, full_write_args_count = self.analyze_region_variables(
1397
+ node, active_symbols
1398
+ )
1399
+ func_name = f"while_region_{self.counter}"
1400
+ self.counter += 1
1401
+
1402
+ func_def = self.create_while_function(
1403
+ func_name, node, write_args, full_write_args_count
1404
+ )
1405
+ assign = self.create_cf_call(func_name, write_args, node)
1406
+
1407
+ return [func_def] + assign
1408
+
1409
+ def visit_Try(self, node):
1410
+ with self.scope_manager:
1411
+ self.generic_visit(node)
1412
+ return node
1413
+
1414
+ def visit_ExceptHandler(self, node):
1415
+ with self.scope_manager:
1416
+ if node.name: # Exception variable
1417
+ self.scope_manager.add_to_scope(node.name)
1418
+ self.generic_visit(node)
1419
+ return node
1420
+
1421
+ def create_cf_call(self, func_name, yield_args, node):
1422
+ """Creates the assignment statement for the if function call"""
1423
+ if not yield_args:
1424
+ return [
1425
+ ast.copy_location(
1426
+ ast.Expr(value=ast.Name(id=func_name, ctx=ast.Load())), node
1427
+ )
1428
+ ]
1429
+ has_self = False
1430
+ for i, arg in enumerate(yield_args):
1431
+ if arg == "self":
1432
+ has_self = True
1433
+ yield_args[i] = "yield_self"
1434
+ break
1435
+ if len(yield_args) == 1:
1436
+ assign = ast.Assign(
1437
+ targets=[ast.Name(id=yield_args[0], ctx=ast.Store())],
1438
+ value=ast.Name(id=func_name, ctx=ast.Load()),
1439
+ )
1440
+ else:
1441
+ assign = ast.Assign(
1442
+ targets=[
1443
+ ast.Tuple(
1444
+ elts=[ast.Name(id=var, ctx=ast.Store()) for var in yield_args],
1445
+ ctx=ast.Store(),
1446
+ )
1447
+ ],
1448
+ value=ast.Name(id=func_name, ctx=ast.Load()),
1449
+ )
1450
+
1451
+ if has_self:
1452
+ fix_self = ast.Expr(
1453
+ value=ast.Call(
1454
+ func=self._create_module_attribute(
1455
+ "copy_members", lineno=node.lineno, col_offset=node.col_offset
1456
+ ),
1457
+ args=[
1458
+ ast.Name(id="self", ctx=ast.Load()),
1459
+ ast.Name(id="yield_self", ctx=ast.Load()),
1460
+ ],
1461
+ keywords=[],
1462
+ )
1463
+ )
1464
+ return [ast.copy_location(assign, node), ast.copy_location(fix_self, node)]
1465
+ else:
1466
+ return [ast.copy_location(assign, node)]
1467
+
1468
+ def visit_IfExp(self, node):
1469
+ """
1470
+ Visits an inline if-else expression (ternary operator).
1471
+ This is the Python equivalent of `x if condition else y`.
1472
+ """
1473
+ self.generic_visit(node)
1474
+ # Emit
1475
+ # node if type(pred) == bool else select_(pred, body, orelse)
1476
+ # so if pred is a python bool, use python to short-circuit and avoid emit arith.select
1477
+ self.import_top_module = True
1478
+ return ast.copy_location(
1479
+ ast.IfExp(
1480
+ test=ast.Compare(
1481
+ left=ast.Call(
1482
+ func=ast.Name(id="type", ctx=ast.Load()),
1483
+ args=[node.test],
1484
+ keywords=[],
1485
+ ),
1486
+ ops=[ast.Eq()],
1487
+ comparators=[ast.Name(id="bool", ctx=ast.Load())],
1488
+ ),
1489
+ body=node, # Original ternary expression
1490
+ orelse=ast.Call(
1491
+ func=self._create_module_attribute(
1492
+ "select_", top_module_name="cutlass", submodule_name=None
1493
+ ),
1494
+ args=[
1495
+ node.test,
1496
+ node.body,
1497
+ node.orelse,
1498
+ ],
1499
+ keywords=[],
1500
+ ),
1501
+ ),
1502
+ node,
1503
+ )
1504
+
1505
+ cmpops = {
1506
+ "Eq": "==",
1507
+ "NotEq": "!=",
1508
+ "Lt": "<",
1509
+ "LtE": "<=",
1510
+ "Gt": ">",
1511
+ "GtE": ">=",
1512
+ "Is": "is",
1513
+ "IsNot": "is not",
1514
+ "In": "in",
1515
+ "NotIn": "not in",
1516
+ }
1517
+ def compare_ops_to_str(self, node):
1518
+ names = [
1519
+ ast.Constant(value=self.cmpops[op.__class__.__name__]) for op in node.ops
1520
+ ]
1521
+ return ast.List(elts=names, ctx=ast.Load())
1522
+
1523
+ def visit_Compare(self, node):
1524
+ self.generic_visit(node)
1525
+
1526
+ comparator_strs = self.compare_ops_to_str(node)
1527
+
1528
+ keywords = [
1529
+ ast.keyword(arg="left", value=node.left),
1530
+ ast.keyword(
1531
+ arg="comparators", value=ast.List(elts=node.comparators, ctx=ast.Load())
1532
+ ),
1533
+ ast.keyword(arg="ops", value=comparator_strs),
1534
+ ]
1535
+
1536
+ call = ast.copy_location(
1537
+ ast.Call(
1538
+ func=self._create_module_attribute(self.COMPARE_EXECUTOR),
1539
+ args=[],
1540
+ keywords=keywords,
1541
+ ),
1542
+ node,
1543
+ )
1544
+
1545
+ return call
1546
+
1547
+ def visit_If(self, node):
1548
+ # const_expr doesn't get preprocessed
1549
+ if self.is_node_constexpr(node):
1550
+ self.generic_visit(node)
1551
+ check = self._insert_cf_symbol_check(node.test.func)
1552
+ return [check, node]
1553
+
1554
+ active_symbols = self.scope_manager.get_active_symbols()
1555
+ with self.scope_manager:
1556
+ # Check for early exit and raise exception
1557
+ self.check_early_exit(node, "if")
1558
+
1559
+ yield_args, full_write_args_count = self.analyze_region_variables(
1560
+ node, active_symbols
1561
+ )
1562
+ func_name = f"if_region_{self.counter}"
1563
+ self.counter += 1
1564
+
1565
+ func_def = self.create_if_function(
1566
+ func_name, node, yield_args, full_write_args_count
1567
+ )
1568
+ assign = self.create_cf_call(func_name, yield_args, node)
1569
+
1570
+ return [func_def] + assign
1571
+
1572
+ def generate_get_locals_or_none_call(self, write_args):
1573
+ return ast.Call(
1574
+ func=self._create_module_attribute("get_locals_or_none"),
1575
+ args=[
1576
+ ast.Call(
1577
+ func=ast.Name(id="locals", ctx=ast.Load()), args=[], keywords=[]
1578
+ ),
1579
+ ast.List(
1580
+ elts=[ast.Constant(value=arg) for arg in write_args],
1581
+ ctx=ast.Load(),
1582
+ ),
1583
+ ],
1584
+ keywords=[],
1585
+ )
1586
+
1587
+ def create_if_function(self, func_name, node, write_args, full_write_args_count):
1588
+ test_expr = self.visit(node.test)
1589
+ pred_name = self.make_func_param_name("pred", write_args)
1590
+ func_args = [ast.arg(arg=pred_name, annotation=None)]
1591
+ func_args += [ast.arg(arg=var, annotation=None) for var in write_args]
1592
+ func_args_then_else = [ast.arg(arg=var, annotation=None) for var in write_args]
1593
+
1594
+ then_body = []
1595
+ for stmt in node.body:
1596
+ transformed_stmt = self.visit(stmt) # Recursively visit inner statements
1597
+ if isinstance(transformed_stmt, list):
1598
+ then_body.extend(transformed_stmt)
1599
+ else:
1600
+ then_body.append(transformed_stmt)
1601
+
1602
+ # Create common return list for all blocks
1603
+ return_list = ast.List(
1604
+ elts=[ast.Name(id=var, ctx=ast.Load()) for var in write_args],
1605
+ ctx=ast.Load(),
1606
+ )
1607
+
1608
+ # Create common function arguments
1609
+ func_decorator_arguments = ast.arguments(
1610
+ posonlyargs=[], args=func_args, kwonlyargs=[], kw_defaults=[], defaults=[]
1611
+ )
1612
+ func_then_else_arguments = ast.arguments(
1613
+ posonlyargs=[],
1614
+ args=func_args_then_else,
1615
+ kwonlyargs=[],
1616
+ kw_defaults=[],
1617
+ defaults=[],
1618
+ )
1619
+
1620
+ then_block_name = f"then_block_{self.counter}"
1621
+ else_block_name = f"else_block_{self.counter}"
1622
+ elif_region_name = f"elif_region_{self.counter}"
1623
+ self.counter += 1
1624
+
1625
+ # Create then block
1626
+ then_block = ast.copy_location(
1627
+ ast.FunctionDef(
1628
+ name=then_block_name,
1629
+ args=func_then_else_arguments,
1630
+ body=then_body + [ast.Return(value=return_list)],
1631
+ decorator_list=[],
1632
+ ),
1633
+ node,
1634
+ )
1635
+
1636
+ # Decorator keywords
1637
+ decorator_keywords = [
1638
+ ast.keyword(
1639
+ arg="pred", value=test_expr
1640
+ ), # ast.Name(id="pred", ctx=ast.Load())
1641
+ ast.keyword(
1642
+ arg="write_args",
1643
+ value=self.generate_get_locals_or_none_call(write_args),
1644
+ ),
1645
+ ]
1646
+
1647
+ # Create decorator
1648
+ decorator = ast.copy_location(
1649
+ ast.Call(
1650
+ func=self._create_module_attribute(
1651
+ self.DECORATOR_IF_STATEMENT,
1652
+ lineno=node.lineno,
1653
+ col_offset=node.col_offset,
1654
+ ),
1655
+ args=[],
1656
+ keywords=decorator_keywords,
1657
+ ),
1658
+ node,
1659
+ )
1660
+
1661
+ # Executor keywords
1662
+ execute_keywords = [
1663
+ ast.keyword(arg="pred", value=ast.Name(id=pred_name, ctx=ast.Load())),
1664
+ ast.keyword(
1665
+ arg="write_args",
1666
+ value=ast.List(
1667
+ elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in write_args],
1668
+ ctx=ast.Load(),
1669
+ ),
1670
+ ),
1671
+ ast.keyword(
1672
+ arg="full_write_args_count",
1673
+ value=ast.Constant(value=full_write_args_count),
1674
+ ),
1675
+ ast.keyword(
1676
+ arg="write_args_names",
1677
+ value=ast.List(
1678
+ elts=[ast.Constant(value=arg) for arg in write_args],
1679
+ ctx=ast.Load(),
1680
+ ),
1681
+ ),
1682
+ ast.keyword(
1683
+ arg="then_block", value=ast.Name(id=then_block_name, ctx=ast.Load())
1684
+ ),
1685
+ ]
1686
+
1687
+ # Handle different cases
1688
+ if not write_args and node.orelse == []:
1689
+ # No write_args case - only then_block needed
1690
+ execute_call = ast.copy_location(
1691
+ ast.Call(
1692
+ func=self._create_module_attribute(
1693
+ self.IF_EXECUTOR, lineno=node.lineno, col_offset=node.col_offset
1694
+ ),
1695
+ args=[],
1696
+ keywords=execute_keywords,
1697
+ ),
1698
+ node,
1699
+ )
1700
+ func_body = [then_block, ast.Return(value=execute_call)]
1701
+ else:
1702
+ # Create else block based on node.orelse
1703
+ if node.orelse:
1704
+ if len(node.orelse) == 1 and isinstance(node.orelse[0], ast.If):
1705
+ # Handle elif case
1706
+ elif_node = node.orelse[0]
1707
+ nested_if_name = elif_region_name
1708
+ # Recursion for nested elif
1709
+ nested_if = self.create_if_function(
1710
+ nested_if_name, elif_node, write_args, full_write_args_count
1711
+ )
1712
+ else_block = ast.FunctionDef(
1713
+ name=else_block_name,
1714
+ args=func_then_else_arguments,
1715
+ body=[
1716
+ nested_if,
1717
+ ast.Return(
1718
+ value=ast.Name(id=nested_if_name, ctx=ast.Load())
1719
+ ),
1720
+ ],
1721
+ decorator_list=[],
1722
+ )
1723
+ else:
1724
+
1725
+ else_body = []
1726
+ for stmt in node.orelse:
1727
+ transformed_stmt = self.visit(
1728
+ stmt
1729
+ ) # Recursively visit inner statements
1730
+ if isinstance(transformed_stmt, list):
1731
+ else_body.extend(transformed_stmt)
1732
+ else:
1733
+ else_body.append(transformed_stmt)
1734
+
1735
+ # Regular else block
1736
+ else_block = ast.FunctionDef(
1737
+ name=else_block_name,
1738
+ args=func_then_else_arguments,
1739
+ body=else_body + [ast.Return(value=return_list)],
1740
+ decorator_list=[],
1741
+ )
1742
+ else:
1743
+ # Default else block
1744
+ else_block = ast.FunctionDef(
1745
+ name=else_block_name,
1746
+ args=func_then_else_arguments,
1747
+ body=[ast.Return(value=return_list)],
1748
+ decorator_list=[],
1749
+ )
1750
+
1751
+ # Add else_block to execute keywords
1752
+ execute_keywords.append(
1753
+ ast.keyword(
1754
+ arg="else_block", value=ast.Name(id=else_block_name, ctx=ast.Load())
1755
+ )
1756
+ )
1757
+
1758
+ execute_call = ast.copy_location(
1759
+ ast.Call(
1760
+ func=self._create_module_attribute(
1761
+ self.IF_EXECUTOR, lineno=node.lineno, col_offset=node.col_offset
1762
+ ),
1763
+ args=[],
1764
+ keywords=execute_keywords,
1765
+ ),
1766
+ node,
1767
+ )
1768
+ func_body = [
1769
+ then_block,
1770
+ ast.copy_location(else_block, node),
1771
+ ast.Return(value=execute_call),
1772
+ ]
1773
+
1774
+ return ast.copy_location(
1775
+ ast.FunctionDef(
1776
+ name=func_name,
1777
+ args=func_decorator_arguments,
1778
+ body=func_body,
1779
+ decorator_list=[decorator],
1780
+ ),
1781
+ node,
1782
+ )
1783
+
1784
+ def create_while_function(self, func_name, node, write_args, full_write_args_count):
1785
+ """Create a while function that looks like:
1786
+
1787
+ @while_selector(pred, write_args=[])
1788
+ def while_region(pred, write_args):
1789
+ def while_before_block(*write_args):
1790
+ # Note that during eval of pred can possibly alter yield_args
1791
+ return *pred, write_args
1792
+ def while_after_block(*write_args):
1793
+ ...loop_body_transformed...
1794
+ return write_args
1795
+ return self.while_executor(pred, write_args,
1796
+ while_before_block, while_after_block, constexpr)
1797
+ write_args = while_region(pred, write_args)
1798
+
1799
+ Which will later be executed as psuedo-code:
1800
+
1801
+ # Dynamic mode:
1802
+ scf.WhileOp(types(write_args), write_args)
1803
+ with InsertionPoint(before_block):
1804
+ cond, write_args = while_before_block(*write_args)
1805
+ scf.ConditionOp(cond, write_args)
1806
+ with InsertionPoint(after_block):
1807
+ write_args = while_after_block(write_args)
1808
+ scf.YieldOp(write_args)
1809
+ return while_op.results_
1810
+
1811
+ # Const mode:
1812
+ cond, write_args = while_before_block(write_args)
1813
+ while pred:
1814
+ write_args = body_block(write_args)
1815
+ cond, write_args = while_before_block(write_args)
1816
+ return write_args
1817
+ """
1818
+ test_expr = self.visit(node.test)
1819
+ pred_name = self.make_func_param_name("pred", write_args)
1820
+
1821
+ # Section: decorator construction
1822
+ decorator_keywords = [
1823
+ ast.keyword(arg="pred", value=test_expr),
1824
+ ast.keyword(
1825
+ arg="write_args",
1826
+ value=self.generate_get_locals_or_none_call(write_args),
1827
+ ),
1828
+ ]
1829
+ decorator = ast.copy_location(
1830
+ ast.Call(
1831
+ func=self._create_module_attribute(
1832
+ self.DECORATOR_WHILE_STATEMENT,
1833
+ lineno=node.lineno,
1834
+ col_offset=node.col_offset,
1835
+ ),
1836
+ args=[],
1837
+ keywords=decorator_keywords,
1838
+ ),
1839
+ node,
1840
+ )
1841
+
1842
+ # Section: Shared initialization for before and after blocks
1843
+ while_before_block_name = f"while_before_block_{self.counter}"
1844
+ while_after_block_name = f"while_after_block_{self.counter}"
1845
+ self.counter += 1
1846
+ block_args_args = [ast.arg(arg=var, annotation=None) for var in write_args]
1847
+ block_args = ast.arguments(
1848
+ posonlyargs=[],
1849
+ args=block_args_args,
1850
+ kwonlyargs=[],
1851
+ kw_defaults=[],
1852
+ defaults=[],
1853
+ )
1854
+
1855
+ yield_args_ast_name_list = ast.List(
1856
+ elts=[ast.Name(id=var, ctx=ast.Load()) for var in write_args],
1857
+ ctx=ast.Load(),
1858
+ )
1859
+
1860
+ # Section: while_before_block FunctionDef, which contains condition
1861
+ while_before_return_list = ast.List(
1862
+ elts=[test_expr, yield_args_ast_name_list],
1863
+ ctx=ast.Load(),
1864
+ )
1865
+ while_before_stmts = [ast.Return(value=while_before_return_list)]
1866
+ while_before_block = ast.copy_location(
1867
+ ast.FunctionDef(
1868
+ name=while_before_block_name,
1869
+ args=block_args,
1870
+ body=while_before_stmts,
1871
+ decorator_list=[],
1872
+ ),
1873
+ test_expr,
1874
+ )
1875
+
1876
+ # Section: while_after_block FunctionDef, which contains loop body
1877
+ while_after_stmts = []
1878
+ for stmt in node.body:
1879
+ transformed_stmt = self.visit(stmt) # Recursively visit inner statements
1880
+ if isinstance(transformed_stmt, list):
1881
+ while_after_stmts.extend(transformed_stmt)
1882
+ else:
1883
+ while_after_stmts.append(transformed_stmt)
1884
+ while_after_stmts.append(ast.Return(value=yield_args_ast_name_list))
1885
+
1886
+ while_after_block = ast.copy_location(
1887
+ ast.FunctionDef(
1888
+ name=while_after_block_name,
1889
+ args=block_args,
1890
+ body=while_after_stmts,
1891
+ decorator_list=[],
1892
+ ),
1893
+ node,
1894
+ )
1895
+
1896
+ # Section: Execute via executor
1897
+ execute_keywords = [
1898
+ ast.keyword(arg="pred", value=ast.Name(id=pred_name, ctx=ast.Load())),
1899
+ ast.keyword(
1900
+ arg="write_args",
1901
+ value=ast.List(
1902
+ elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in write_args],
1903
+ ctx=ast.Load(),
1904
+ ),
1905
+ ),
1906
+ ast.keyword(
1907
+ arg="full_write_args_count",
1908
+ value=ast.Constant(value=full_write_args_count),
1909
+ ),
1910
+ ast.keyword(
1911
+ arg="while_before_block",
1912
+ value=ast.Name(id=while_before_block_name, ctx=ast.Load()),
1913
+ ),
1914
+ ast.keyword(
1915
+ arg="while_after_block",
1916
+ value=ast.Name(id=while_after_block_name, ctx=ast.Load()),
1917
+ ),
1918
+ ast.keyword(
1919
+ arg="write_args_names",
1920
+ value=ast.List(
1921
+ elts=[ast.Constant(value=arg) for arg in write_args],
1922
+ ctx=ast.Load(),
1923
+ ),
1924
+ ),
1925
+ ]
1926
+
1927
+ execute_call = ast.Call(
1928
+ func=self._create_module_attribute(
1929
+ self.WHILE_EXECUTOR, lineno=node.lineno, col_offset=node.col_offset
1930
+ ),
1931
+ args=[],
1932
+ keywords=execute_keywords,
1933
+ )
1934
+
1935
+ # Putting everything together, FunctionDef for while_region
1936
+ func_args_args = [ast.arg(arg=pred_name, annotation=None)]
1937
+ func_args_args += [ast.arg(arg=var, annotation=None) for var in write_args]
1938
+ func_args = ast.arguments(
1939
+ posonlyargs=[],
1940
+ args=func_args_args,
1941
+ kwonlyargs=[],
1942
+ kw_defaults=[],
1943
+ defaults=[],
1944
+ )
1945
+
1946
+ return ast.copy_location(
1947
+ ast.FunctionDef(
1948
+ name=func_name,
1949
+ args=func_args,
1950
+ body=[
1951
+ while_before_block,
1952
+ while_after_block,
1953
+ ast.Return(value=execute_call),
1954
+ ],
1955
+ decorator_list=[decorator],
1956
+ ),
1957
+ node,
1958
+ )
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/cache_helpers.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides jit cache load/dump helper functions
14
+ """
15
+
16
+ import os
17
+ import uuid
18
+ import random
19
+ import tempfile
20
+ import pwd
21
+ import time
22
+ from pathlib import Path
23
+ import hashlib
24
+
25
+ from .utils.logger import log
26
+ from .jit_executor import JitExecutor
27
+
28
+ from .._mlir import ir
29
+
30
+ # =============================================================================
31
+ # Jit Cache Helper functions
32
+ # =============================================================================
33
+
34
+
35
+ def get_current_user():
36
+ # Try to get the user from the environment variable first
37
+ user = os.getenv("USER") or os.getenv("USERNAME")
38
+ if not user:
39
+ # Fallback for Unix-like systems
40
+ user = pwd.getpwuid(os.getuid()).pw_name
41
+ return user
42
+
43
+
44
+ try:
45
+ default_generated_ir_path = f"/tmp/{get_current_user()}/cutlass_python_cache/"
46
+ except Exception as e:
47
+ # If all else fails, provide a default fallback path
48
+ default_generated_ir_path = "/tmp/cutlass_python_cache/"
49
+ print(f"Could not determine user, using default path. Error: {e}")
50
+
51
+
52
+ def load_ir(file, asBytecode=False):
53
+ """Load generated IR from a file."""
54
+ assert "mlir" in file
55
+ func_name = file.split(".mlir")[0].split("dsl_")[-1]
56
+ with ir.Context() as ctx:
57
+ with open(file, "rb" if asBytecode else "r") as f:
58
+ module = ir.Module.parse(f.read())
59
+
60
+ return func_name, module
61
+
62
+
63
+ def make_unique_filename(fpath: Path, new_ext: str = None) -> Path:
64
+ """Generate a unique filename with an optional new extension."""
65
+ random_part = random.randint(0, 999999)
66
+ timestamp = time.time()
67
+ hash_input = f"{fpath}_{timestamp}_{random_part}".encode()
68
+ hash_code = hashlib.md5(hash_input).hexdigest()[:16] # Shorter hash for readability
69
+ stem_with_hash = f"{fpath.stem}_{hash_code}"
70
+ return fpath.with_name(stem_with_hash).with_suffix(new_ext or fpath.suffix)
71
+
72
+
73
+ def save_ir(
74
+ dsl_name: str,
75
+ module: object,
76
+ fname: str,
77
+ isTemp: bool = False,
78
+ asBytecode: bool = False,
79
+ ) -> str:
80
+ """Save generated IR to a file."""
81
+ initial_name = f"{dsl_name.lower()}_{fname}.mlir"
82
+ save_path = Path(tempfile.gettempdir() if isTemp else os.getcwd())
83
+ save_fname = save_path / initial_name
84
+ # Random ID to avoid any collisions
85
+ rnd_id = str(uuid.uuid4())
86
+ pid = os.getpid()
87
+ # use temp dir to be robust against program interruptions
88
+ temp_dir = os.path.join(save_path, f"tmp.pid_{pid}_{rnd_id}")
89
+ # If the process exits abnormally, may leave a temporary folder. Needs to be removed manually.
90
+ os.makedirs(temp_dir, exist_ok=False)
91
+ temp_fname = os.path.join(temp_dir, initial_name)
92
+
93
+ if asBytecode:
94
+ with open(temp_fname, "wb") as f:
95
+ module.operation.write_bytecode(f)
96
+ else:
97
+ with open(temp_fname, "w") as f:
98
+ print(module, file=f)
99
+ # os.replace is guaranteed to be atomic on POSIX systems if it succeeds
100
+ # so filepath cannot see a partial write
101
+ os.replace(temp_fname, save_fname)
102
+ os.removedirs(temp_dir)
103
+ log().debug("Generated IR saved into %s", save_fname)
104
+ return save_fname
105
+
106
+
107
+ def check_func_name(jit_cache, func_name):
108
+ if not func_name in jit_cache:
109
+ jit_cache[func_name] = JitExecutor(None, None, None, None, None, None)
110
+ return jit_cache
111
+
112
+
113
+ def load_cache_from_path(dsl_name, cache_limit, path=default_generated_ir_path):
114
+ """Load cache from a directory path."""
115
+ if not os.path.exists(path):
116
+ return dict()
117
+ files = os.listdir(path)
118
+ jit_cache = dict()
119
+ try:
120
+ for idx, file in enumerate(files):
121
+ if idx >= int(cache_limit):
122
+ break
123
+ # identify dsl prefix
124
+ if not file.startswith(f"{dsl_name.lower()}"):
125
+ continue
126
+ if ".mlir" in file:
127
+ func_name, ir_module = load_ir(
128
+ os.path.join(path, file), asBytecode=True
129
+ )
130
+ jit_cache = check_func_name(jit_cache, func_name)
131
+ jit_cache[func_name].ir_module = ir_module
132
+ except Exception as e:
133
+ print(f"{dsl_name} failed with loading generated IR cache.", e)
134
+ jit_cache = dict()
135
+ return jit_cache
136
+
137
+
138
+ def dump_cache_to_path(
139
+ dsl_name, jit_cache, cache_limit, path=default_generated_ir_path
140
+ ):
141
+ log().info("JIT cache : dumping [%s] items=[%s]", dsl_name, len(jit_cache))
142
+ os.makedirs(path, exist_ok=True)
143
+ original_path = os.getcwd()
144
+ try:
145
+ os.chdir(path)
146
+ for idx, [key, value] in enumerate(jit_cache.items()):
147
+ if idx >= int(cache_limit):
148
+ break
149
+ save_ir(dsl_name, value.ir_module, key, asBytecode=True)
150
+ except Exception as e:
151
+ print(f"{dsl_name} failed with caching generated IR", e)
152
+ finally:
153
+ os.chdir(original_path)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/common.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import os
13
+ from typing import Any, Dict, Iterable, Optional, Union
14
+
15
+ """
16
+ This module provides a Exception classes DSL class for any Dialect.
17
+ """
18
+
19
+
20
+ # Add color codes at the top of the file after imports
21
+ class Colors:
22
+ """ANSI color codes for error messages"""
23
+
24
+ RED = "\033[91m"
25
+ YELLOW = "\033[93m"
26
+ BLUE = "\033[94m"
27
+ GREEN = "\033[92m"
28
+ BOLD = "\033[1m"
29
+ RESET = "\033[0m"
30
+
31
+
32
+ # =============================================================================
33
+ # DSL Exceptions
34
+ # =============================================================================
35
+
36
+
37
+ class DSLBaseError(Exception):
38
+ """
39
+ Base exception for DSL-related errors.
40
+ Provides optional contextual metadata to aid in debugging.
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ message: str,
46
+ line: Optional[int] = None,
47
+ snippet: Optional[str] = None,
48
+ filename: Optional[str] = None,
49
+ error_code: Optional[Union[str, int]] = None,
50
+ context: Optional[Union[Dict[str, Any], str]] = None,
51
+ suggestion: Optional[str] = None,
52
+ cause: Optional[BaseException] = None,
53
+ ) -> None:
54
+ self.message = message
55
+ self.line = line
56
+ self.filename = filename
57
+ self.snippet = snippet
58
+ self.error_code = error_code
59
+ self.context = context
60
+ self.suggestion = suggestion
61
+ self.cause = cause
62
+
63
+ super().__init__(self._format_message())
64
+
65
+ def _format_message(self):
66
+ """
67
+ Formats the complete error message with available metadata.
68
+ Override this in subclasses if you want to change formatting logic.
69
+ """
70
+ parts = [f"{self.__class__.__name__}: {self.message}"]
71
+
72
+ if self.error_code is not None:
73
+ parts.append(f"{Colors.BOLD}Error Code:{Colors.RESET} {self.error_code}\n")
74
+
75
+ if self.line is not None:
76
+ parts.append(f" Line: {self.line}")
77
+
78
+ if self.filename is not None:
79
+ parts.append(f" File: {self.filename}")
80
+
81
+ if self.snippet:
82
+ # Optionally truncate long snippets for readability
83
+ parts.append(f" Snippet: \n {self.snippet}")
84
+
85
+ if self.cause:
86
+ parts.append(f" Caused exception: {self.cause}")
87
+
88
+ if self.context:
89
+ if isinstance(self.context, dict):
90
+ parts.append(f"{Colors.BLUE}🔍 Additional Context:{Colors.RESET}\n")
91
+ for key, value in self.context.items():
92
+ parts.append(f" {key}: {value}")
93
+ else:
94
+ parts.append(
95
+ f"{Colors.BLUE}🔍 Additional Context:{Colors.RESET} {self.context}"
96
+ )
97
+
98
+ if self.suggestion:
99
+ parts.append(f"{Colors.GREEN}💡 Suggestions:{Colors.RESET}")
100
+ if isinstance(self.suggestion, (list, tuple)):
101
+ for suggestion in self.suggestion:
102
+ parts.append(f" {Colors.GREEN}{suggestion}{Colors.RESET}")
103
+ else:
104
+ parts.append(f" {self.suggestion}")
105
+
106
+ return "\n".join(parts)
107
+
108
+
109
+ class DSLRuntimeError(DSLBaseError):
110
+ """
111
+ Raised when an error occurs during JIT-time code generation in the DSL.
112
+ """
113
+
114
+ # Inherits all logic from DSLBaseError; override methods if you need
115
+ # specialized behavior or formatting for runtime errors.
116
+ pass
117
+
118
+
119
+ def _get_friendly_cuda_error_message(error_code, error_name):
120
+ # Avoid circular dependency
121
+ from .runtime.cuda import get_device_info
122
+
123
+ """Get a user-friendly error message for common CUDA errors."""
124
+ # Strip the byte string markers if present
125
+ if isinstance(error_name, bytes):
126
+ error_name = error_name.decode("utf-8")
127
+ elif (
128
+ isinstance(error_name, str)
129
+ and error_name.startswith("b'")
130
+ and error_name.endswith("'")
131
+ ):
132
+ error_name = error_name[2:-1]
133
+
134
+ # Add target architecture info
135
+ target_arch = os.getenv("CUTE_DSL_ARCH", "unknown")
136
+
137
+ error_messages = {
138
+ "CUDA_ERROR_INVALID_SOURCE": (
139
+ f"{Colors.RED}❌ Failed to load CUDA kernel - likely architecture mismatch.{Colors.RESET}\n\n"
140
+ ),
141
+ "CUDA_ERROR_NO_BINARY_FOR_GPU": (
142
+ f"{Colors.RED}❌ CUDA kernel not compatible with your GPU.{Colors.RESET}\n\n"
143
+ ),
144
+ "CUDA_ERROR_OUT_OF_MEMORY": (
145
+ f"{Colors.RED}💾 CUDA out of memory error.{Colors.RESET}\n\n"
146
+ ),
147
+ "CUDA_ERROR_INVALID_DEVICE": (
148
+ f"{Colors.RED}❌ Invalid CUDA device.{Colors.RESET}\n\n"
149
+ ),
150
+ "CUDA_ERROR_NOT_INITIALIZED": (
151
+ f"{Colors.RED}❌ CUDA context not initialized.{Colors.RESET}\n\n"
152
+ ),
153
+ "CUDA_ERROR_INVALID_VALUE": (
154
+ f"{Colors.RED}⚠️ Invalid parameter passed to CUDA operation.{Colors.RESET}\n\n"
155
+ f"{Colors.YELLOW}This is likely a bug - please report it with:{Colors.RESET}"
156
+ ),
157
+ }
158
+
159
+ error_suggestions = {
160
+ "CUDA_ERROR_INVALID_SOURCE": (
161
+ f"1. Ensure env CUTE_DSL_ARCH matches your GPU architecture",
162
+ f"2. Clear the compilation cache and regenerate the kernel",
163
+ f"3. Check CUDA toolkit installation",
164
+ ),
165
+ "CUDA_ERROR_NO_BINARY_FOR_GPU": (
166
+ f"Set env CUTE_DSL_ARCH to match your GPU architecture",
167
+ ),
168
+ "CUDA_ERROR_OUT_OF_MEMORY": (
169
+ f"1. Reduce batch size",
170
+ f"2. Reduce model size",
171
+ f"3. Free unused GPU memory",
172
+ ),
173
+ "CUDA_ERROR_INVALID_DEVICE": (
174
+ f"1. Check if CUDA device is properly initialized",
175
+ f"2. Verify GPU is detected: nvidia-smi",
176
+ f"3. Check CUDA_VISIBLE_DEVICES environment variable",
177
+ ),
178
+ "CUDA_ERROR_NOT_INITIALIZED": (
179
+ f"1. Check CUDA driver installation",
180
+ f"2. call `cuda.cuInit(0)` before any other CUDA operation",
181
+ f"3. Run nvidia-smi to confirm GPU status",
182
+ ),
183
+ "CUDA_ERROR_INVALID_VALUE": (
184
+ f"1. Your GPU model",
185
+ f"2. SM ARCH setting",
186
+ f"3. Steps to reproduce",
187
+ ),
188
+ }
189
+
190
+ message = error_messages.get(
191
+ error_name, f"{Colors.RED}Unknown CUDA error{Colors.RESET}"
192
+ )
193
+
194
+ # Add debug information
195
+ debug_info = f"\n- {Colors.BOLD}Error name: {error_name}\n"
196
+ debug_info += f"- CUDA_TOOLKIT_PATH: {os.getenv('CUDA_TOOLKIT_PATH', 'not set')}\n"
197
+ debug_info += (
198
+ f"- Target SM ARCH: {os.getenv('CUTE_DSL_ARCH', 'not set')}{Colors.RESET}\n"
199
+ )
200
+
201
+ try:
202
+ # Get GPU information using CUDA Python API
203
+ debug_info += f"\n{Colors.BLUE}📊 GPU Information:{Colors.RESET}\n"
204
+ gpu_info = get_device_info()
205
+ debug_info += gpu_info.pretty_str()
206
+
207
+ if target_arch and gpu_info.compatible_archs:
208
+ debug_info += f"\n{Colors.BOLD}Compatibility Check:{Colors.RESET}\n"
209
+
210
+ if target_arch not in gpu_info.compatible_archs:
211
+ debug_info += (
212
+ f"{Colors.RED}❌ Error: Target SM ARCH {target_arch} is not compatible\n"
213
+ f"💡 Please use one of SM ARCHs: "
214
+ f"{Colors.GREEN}{', '.join(gpu_info.compatible_archs or [])}{Colors.RESET}\n"
215
+ )
216
+ elif target_arch != gpu_info.sm_arch:
217
+ debug_info += (
218
+ f"{Colors.YELLOW}⚠️ Warning: Using compatible but non-optimal architecture\n"
219
+ f"• Current: {target_arch}\n"
220
+ f"• Recommended: {Colors.GREEN}{gpu_info.sm_arch}{Colors.RESET} (native)\n"
221
+ )
222
+ else:
223
+ debug_info += f"{Colors.GREEN}✓ Using optimal architecture: {gpu_info.sm_arch}{Colors.RESET}\n"
224
+
225
+ except Exception as e:
226
+ debug_info += (
227
+ f"\n{Colors.YELLOW}ℹ️ Could not retrieve GPU info: {str(e)}{Colors.RESET}"
228
+ )
229
+
230
+ return message, debug_info, error_suggestions.get(error_name, "")
231
+
232
+
233
+ class DSLCudaRuntimeError(DSLBaseError):
234
+ """
235
+ Raised when an error occurs during CUDA runtime code generation in the DSL.
236
+ """
237
+
238
+ # Inherits all logic from DSLRuntimeError; override methods if you need
239
+ # specialized behavior or formatting for runtime errors.
240
+ def __init__(self, error_code, error_name) -> None:
241
+ self._error_code = error_code
242
+ self._error_name = error_name
243
+ message, debug_info, suggestion = _get_friendly_cuda_error_message(
244
+ error_code, error_name
245
+ )
246
+
247
+ super().__init__(
248
+ message, error_code=error_code, context=debug_info, suggestion=suggestion
249
+ )
250
+
251
+
252
+ class DSLAstPreprocessorError(DSLBaseError):
253
+ """
254
+ Raised when an error occurs during AST preprocessing or visiting in the DSL.
255
+ """
256
+
257
+ # Same approach: You could override _format_message if you want
258
+ # to emphasize AST node details or anything specific to preprocessing.
259
+ pass
260
+
261
+
262
+ class DSLNotImplemented(DSLBaseError):
263
+ """
264
+ Raised when a feature of the DSL is not implemented yet.
265
+ """
266
+
267
+ # Useful for stubs in your DSL that you plan to implement in the future.
268
+ pass
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/compiler.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides a class that compiles generated IR using MLIR's PassManager
14
+ and executes it using MLIR's ExecutionEngine.
15
+
16
+ """
17
+
18
+ from typing import Sequence, Optional, Tuple
19
+ import os
20
+ import sys
21
+ import inspect
22
+ import argparse
23
+ from .common import DSLRuntimeError
24
+ from .utils.logger import log
25
+
26
+ _SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
27
+ sys.path.append(_SCRIPT_PATH)
28
+
29
+ from .._mlir import ir
30
+
31
+
32
+ # =============================================================================
33
+ # Compiler Class
34
+ # =============================================================================
35
+
36
+
37
+ class CompilationError(RuntimeError):
38
+ """Custom error class for compilation failures"""
39
+
40
+ # Add ANSI color codes
41
+ RED = "\033[91m"
42
+ YELLOW = "\033[93m"
43
+ BLUE = "\033[94m"
44
+ GREEN = "\033[92m"
45
+ BOLD = "\033[1m"
46
+ RESET = "\033[0m"
47
+
48
+ def __init__(
49
+ self,
50
+ message: str,
51
+ nvvm_error: Optional[str] = None,
52
+ ir_context: Optional[str] = None,
53
+ cuda_toolkit: Optional[str] = None,
54
+ arch: Optional[str] = None,
55
+ ):
56
+ self.nvvm_error = nvvm_error
57
+ self.ir_context = ir_context
58
+ self.cuda_toolkit = cuda_toolkit
59
+ self.arch = arch
60
+ # Call parent with formatted error to avoid showing class name
61
+ super().__init__("") # Empty string to avoid class name
62
+ # Store formatted error for str() representation
63
+ self._formatted_error = self._format_error()
64
+
65
+ def __str__(self) -> str:
66
+ """Override string representation to avoid showing class name"""
67
+ return self._formatted_error
68
+
69
+ def __repr__(self) -> str:
70
+ """Override repr representation to avoid showing class name"""
71
+ return self._formatted_error
72
+
73
+ def _format_error(self) -> str:
74
+ if not self.nvvm_error:
75
+ return str(self.args[0])
76
+
77
+ return f"""NVVM Compilation Error:
78
+ ----------------------
79
+
80
+ {self.BLUE}⚙️ Current Settings:{self.RESET}
81
+ {self.BOLD}- CUDA Toolkit Path: {self.cuda_toolkit or "Not Set"}
82
+ - Target Architecture: {self.arch}{self.RESET}
83
+
84
+ IR Context (truncated):
85
+ {self.ir_context}
86
+
87
+ {self.YELLOW}💡 Possible Solutions:{self.RESET}
88
+ {self.GREEN}1. Check if CUDA_TOOLKIT_PATH is set correctly
89
+ 2. Verify target architecture ({self.arch}) is supported by your CUDA toolkit
90
+ 3. Make sure CUDA toolkit version matches the target architecture requirements{self.RESET}"""
91
+
92
+
93
+ class Compiler:
94
+ """Compiler class for compiling and building MLIR modules."""
95
+
96
+ def __init__(self, passmanager, execution_engine):
97
+ self.passmanager = passmanager
98
+ self.execution_engine = execution_engine
99
+
100
+ def __call__(self, module):
101
+ """Convenience application method."""
102
+ self.compile(module)
103
+
104
+ def _process_error(self, error_msg: str) -> Tuple[Optional[str], Optional[str]]:
105
+ """Process error message to extract NVVM error and IR context"""
106
+ nvvm_error = None
107
+ ir_msg = ""
108
+
109
+ if "NVVM_ERROR" in error_msg:
110
+ # Extract the specific NVVM error
111
+ nvvm_error = (
112
+ error_msg.split("libNVVM extra log:")[1].strip()
113
+ if "libNVVM extra log:" in error_msg
114
+ else error_msg
115
+ )
116
+
117
+ # Extract IR context
118
+ if "see current operation:" in error_msg:
119
+ # Get the IR section
120
+ ir_section = error_msg.split("see current operation:")[1].strip()
121
+ # Remove duplicate IR section
122
+ ir_section = ir_section.split("error: unknown: Failed translating")[
123
+ 0
124
+ ].strip()
125
+
126
+ # Get first few lines and last few lines of the IR
127
+ ir_lines = ir_section.split("\n")
128
+ if len(ir_lines) > 10:
129
+ ir_msg = "\n".join(ir_lines[:5] + [" ..."] + ir_lines[-5:])
130
+ else:
131
+ ir_msg = ir_section
132
+
133
+ return nvvm_error, ir_msg
134
+
135
+ def compile(
136
+ self,
137
+ module,
138
+ pipeline: str,
139
+ cuda_toolkit: str = "",
140
+ arch: str = "",
141
+ enable_verifier=False,
142
+ ):
143
+ """Compiles the module by invoking the pipeline."""
144
+ try:
145
+ pm = self.passmanager.PassManager.parse(pipeline)
146
+ pm.enable_verifier(enable_verifier)
147
+ pm.run(module.operation)
148
+ except Exception as e:
149
+ error_msg = str(e)
150
+ nvvm_error, ir_msg = self._process_error(error_msg)
151
+
152
+ if nvvm_error:
153
+ raise CompilationError(
154
+ error_msg,
155
+ nvvm_error=nvvm_error,
156
+ ir_context=ir_msg,
157
+ cuda_toolkit=cuda_toolkit,
158
+ arch=arch,
159
+ ) from e
160
+ raise e
161
+
162
+ def jit(self, module, opt_level: int = 2, shared_libs: Sequence[str] = ()):
163
+ """Wraps the module in a JIT execution engine."""
164
+ return self.execution_engine.ExecutionEngine(
165
+ module, opt_level=opt_level, shared_libs=shared_libs
166
+ )
167
+
168
+ def compile_and_jit(
169
+ self,
170
+ module,
171
+ pipeline: str,
172
+ shared_libs: Sequence[str] = (),
173
+ opt_level: int = 2,
174
+ cuda_toolkit: str = "",
175
+ arch: str = "",
176
+ ):
177
+ """Compiles and jits the module."""
178
+ self.compile(
179
+ module,
180
+ pipeline,
181
+ cuda_toolkit,
182
+ arch,
183
+ )
184
+ return self.jit(module, opt_level, shared_libs)
185
+
186
+
187
+ class CompileOptions:
188
+ def __init__(self, options: str = ""):
189
+ """
190
+ This class encapsulates all compilation options relevant to function compilation.
191
+ It provides a convenient way to manage and pass compilation options,
192
+ particularly for controlling compilation settings.
193
+ By centralizing these options, it ensures consistent and flexible configuration of
194
+ compilation parameters such as optimization level, debugging control, etc.
195
+
196
+ :param options: The options for the function. Will be parsed by argparse.
197
+ :type options: str
198
+ """
199
+ if not isinstance(options, str):
200
+ raise DSLRuntimeError(
201
+ f"Invalid compilation `options`: {options}, it should be a string"
202
+ )
203
+ self._parser = argparse.ArgumentParser()
204
+ self._parser.add_argument("--opt-level", nargs="?", type=int, default=3)
205
+ self._parser.add_argument(
206
+ "--enable-device-assertions", action="store_true", default=False
207
+ )
208
+ self._parser.add_argument("--link-libraries", type=str, default="")
209
+
210
+ try:
211
+ self._options = self._parser.parse_args(options.split())
212
+ except SystemExit as e:
213
+ # catch argparse error and raise as DSLRuntimeError
214
+ raise DSLRuntimeError(
215
+ f"Invalid compile options: '{options}'. Please check the option values and format."
216
+ )
217
+ log().info("`cute.compile` CompileOptions: options=" + options)
218
+
219
+ def to_str(self):
220
+ """
221
+ Generate a string representation of all compilation options
222
+ which will be used in pipeline options.
223
+ """
224
+ option_strings = []
225
+ for key, value in vars(self._options).items():
226
+ hyphen_key = key.replace("_", "-")
227
+ if isinstance(value, bool):
228
+ formatted_value = "true" if value else "false"
229
+ else:
230
+ formatted_value = str(value)
231
+ option_strings.append(f"{hyphen_key}={formatted_value}")
232
+
233
+ return " ".join(option_strings)
234
+
235
+
236
+ def compile(func, *args, **kwargs):
237
+ """
238
+ This function is used to compile a `cute.jit` decorated function.
239
+ It will process the compile options and input parameters, do explicit compilation and return the jit executor.
240
+
241
+ :param func: The function to compile. It can be a regular function, a method or a class instance.
242
+ :param args: The arguments to pass to the function.
243
+ :param kwargs: The keyword arguments to pass to the function. It can contain `options` like
244
+ `opt_level` to control the compilation flags.
245
+
246
+ :return: The jit executor.
247
+
248
+ :raises: DSLRuntimeError if the function is not decorated with `cute.jit` or is not callable.
249
+ """
250
+ if func is None:
251
+ raise DSLRuntimeError("Function is not set or invalid.")
252
+
253
+ if not callable(func):
254
+ raise DSLRuntimeError("Object is not callable.")
255
+
256
+ kwargs["compile_only"] = True
257
+ kwargs["no_cache"] = True
258
+
259
+ if inspect.isfunction(func):
260
+ # regular function
261
+ pass
262
+ elif inspect.ismethod(func):
263
+ # if it's a method, add the instance to the first argument
264
+ args = [func.__self__] + list(args)
265
+ func = func.__func__
266
+ elif inspect.isclass(type(func)) and hasattr(func, "__call__"):
267
+ # If it's a class instance, get the class's __call__ method
268
+ args = [func] + list(args)
269
+ # Get the actual function from the class definition
270
+ func = func.__call__.__func__
271
+ else:
272
+ raise DSLRuntimeError(
273
+ "Invalid function type, only function, method and module are supported, but got",
274
+ func,
275
+ )
276
+
277
+ # If it's a wrapped function created by jit decorator, get the original function
278
+ if hasattr(func, "__wrapped__"):
279
+ func = func.__wrapped__
280
+
281
+ if not hasattr(func, "_dsl_object"):
282
+ raise DSLRuntimeError("Function is not decorated with jit decorator.")
283
+
284
+ # process compile options, extract the options and remove them from the kwargs
285
+ options = kwargs.pop("options", "")
286
+ func._dsl_object.compile_options = CompileOptions(options)
287
+ fcn_ptr = func._dsl_object._preprocess_and_execute(func)
288
+ return func._dsl_object._func(fcn_ptr, *args, **kwargs)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/dsl.py ADDED
@@ -0,0 +1,1686 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides a main DSL class for any Dialect.
14
+ The DSL should be inherited as a new class, and its initialization requires dialects.
15
+ It handles most of the mechanics for the DSL in an agnostic way,
16
+ for example, it can handle various dialect-specific tasks.
17
+ """
18
+
19
+
20
+ # Standard library imports
21
+ from dataclasses import dataclass, field
22
+ import atexit
23
+ import os
24
+ import io
25
+ import sys
26
+ import errno
27
+ import ctypes
28
+ import re
29
+ import inspect
30
+ import argparse
31
+ import hashlib
32
+ from functools import lru_cache, wraps
33
+ from collections import namedtuple
34
+ from abc import ABC, abstractmethod
35
+ from typing import Any, Union, Tuple, get_origin, get_args, List
36
+ from types import FunctionType, SimpleNamespace
37
+ import warnings
38
+
39
+ from . import typing as t
40
+ from .env_manager import EnvironmentVarManager
41
+ from .compiler import CompileOptions
42
+ from .ast_helpers import DSLOptimizationWarning
43
+
44
+ # =============================================================================
45
+ # CUDA Python
46
+ # =============================================================================
47
+
48
+ from ..base_dsl._mlir_helpers.arith import const
49
+
50
+ # =============================================================================
51
+ # Local module imports
52
+ # =============================================================================
53
+
54
+ from .cache_helpers import *
55
+ from .jit_executor import JitExecutor
56
+ from .utils.timer import timer
57
+ from .utils.logger import setup_log, log
58
+ from .utils.stacktrace import filter_exception, walk_to_top_module, filter_stackframe
59
+ from .runtime.jit_arg_adapters import is_argument_constexpr, JitArgAdapterRegistry
60
+
61
+ from .ast_preprocessor import DSLPreprocessor
62
+ from .common import *
63
+ from .typing import (
64
+ get_c_pointers,
65
+ get_mlir_types,
66
+ )
67
+
68
+ # =============================================================================
69
+ # MLIR modules
70
+ # =============================================================================
71
+
72
+ from .._mlir import ir
73
+ from .._mlir import runtime as rt
74
+ from .._mlir.extras import types as T
75
+ from .._mlir.dialects import arith, math, func
76
+
77
+ # =============================================================================
78
+ # Global Variables
79
+ # =============================================================================
80
+
81
+ MLIR_DYNAMIC = -9223372036854775808
82
+
83
+ # =============================================================================
84
+ # Codegen Utils
85
+ # =============================================================================
86
+
87
+
88
+ def _numpy_type_to_mlir_type(dtype):
89
+ if dtype == np.float64:
90
+ return T.f64()
91
+ if dtype == np.float16:
92
+ return T.f16()
93
+ if dtype == np.float32:
94
+ return T.f32()
95
+ if dtype == np.int64:
96
+ return T.i64()
97
+ if dtype == np.int32:
98
+ return T.i32()
99
+ if dtype == np.int16:
100
+ return T.i16()
101
+ if dtype == np.int8:
102
+ return T.i8()
103
+ if dtype == np.uint64:
104
+ return T.ui64()
105
+ if dtype == np.uint32:
106
+ return T.ui32()
107
+ if dtype == np.uint16:
108
+ return T.ui16()
109
+ if dtype == np.uint8:
110
+ return T.ui8()
111
+ if dtype == np.bool_:
112
+ return T.bool()
113
+ if dtype == f8E5M2:
114
+ return T.f8E5M2()
115
+ if dtype == f8E4M3FN:
116
+ return T.f8E4M3FN()
117
+ if dtype == f8E8M0FNU:
118
+ return T.f8E8M0FNU()
119
+ if dtype == f6E3M2FN:
120
+ return T.f6E3M2FN()
121
+ if dtype == f6E2M3FN:
122
+ return T.f6E2M3FN()
123
+ if dtype == f4E2M1FN:
124
+ return T.f4E2M1FN()
125
+ assert False, f"Unknown type {type}"
126
+
127
+
128
+ def _mlir_type_to_numpy_type(type):
129
+ if type == T.f64():
130
+ return np.float64
131
+ if type == T.f16():
132
+ return np.float16
133
+ if type == T.f32():
134
+ return np.float32
135
+ if type == T.i64():
136
+ return np.int64
137
+ if type == T.i32():
138
+ return np.int32
139
+ if type == T.i16():
140
+ return np.int16
141
+ if type == T.i8():
142
+ return np.int8
143
+ if type == T.ui64():
144
+ return np.uint64
145
+ if type == T.ui32():
146
+ return np.uint32
147
+ if type == T.ui16():
148
+ return np.uint16
149
+ if type == T.ui8():
150
+ return np.uint8
151
+ if type == T.bool():
152
+ return np.bool_
153
+ assert False, f"Unknown type {type}"
154
+
155
+
156
+ # =============================================================================
157
+ # Main DSL Class
158
+ # =============================================================================
159
+
160
+
161
+ def is_dynamic_expression(value):
162
+ """
163
+ Given the `value`, check if itself is an IR value or recursively go through it to check if it contains IR value
164
+ """
165
+ if isinstance(value, (tuple, list)):
166
+ for x in value:
167
+ if is_dynamic_expression(x):
168
+ return True
169
+ elif isinstance(value, (ir.Value, ir.BlockArgumentList)) or hasattr(
170
+ value, "__extract_mlir_values__"
171
+ ):
172
+ return True
173
+ return False
174
+
175
+
176
+ def extract_mlir_values(obj):
177
+ """
178
+ Given the `obj`, recursively go through it to extract all contained IR values as list of MLIR values
179
+ """
180
+ res = []
181
+ if hasattr(obj, "__extract_mlir_values__"):
182
+ res = obj.__extract_mlir_values__()
183
+ elif isinstance(obj, (tuple, list)):
184
+ res = sum((extract_mlir_values(x) for x in obj), [])
185
+ elif isinstance(obj, SimpleNamespace):
186
+ res = []
187
+ for k, v in obj.__dict__.items():
188
+ res.extend(extract_mlir_values(v))
189
+ # Can't call is_dynamic_expression as _is_dynamic_expression depends on extract_mlir_values
190
+ elif isinstance(obj, set):
191
+ raise DSLRuntimeError(
192
+ "Sets are not supported in extract_mlir_values to ensure order preservation",
193
+ context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.",
194
+ suggestion="Consider using a list or tuple instead",
195
+ )
196
+ elif isinstance(obj, ir.Value):
197
+ res = [obj]
198
+ elif isinstance(obj, ir.BlockArgumentList):
199
+ res = list(obj) # type: ignore
200
+
201
+ return res
202
+
203
+
204
def new_from_mlir_values(obj, values):
    """
    Create a new python object by populating containing MLIR values with list of new values.

    Recursively rebuilds `obj`, replacing each dynamic (MLIR) value it contains
    with the corresponding entry of `values`.  Values are consumed in the same
    deterministic order that `extract_mlir_values` produces them, which is why
    unordered containers (sets) are rejected.
    """
    if hasattr(obj, "__new_from_mlir_values__"):
        # Object implements the DynamicExpression protocol: let it rebuild itself.
        return obj.__new_from_mlir_values__(values)
    elif isinstance(obj, (tuple, list)):
        res = []
        for x in obj:
            # Each element consumes exactly as many values as it has MLIR types.
            n_items = len(get_mlir_types(x))
            res.append(new_from_mlir_values(x, values[:n_items]))
            values = values[n_items:]
        # Preserve the concrete container type (tuple vs list, incl. subclasses).
        obj_ty = type(obj)
        return obj_ty(res)
    elif isinstance(obj, SimpleNamespace):
        res = SimpleNamespace()
        for k, v in obj.__dict__.items():
            n_items = len(get_mlir_types(v))
            res.__dict__[k] = new_from_mlir_values(v, values[:n_items])
            values = values[n_items:]
        return res
    elif isinstance(obj, set):
        raise DSLRuntimeError(
            "Sets are not supported in new_from_mlir_values to ensure order preservation",
            context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.",
            suggestion="Consider using a list or tuple instead",
        )
    elif is_dynamic_expression(obj):
        if len(values) == 0:
            # Nothing to substitute; keep the original dynamic expression.
            return obj

        # A single dynamic expression maps to exactly one replacement value.
        assert len(values) == 1
        return values[0]
    else:
        # Static (compile-time) object: it must not consume any values.
        assert len(values) == 0, f"{obj} expects 0 values, but got {values}"
        return obj
241
+
242
+
243
class DSLCallable:
    """
    One-shot wrapper around a callable used within the DSL.

    Wraps a function and exposes introspection helpers (argument signature,
    name) while guaranteeing that the wrapped function is invoked at most
    once: after the first call the stored reference is dropped, and any
    further use trips an assertion.  This matches the DSL execution model,
    where a materialized function body must only be executed a single time.

    Attributes:
        func (callable): The wrapped function; reset to None after the call.
    """

    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        result = self.__func__(*args, **kwargs)
        # Drop the reference so the wrapped function cannot run a second time.
        self.func = None
        return result

    @property
    def __func__(self):
        # Guard every access: once called, the wrapper is spent.
        assert self.func is not None, "DSLCallable is already called"
        return self.func

    @property
    def __signature__(self):
        return inspect.signature(self.__func__)

    @property
    def __name__(self):
        return self.__func__.__name__
281
+
282
+
283
+ class BaseDSL:
284
+ gpu_module = None
285
+
286
    def __init__(
        self,
        *,
        name: str,
        dsl_package_name: List[str],
        compiler_provider: Any,
        pass_sm_arch_name: str,
        device_compilation_only=False,
        preprocess=False,
    ):
        """
        Constructor for initializing the class with required providers and environment settings.

        Parameters:
        - name (str): Name of DSL, used for environment variables and logging.
        - dsl_package_name (List[str]): Name of the DSL package, forwarded to the preprocessor.
        - compiler_provider (MLIR dialect): Provider for compiler.
        - pass_sm_arch_name (str): The keyword name of the SM.
        - device_compilation_only (bool): Only device code, and call it via cuda driver.
        - preprocess (bool): Enable AST transformation.

        This constructs a DSL instance and sets up environment management,
        warning configurations, and logging functionalities. It reads
        environment variables using `EnvironmentVarManager` and configures
        a logger with settings from the environment. If environment warnings
        are detected, they are escalated to errors to ensure strict handling.

        Raises:
            DSLRuntimeError: if any of `name`, `compiler_provider`, or
                `pass_sm_arch_name` is missing/empty.
        """
        # Enforcing initialization of instance variables
        if not all([name, compiler_provider, pass_sm_arch_name]):
            raise DSLRuntimeError(
                "All required parameters must be provided and non-empty"
            )

        self.name = name
        self.compiler_provider = compiler_provider
        self.pass_sm_arch_name = pass_sm_arch_name
        # Caller frame of the decoration site; set during preprocessing, else None.
        self.frame = None
        self.no_cache = False
        self.device_compilation_only = device_compilation_only
        self.num_kernels = 0
        # Read environment variables
        self.envar = EnvironmentVarManager(self.name)
        self.enable_preprocessor = preprocess
        # This cache uses hash of original ir and env as key, allows dump/load to/from file. Enabled by default
        self.jit_cache = (
            dict()
            if self.envar.disable_file_caching
            else load_cache_from_path(self.name, self.envar.file_caching_capacity)
        )
        self.host_jit_decorator_name = f"@{BaseDSL.jit.__name__}"
        self.device_jit_decorator_name = f"@{BaseDSL.kernel.__name__}"

        # set warning
        if not self.envar.enable_optimization_warnings:
            # By default, optimization warnings are disabled
            warnings.filterwarnings("ignore", category=DSLOptimizationWarning)
        if self.envar.warnings_as_errors:
            warnings.filterwarnings("error")
        if self.envar.warnings_ignore:
            warnings.filterwarnings("ignore")

        # Initialize logger
        if self.envar.log_to_console == False and self.envar.jitTimeProfiling:
            # Profiling output is emitted through the logger, so force console
            # logging on at info level when profiling is requested.
            self.envar.log_to_console = True
            self.envar.log_level = 20  # info level
        setup_log(
            self.name,
            self.envar.log_to_console,
            self.envar.log_to_file,
            f"{self.name}.log",
            self.envar.log_level,
        )

        # kernel symbols are temporary symbol string variables, their values are valid until the compilation is done.
        self.kernel_symbols = []
        # used to generate unique name for gpu.launch
        self.launch_inner_count = 0
        # initialize default compile options
        self.compile_options = CompileOptions()

        if preprocess:
            self.preprocessor = DSLPreprocessor(dsl_package_name)
        log().info(f"Initializing {name} DSL")
        log().debug(f"Logger initialized for {self.name}")

        # Hook excepthook
        if self.envar.filterStacktrace:
            origin_excepthook = sys.excepthook
            module_dir = walk_to_top_module(os.path.dirname(os.path.abspath(__file__)))

            def excepthook(excep_type, value, traceback):
                # Strip DSL-internal frames so tracebacks point at user code.
                filter_exception(value, module_dir)
                if hasattr(value, "__traceback__"):
                    origin_excepthook(excep_type, value, value.__traceback__)
                else:
                    origin_excepthook(
                        excep_type, value, filter_stackframe(traceback, module_dir)
                    )

            sys.excepthook = excepthook

            # Restore original excepthook
            def restore_excepthook(hook):
                sys.excepthook = hook

            atexit.register(restore_excepthook, origin_excepthook)
392
+
393
+ def dump_cache(self):
394
+ if not self.envar.disable_file_caching:
395
+ dump_cache_to_path(
396
+ self.name, self.jit_cache, self.envar.file_caching_capacity
397
+ )
398
+
399
+ @lru_cache(maxsize=1)
400
+ def print_warning_once(self, message):
401
+ log().warning(f"Warning: {message}")
402
+ warnings.warn(message, UserWarning)
403
+
404
    def print_warning(self, message):
        """Log *message* as a warning and also emit it as a Python UserWarning."""
        log().warning(f"Warning: {message}")
        warnings.warn(message, UserWarning)
407
+
408
    @classmethod
    @lru_cache(maxsize=1)
    def _get_dsl(cls):
        """
        Return the singleton DSL instance for this class.

        The `lru_cache` keyed on `cls` makes this a one-time constructor;
        subsequent calls return the same instance.
        """
        # Instantiate the DSL Class once
        main_dsl = cls()
        if not main_dsl.no_cache:
            # register atexit callback so the JIT cache is flushed to disk on exit
            atexit.register(main_dsl.dump_cache)
        return main_dsl
417
+
418
+ @staticmethod
419
+ def _can_preprocess(**dkwargs):
420
+ """
421
+ Check if AST transformation is enabled or not for `jit` and `kernel` decorators.
422
+ """
423
+ return dkwargs.pop("preprocess", True)
424
+
425
    @staticmethod
    def _get_original_function(fcn_ptr, name):
        """
        Get the original function from the decorated function.

        Walks the decorator chain until a function whose ``__name__`` matches
        `name` is found, following either functools-style ``__wrapped__``
        attributes or the first closure cell of manually written wrappers.

        Raises:
            DSLRuntimeError: if the chain cannot be followed any further.
        """
        while fcn_ptr.__name__ != name:
            # If the function is wrapped with functools, get from __wrapped__
            if hasattr(fcn_ptr, "__wrapped__"):
                fcn_ptr = fcn_ptr.__wrapped__
            # If the function is wrapped manually, it's the first in closure
            elif callable(fcn_ptr.__closure__[0].cell_contents):
                fcn_ptr = fcn_ptr.__closure__[0].cell_contents
            else:
                raise DSLRuntimeError(
                    f"Cannot find the original function {name} in the closure chain"
                )
        return fcn_ptr
442
+
443
    @staticmethod
    def _preprocess_and_execute(func):
        """
        Run ast transformation and return the materialized function pointer.

        Functions decorated with preprocessing enabled carry a
        ``_transformed_ast`` attribute (initially None).  On first call the
        preprocessor runs lazily; if it declines (returns None) the marker is
        removed and the original function is used unchanged.  Otherwise the
        materialized function is unwrapped back to the user's function and
        returned as a single-use DSLCallable.
        """
        if hasattr(func, "_transformed_ast"):
            # If the function ptr is already materialized, use the existing one
            # The preprocessor needs the decoration-site frame for name lookup.
            func._dsl_object.frame = func._decorator_frame
            if func._transformed_ast is None:
                func._transformed_ast = func._dsl_object.run_preprocessor(func)
                if func._transformed_ast is None:
                    # Preprocessor opted out: remove the marker and fall through
                    # to the original, untransformed function.
                    del func._transformed_ast
                    func._dsl_object.frame = None
                    return func

            fcn_ptr = func._dsl_object.get_function_ptr(func)
            # If the function is decorated, de-decorate it
            fcn_ptr = BaseDSL._get_original_function(fcn_ptr, func.__name__)
            func._dsl_object.frame = None
            return DSLCallable(fcn_ptr)
        return func
464
+
465
    def jit_runner(self, executor, frame, *dargs, **dkwargs):
        """
        Decorator to mark a function for JIT compilation.

        Parameters:
        - executor: callable invoked with the (possibly preprocessed) function
          plus the call-site arguments (e.g. `_func` for host, `_kernel_helper`
          for device).
        - frame: the caller's frame captured at decoration time, needed later
          when materializing the transformed AST.
        - dargs/dkwargs: decorator arguments; supports both bare `@jit` and
          parameterized `@jit(...)` usage.
        """
        log().info("jit_runner")

        def jit_runner_decorator(func):
            func._dsl_object = self
            # Run preprocessor that alters AST
            if self.enable_preprocessor and BaseDSL._can_preprocess(**dkwargs):
                # For an annotated function, add some DSL attributes
                # When materializing the AST, we need decorator's frame
                func._decorator_frame = frame
                # No transformed ast at this point
                func._transformed_ast = None

            @wraps(func)
            def jit_wrapper(*args, **kwargs):
                # Preprocessing is deferred to the first actual call.
                func_ptr = BaseDSL._preprocess_and_execute(func)
                return executor(func_ptr, *args, **kwargs)

            return jit_wrapper

        # Bare decorator usage: @jit with no parentheses.
        if len(dargs) == 1 and callable(dargs[0]):
            return jit_runner_decorator(dargs[0])
        else:
            return jit_runner_decorator
492
+
493
+ @classmethod
494
+ def jit(cls, *dargs, **dkwargs):
495
+ """
496
+ Decorator to mark a function for JIT compilation for Host code.
497
+ """
498
+ frame = inspect.currentframe().f_back
499
+ # Instantiate the DSL Class
500
+ main_dsl = cls._get_dsl()
501
+ return main_dsl.jit_runner(main_dsl._func, frame, *dargs, **dkwargs)
502
+
503
+ @classmethod
504
+ def kernel(cls, *dargs, **dkwargs):
505
+ """
506
+ Decorator to mark a function for JIT compilation for GPU.
507
+ """
508
+ frame = inspect.currentframe().f_back
509
+ # Instantiate the DSL Class
510
+ main_dsl = cls._get_dsl()
511
+ return main_dsl.jit_runner(main_dsl._kernel_helper, frame, *dargs, **dkwargs)
512
+
513
    @abstractmethod
    def _kernel_helper(self, func, *args, **kwargs):
        """
        Helper function to handle kernel generation logic.

        Subclasses implement device-side (GPU) code generation here; used as
        the executor for the `kernel` decorator.
        """
        pass
519
+
520
    @abstractmethod
    def _build_gpu_module(self, attrs):
        """
        Build the module op that contains the kernels.

        `attrs` carries the attributes to attach to the GPU module op.
        """
        pass
526
+
527
+ @abstractmethod
528
+ def _get_pipeline(self, pipeline):
529
+ """
530
+ Get the pipeline from the other configuration options.
531
+ """
532
+ if pipeline != None:
533
+ return pipeline
534
+ return None
535
+
536
+ @staticmethod
537
+ def log_additions(func_type, operands=None, types=None, arg_attrs=None):
538
+ if operands is not None and operands != []:
539
+ log().debug(
540
+ f"Added {func_type} operands: [%s]", ", ".join(map(str, operands))
541
+ )
542
+ if types is not None:
543
+ log().debug(
544
+ f"Added {func_type} arg_types: [%s]", ", ".join(map(str, types))
545
+ )
546
+ if arg_attrs is not None:
547
+ log().debug(
548
+ f"Added {func_type} arg_attrs: [%s]", ", ".join(map(str, arg_attrs))
549
+ )
550
+
551
    def mangle_name(self, function_name, args, args_spec: inspect.FullArgSpec):
        """
        Does simple name mangling.

        Appends a textual encoding of the compile-time (annotated, non-IR)
        argument values to `function_name`, then sanitizes the result into a
        filesystem/symbol-safe identifier capped at 180 characters.
        """

        for spec_arg, arg in zip(args_spec.args, args):
            spec_ty = args_spec.annotations.get(spec_arg, None)
            if spec_ty != None:
                # Skip anything that is a runtime IR value/type: such arguments
                # do not contribute to the mangled (compile-time) identity.
                if issubclass(type(spec_ty), (t.IRValue, t.IRVariadic)):
                    continue
                if isinstance(spec_ty, (ir.Type, ir.Value)):
                    continue
                if isinstance(arg, (ir.Type, ir.Value, ir.OpResult)):
                    continue
                if isinstance(type(arg), (ir.Type, ir.Value, ir.OpResult)):
                    continue
                if self._is_tensor_descriptor(arg):
                    continue
                if inspect.isclass(spec_ty):
                    class_name = str(arg).replace("class", "")
                    class_name = class_name.replace(" ", "")
                    function_name = f"{function_name}_{class_name}"
                elif isinstance(arg, (list, tuple)):
                    function_name = f"{function_name}_{'_'.join(map(str, arg))}"
                else:
                    function_name = f"{function_name}_{arg}"
        # we would need a dedicated MR to follow up
        # Strip characters that are not valid in symbol names.
        unwanted_chars = r"'-![]#,.<>()\":{}=%?@;"
        translation_table = str.maketrans("", "", unwanted_chars)
        function_name = function_name.translate(translation_table)
        # identify address and drop (object reprs embed 0x... addresses that
        # would make the name — and hence the cache key — non-deterministic)
        function_name = re.sub(r"0x[a-f0-9]{8,16}", "", function_name)
        function_name = re.sub(r"\s+", " ", function_name)
        function_name = function_name.replace(" ", "_")
        function_name = function_name.replace("\n", "_")
        # max fname is 256 character, leave space
        function_name = function_name[:180]
        log().info(f"Final mangled function name: {function_name}")
        return function_name
588
+
589
    def _generate_execution_arguments_for_known_types(
        self, arg, arg_spec, arg_name, i, fop_args, iv_block_args
    ):
        """
        Generate MLIR arguments for known types.

        Sub-DSLs can override this method to handle types that are not
        natively supported by the Base DSL.

        Returns a (ir_arg, iv_block_args) pair: the list of generated
        arguments (possibly empty) and the updated block-argument cursor.
        """
        ir_arg = []
        # NOTE(review): `func` is not defined in this scope — it is neither a
        # parameter nor a visible local — so this base implementation would
        # raise NameError if this line is reached.  Presumably overriding
        # sub-DSLs shadow this path; confirm before relying on the base class.
        if is_argument_constexpr(arg, arg_spec, arg_name, i, func):
            # Constexpr arguments are passed through as-is (no MLIR value).
            ir_arg.append(arg)

        return ir_arg, iv_block_args
603
+
604
    def generate_execution_arguments(
        self,
        args,
        kwargs,
        fop,
        args_spec: inspect.FullArgSpec,
    ):
        """
        Create list of arguments that will be passed to MLIR's func.func op.

        Maps each Python-level call argument onto the entry-block arguments of
        `fop`, either via the known-type hook, or by rebuilding the object
        around block arguments with `new_from_mlir_values` (after an optional
        JIT argument adapter conversion).
        """

        def gen_exec_args(input_args, arg_names, annotations, fop_args):
            assert len(input_args) == len(arg_names)

            ir_args = []
            # Cursor into fop_args: how many block arguments are consumed so far.
            iv_block_args = 0
            for i, arg in enumerate(input_args):
                arg_name = arg_names[i]
                arg_spec = annotations.get(arg_name, None)
                log().debug("Processing [%d] Argument [%s : %s]", i, arg_name, arg_spec)

                # Implicit cast to NumericMeta
                if isinstance(arg_spec, t.NumericMeta) and not isinstance(
                    arg, arg_spec
                ):
                    arg = t.cast(arg, arg_spec)

                ir_arg, iv_block_args = (
                    self._generate_execution_arguments_for_known_types(
                        arg, arg_spec, arg_name, i, fop_args, iv_block_args
                    )
                )

                if not ir_arg:
                    # If it's not a known type, try JIT argument adapter
                    # to convert the argument if possible
                    adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg))
                    arg = adapter(arg) if adapter else arg

                    # Rebuild the object around its slice of block arguments.
                    n_args = len(get_mlir_types(arg))
                    blk_args = fop_args[iv_block_args : iv_block_args + n_args]
                    ir_arg.append(new_from_mlir_values(arg, blk_args))
                    iv_block_args += n_args

                # NOTE(review): ir_arg is passed as log_additions' func_type
                # positional parameter — looks like the operands= keyword was
                # intended; confirm against log output expectations.
                self.log_additions(ir_arg)
                ir_args.extend(ir_arg)

            return ir_args, iv_block_args

        fop_args = list(fop.regions[0].blocks[0].arguments)
        ir_args, iv_block_args = gen_exec_args(
            args, args_spec.args, args_spec.annotations, fop_args
        )
        # Keyword-only arguments consume the remaining block arguments.
        ir_kwargs, _ = gen_exec_args(
            [kwargs[arg] for arg in args_spec.kwonlyargs],
            args_spec.kwonlyargs,
            args_spec.annotations,
            fop_args[iv_block_args:],
        )
        ir_kwargs = {k: v for k, v in zip(args_spec.kwonlyargs, ir_kwargs)}

        log().debug("execution args: %s", ", ".join(map(str, ir_args)))
        log().debug("execution kwargs: %s", ", ".join(map(str, ir_kwargs)))
        return ir_args, ir_kwargs
666
+
667
    @abstractmethod
    def _generate_mlir_type_for_tensor_descriptor(self, tensor):
        """
        Generate MLIR type for the tensor descriptor.
        """
        pass
673
+
674
    @abstractmethod
    def _generate_executable_arg_for_tensor_descriptor(
        self, mlir_value=None, ptr_tensor_ty=None, tensor=None
    ):
        """
        Generates executable value for the given tensor descriptor.
        """
        pass
682
+
683
+ def _get_globals(self):
684
+ """
685
+ Combines global and local variables from the current context and the
686
+ caller's frame comes. This includes the current module's globals, the
687
+ global variables from the caller's frame, and the local variables from
688
+ the caller's frame.
689
+
690
+ "self.frame" is used to fetch the caller's frame.
691
+
692
+ AST preprocessor generates a new python code, so the resulting globals
693
+ dictionary is used to execute the python code.
694
+ """
695
+ all_globals = {}
696
+ if self.frame:
697
+ all_globals.update(self.frame.f_globals)
698
+ all_globals.update(self.frame.f_locals)
699
+ return all_globals
700
+
701
    @abstractmethod
    def _is_tensor_descriptor(self, maybe_tensor_descriptor) -> bool:
        """Return True if the given object is a tensor descriptor for this DSL."""
        pass
704
+
705
    @abstractmethod
    def _handle_tensor_descriptor(
        self, maybe_tensor, arg_name: str, need_gpu_memory: bool
    ) -> Any:
        """Process a tensor-descriptor argument (subclass-specific handling)."""
        pass
710
+
711
    def _validate_arg(self, arg, arg_index, arg_name, arg_spec):
        """
        Validates if the arg is really of the annotated type for type safety.

        The default implementation is empty. Subclasses can override this method to add more validation logic.
        Returns None if validation passes, otherwise returns an error derived from DSLBaseError.
        """
        pass
719
+
720
+ def _generate_jit_func_args_for_known_types(
721
+ self,
722
+ func,
723
+ arg,
724
+ arg_name,
725
+ arg_spec,
726
+ arg_index,
727
+ *,
728
+ is_host=True,
729
+ ):
730
+ """
731
+ Generate JIT function arguments for known types.
732
+
733
+ Sub-DSLs can override this method to handle types that are not
734
+ natively supported by the Base DSL.
735
+ """
736
+
737
+ jit_arg_type, jit_arg_attr, jit_exec_arg = [], [], []
738
+ default_attr = ir.DictAttr.get({})
739
+
740
+ if is_argument_constexpr(arg, arg_spec, arg_name, arg_index, func):
741
+ jit_exec_arg = jit_arg_type = jit_arg_attr = None
742
+
743
+ return jit_exec_arg, jit_arg_type, jit_arg_attr
744
+
745
    def _generate_jit_func_args(
        self,
        func,
        function_name,
        args,
        kwargs,
        args_spec: inspect.FullArgSpec,
    *,
        is_host=True,
    ):
        """
        Generate JIT function arguments.

        For each call-site argument, produces the execution values (C pointers
        on the host path, MLIR values on the device path), their MLIR types,
        and per-argument attribute dicts.  Constexpr arguments are skipped;
        unknown types go through the JIT argument adapter registry.

        Returns (jit_exec_args, jit_arg_types, jit_arg_attrs, jit_adapted_args).
        """

        assert len(args) == len(args_spec.args) and len(kwargs) == len(
            args_spec.kwonlyargs
        ), (
            f"Input args {len(args)=} and kwargs {len(kwargs)=} must match arg_spec.args "
            f"{len(args_spec.args)=} and arg_spec.kwonlyargs {len(args_spec.kwonlyargs)=}"
        )

        jit_arg_types, jit_arg_attrs, jit_exec_args = [], [], []
        # Adapter-converted arguments are kept alive and returned to the caller.
        jit_adapted_args = []
        default_attr = ir.DictAttr.get({})

        input_args = [*args, *kwargs.values()]
        input_arg_names = [*args_spec.args, *args_spec.kwonlyargs]
        for i, (arg_name, arg) in enumerate(zip(input_arg_names, input_args)):
            spec_ty = args_spec.annotations.get(arg_name, None)
            log().debug("Processing [%d] Argument [%s : %s]", i, arg_name, spec_ty)

            # Implicitly convert into Numeric type if possible
            if isinstance(spec_ty, t.NumericMeta) and not isinstance(arg, spec_ty):
                arg = t.cast(arg, spec_ty)

            # Type safety check
            if spec_ty is not None:
                err = self._validate_arg(arg, i, arg_name, spec_ty)
                if err is not None:
                    raise err

            jit_exec_arg, jit_arg_type, jit_arg_attr = (
                self._generate_jit_func_args_for_known_types(
                    func,
                    arg,
                    arg_name,
                    spec_ty,
                    i,
                    is_host=is_host,
                )
            )

            # None means Constexpr (skip); empty list means "not a known type".
            if jit_arg_type is not None and len(jit_arg_type) == 0:
                # If not any known type, try JIT argument adapter
                # to convert the argument
                adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg))
                if adapter:
                    arg = adapter(arg)
                    jit_adapted_args.append(arg)

                if is_host:
                    # Host path: pass C pointers, typed by the MLIR types.
                    jit_exec_arg.extend(get_c_pointers(arg))
                    jit_arg_type.extend(get_mlir_types(arg))
                else:
                    # Device path: pass the extracted MLIR values directly.
                    dyn_vals = extract_mlir_values(arg)
                    jit_exec_arg.extend(dyn_vals)
                    jit_arg_type.extend([v.type for v in dyn_vals])

                if not jit_arg_type or not jit_exec_arg:
                    # Objects implementing the relevant protocol may legally
                    # contribute zero values; anything else is an error.
                    if (is_host and hasattr(arg, "__c_pointers__")) or (
                        not is_host
                        and hasattr(arg, "__extract_mlir_values__")
                        and hasattr(arg, "__new_from_mlir_values__")
                    ):
                        pass
                    else:
                        raise DSLRuntimeError(
                            f"failed to generate argument #{i+1} ({arg_name}) for JIT function '{function_name}'.",
                            context={
                                f"Argument {arg_name}": "The DSL attempted to convert it into Dynamic Expression (aka MLIR values) but failed.",
                                f"Call-site argument value": arg,
                                f"Call-site argument type": type(arg),
                            },
                            suggestion=f"Consider annotating the argument with `{arg_name} : Constexpr` "
                            "if it's a value known at compile-time. "
                            f"Otherwise, implement the {'`JitArgument`' if is_host else '`DynamicExpression`'} "
                            f"protocol or register a custom JIT argument adapter for type `{type(arg)}` to "
                            "enable dynamic value conversion at runtime.",
                        )

                # Adapter-produced types all get the empty attribute dict.
                jit_arg_attr.extend([default_attr] * len(jit_arg_type))

            if jit_arg_type is not None:
                jit_exec_args.extend(jit_exec_arg)
                jit_arg_types.extend(jit_arg_type)
                jit_arg_attrs.extend(jit_arg_attr)

        return jit_exec_args, jit_arg_types, jit_arg_attrs, jit_adapted_args
841
+
842
    def generate_mlir_function_types(
        self, func, function_name, input_args, kwargs, args_spec: inspect.FullArgSpec
    ):
        """
        Convert input arguments to MLIR function signature also convert numpy arrays to memref.

        Returns (exe_args, types, adapted_args); the per-argument attribute
        dicts produced by `_generate_jit_func_args` are intentionally dropped
        on this host path.
        """

        exe_args, types, attrs, adapted_args = self._generate_jit_func_args(
            func, function_name, input_args, kwargs, args_spec, is_host=True
        )

        log().debug("Execution Arguments: %s", ", ".join(map(str, exe_args)))
        log().debug("Types: %s", ", ".join(map(str, types)))

        assert len(exe_args) == len(
            types
        ), "expects the same number of arguments and function parameters"

        return exe_args, types, adapted_args
859
+
860
    @dataclass
    class LaunchConfig:
        """GPU kernel launch configuration: grid/block/cluster shapes and shared memory."""

        # 3d cluster shape; None means "no cluster" (normalized in __post_init__).
        cluster: list = None
        # 3d grid shape.
        grid: list = field(default_factory=lambda: [1, 1, 1])
        # 3d block shape.
        block: list = field(default_factory=lambda: [1, 1, 1])
        # Dynamic shared memory in bytes; None requests automatic sizing.
        smem: int = None
        # Async token dependencies for the launch.
        async_deps: list = field(default_factory=list)
        # Derived in __post_init__: whether a cluster shape was provided.
        has_cluster: bool = False
        min_blocks_per_mp: int = 0
        # Derived in __post_init__: True when smem was left unspecified.
        auto_smem: bool = False

        def __post_init__(self):
            # Validate shapes and normalize the optional fields.
            if len(self.grid) != 3:
                raise DSLRuntimeError(f"Expect 3d grid!")
            if len(self.block) != 3:
                raise DSLRuntimeError(f"Expect 3d block!")

            if self.smem is None:
                self.smem = 0
                self.auto_smem = True

            self.has_cluster = self.cluster is not None
            if self.cluster is None:
                # Placeholder so downstream code can always index 3 entries.
                self.cluster = [None, None, None]
            elif len(self.cluster) != 3:
                raise DSLRuntimeError(f"Expect 3d cluster!")
886
+
887
    def diagnostic(self):
        """
        Check command line parameters and enables diagnostic.

        Attaches a diagnostic print handler to the current MLIR context, and
        if `-diagnostic` was passed on the command line, turns on MLIR's
        global debug flag filtered to the requested diagnostic category.
        """
        # Check command line arguments "-diagnostic"
        parser = argparse.ArgumentParser(description="Process diagnostic status.")
        parser.add_argument(
            "-diagnostic",
            nargs="?",
            const="all",
            choices=["all", "fail", "success", "info", "suggestion"],
            help="Set diagnostic status (fail, success, info, suggestion).",
        )

        # parse_known_args so unrelated application flags are left untouched.
        args, _ = parser.parse_known_args()
        ctx = ir.Context.current

        def callback(d):
            print(f" [{self.name} Diagnostic] : {d.message}")

        ctx.attach_diagnostic_handler(callback)

        # Early return, don't enable diagnostics
        if args.diagnostic is None:
            return

        # Enable MLIR Flags
        ctx.emit_error_diagnostics = True
        ir._GlobalDebug.flag = True
        if args.diagnostic == "all":
            ir._GlobalDebug.set_types("diagnostic")
        else:
            ir._GlobalDebug.set_types(f"diagnostic-{args.diagnostic}")
918
+
919
+ def get_location(self):
920
+ """
921
+ Get python location information and generate MLIR location
922
+ """
923
+
924
+ if self.frame is None:
925
+ log().debug("Frame is None")
926
+ return None
927
+
928
+ file_loc = ir.Location.file(
929
+ self.frame.f_code.co_filename, self.frame.f_lineno, 0
930
+ )
931
+
932
+ loc = ir.Location.name(self.frame.f_code.co_name, childLoc=file_loc)
933
+ return loc
934
+
935
    def compile_and_jit(self, module, pipeline, shared_libs, function_name=""):
        """
        Compile and JIT an MLIR module.

        stdout/stderr are temporarily redirected so compiler output can be
        replayed after the streams are restored; any failure is wrapped in a
        DSLRuntimeError (internal compiler error).
        """

        try:
            self.diagnostic()

            # Capture all compiler output; the originals are restored in the
            # inner `finally` before the captured text is replayed below.
            orig_stdout = sys.stdout
            orig_stderr = sys.stderr
            sys.stderr = redirect_stderr = io.StringIO()
            sys.stdout = redirect_stdout = io.StringIO()

            try:
                kernel = self.compiler_provider.compile_and_jit(
                    module,
                    pipeline,
                    shared_libs=shared_libs,
                    cuda_toolkit=self.envar.cuda_toolkit,
                    arch=self.envar.arch,
                )

            finally:
                sys.stdout = orig_stdout
                sys.stderr = orig_stderr
                # Diagnostics (if enabled above) are one-shot per compilation.
                ir._GlobalDebug.flag = False

            # Print captured output.
            print(redirect_stdout.getvalue(), file=sys.stdout, end="")
            print(redirect_stderr.getvalue(), file=sys.stderr, end="")

            return kernel

        except Exception as e:
            raise DSLRuntimeError("🧊🧊🧊 ICE 🧊🧊🧊", cause=e)
        finally:
            # NOTE(review): this outer `finally: pass` is a no-op and could be
            # removed; kept byte-identical here.
            pass
972
+
973
    def preprocess_pipeline(self, pipeline, arch) -> str:
        """
        Splice environment-derived options (CUDA toolkit path, SM arch) into
        the pass pipeline string.

        If the pipeline already carries a `{...}` options group the new options
        are merged into it; otherwise a fresh group is appended before the
        closing parenthesis.
        """

        if self.envar.cuda_toolkit is None:
            self.print_warning(
                "CUDA_TOOLKIT_PATH environment variable is not set. Cannot set toolkitPath."
            )

        options = {
            "toolkitPath": self.envar.cuda_toolkit if self.envar.cuda_toolkit else None,
            self.pass_sm_arch_name: arch,
        }

        opt_str = ""
        for k, v in options.items():
            if v:
                opt_str += f"{k}={v} "

        if opt_str:
            # Automatically append the pipeline options if any is specified through env var
            pattern = re.compile(r"{(.+)}")
            match = pattern.search(pipeline)
            if match:
                # Merge into the existing options group.
                opt_str = f"{{{match[1]} {opt_str}}}"
                pipeline = re.sub(r"{.+}", opt_str, pipeline)
            else:
                # No options group yet: insert one before the trailing ')'.
                pipeline = pipeline.rstrip(")") + f"{{{opt_str}}})"
        log().debug(f"Using pipeline = {pipeline}")
        return pipeline
1001
+
1002
+ def get_shared_libs(self) -> list:
1003
+ shared_libs = []
1004
+ support_libs = self.envar.shared_libs
1005
+ if support_libs is not None:
1006
+ _libs = support_libs.split(":")
1007
+ for lib in _libs:
1008
+ if not os.path.exists(lib):
1009
+ raise FileNotFoundError(
1010
+ errno.ENOENT, os.strerror(errno.ENOENT), lib
1011
+ )
1012
+ shared_libs.append(lib)
1013
+ else:
1014
+ self.print_warning(f"{self.name}_LIBS environment variable is not set")
1015
+
1016
+ return shared_libs
1017
+
1018
    @lru_cache(maxsize=1)
    def get_version(self):
        """
        Return the (cached) base hash object encoding the DSL version.

        Callers must `.copy()` the returned hashlib object before updating it,
        as `get_module_hash` does, since the cached instance is shared.

        NOTE(review): `lru_cache` on an instance method keys on `self` and
        keeps the instance alive in the cache (ruff B019) — acceptable for a
        singleton DSL, but confirm if instances are ever short-lived.
        """
        version_hash = hashlib.sha256()

        return version_hash
1023
+
1024
    def get_module_hash(self, module, function_name):
        """
        Compute the cache key for a module: SHA-256 over the module bytecode,
        all non-None environment settings, and the compile options.
        """
        s = io.BytesIO()
        module.operation.write_bytecode(s)
        # Environment changes must invalidate the cache, so fold them in.
        for attr, value in self.envar.__dict__.items():
            if value is not None:
                s.write(str(value).encode())
        # Add compile options to the hash
        s.write(self.compile_options.to_str().encode())
        # Copy the shared version hash before updating (see get_version).
        module_hash = self.get_version().copy()
        module_hash.update(s.getvalue())
        module_hash = module_hash.hexdigest()

        log().debug("Bytecode=[%s]", s.getvalue().hex())
        log().debug("Version=[%s]", self.get_version().hexdigest())
        log().info(
            "Function=[%s] Computed module_hash=[%s]", function_name, module_hash
        )
        return module_hash
1042
+
1043
    def build_module(self, module, function_name: str):
        """
        Build the MLIR module, verify and return the module.

        Depending on environment flags, also saves the IR to a file and/or
        prints it to stdout before verification.

        Raises:
            DSLRuntimeError: if IR verification fails (internal error).
        """

        # Save IR in a file
        if self.envar.keepIR:
            save_ir(self.name, module, function_name)

        if self.envar.printIR:
            print("\n//===--- ------ Generated IR ------ ---====\n")
            module.operation.print(
                enable_debug_info=self.envar.generate_source_location
            )
            print("\n//===--- --- End of Generated IR -- ---====\n")

        # Verify the module
        try:
            module.operation.verify()
        except Exception as e:
            raise DSLRuntimeError(f"🧊🧊🧊 ICE IR Verification Failed 🧊🧊🧊", cause=e)

        return module
1066
+
1067
    def generate_original_ir(
        self,
        ir,
        func,
        funcBody,
        kwargs,
        function_name,
        func_types,
        gpu_module_attrs,
        args,
        args_spec,
    ):
        """
        Build the original (pre-compilation) IR module for `funcBody`.

        Creates a module containing the GPU container module and a host
        func.func, executes the user's function body to trace IR into it, and
        returns (module, module_hash, result).
        """
        # This location is set to None for now; otherwise, calls to the same
        # function on different lines would produce different line numbers,
        # which would break the cache.
        loc = None  # self.get_location()

        def build_ir_module():
            module = ir.Module.create(loc=loc)
            unit_attr = ir.UnitAttr.get()
            module.operation.attributes["gpu.container_module"] = unit_attr

            with ir.InsertionPoint(module.body):
                # Always generate gpu module. It's canonicalized by the compiler when it's not used.
                self._build_gpu_module(gpu_module_attrs)

                fop = func.FuncOp(function_name, (func_types, []), loc=loc)
                fop.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get()
                log().debug("Generated Function OP [%s]", fop)
                with ir.InsertionPoint(fop.add_entry_block()):
                    ir_args, ir_kwargs = self.generate_execution_arguments(
                        args, kwargs, fop, args_spec
                    )
                    # Call user function body
                    try:
                        result = funcBody(*ir_args, **ir_kwargs)
                        func.ReturnOp([])
                    except NameError as name_error:
                        raise DSLRuntimeError(
                            f"💥💥💥 Error during runtime code generation for function `{funcBody.__name__}` 💥💥💥",
                            cause=name_error,
                            suggestion="Using variables defined in dynamic control flow is not supported. Please give an initial value before control flow.",
                        )
                    except DSLRuntimeError as dsl_error:
                        # Throw it's already a DSL error
                        raise dsl_error
            return module, result

        # Build IR module
        profiler = timer(enable=self.envar.jitTimeProfiling)
        module, result = profiler(build_ir_module)()
        # Hash is taken over the ORIGINAL IR so the cache key is stable
        # regardless of later transformations.
        module_hash = self.get_module_hash(module, function_name)

        module = self.build_module(module, function_name)

        return module, module_hash, result
1123
+
1124
    def compile_and_cache(
        self, module, module_hash, function_name, pipeline, args_spec, no_cache
    ):
        """
        Compile `module` (or re-JIT the cached compiled module) and wrap the
        result in a JitExecutor, storing it in the JIT cache unless caching is
        disabled.
        """
        arch = self.envar.arch
        pipeline = self.preprocess_pipeline(self._get_pipeline(pipeline), arch)
        shared_libs = self.get_shared_libs()
        profiler = timer(enable=self.envar.jitTimeProfiling)
        if (
            no_cache
            or module_hash not in self.jit_cache
            or self.jit_cache[module_hash].ir_module is None
        ):
            log().info(
                "JIT cache miss function=[%s] module_hash=[%s]",
                function_name,
                module_hash,
            )
            # Compile and JIT MLIR module
            engine = profiler(self.compile_and_jit)(
                module, pipeline, shared_libs, function_name=function_name
            )
        else:
            log().info(
                "JIT cache hit IN-FILE function=[%s] module_hash=[%s]",
                function_name,
                module_hash,
            )
            # The cached module is already compiled: only re-create the engine.
            module = self.jit_cache[module_hash].ir_module
            engine = self.compiler_provider.jit(module, shared_libs=shared_libs)
        capi_func = profiler(engine.lookup)(function_name)
        jit_executor = JitExecutor(
            self,
            engine,
            capi_func,
            module,
            args_spec,
            function_name,
            jit_time_profiling=self.envar.jitTimeProfiling,
        )
        jit_executor = jit_executor.update_jit_cuda_modules(self.kernel_symbols)

        if not no_cache:
            # module stored in cache is compiled.
            self.jit_cache[module_hash] = jit_executor

        return jit_executor
1170
+
1171
def post_compilation_cleanup(self):
    """Reset per-compilation bookkeeping once a compilation has finished."""
    # The next compilation starts from a fresh set of options.
    self.compile_options = CompileOptions()
    # Drop the device-kernel symbols accumulated during this run.
    self.kernel_symbols = []
    # Counters restart from zero for the next compilation.
    self.num_kernels = 0
    self.launch_inner_count = 0
1180
+
1181
def generate_mlir(
    self,
    funcBody,
    kwargs,
    function_name,
    gpu_module_attrs,
    args,
    args_spec,
    pipeline,
    no_cache,
    compile_only,
    loc=None,
):
    """Generate the MLIR module for `funcBody`, then compile and execute it.

    Pipeline:
      1. Translate the Python arguments into MLIR function types.
      2. Emit the original IR module and hash it for cache lookup.
      3. On cache miss (or `no_cache`), compile + JIT via compile_and_cache.
      4. Run the compiled program unless `compile_only` is set.

    Returns the Python-level result of tracing `funcBody`, or the
    JitExecutor when `compile_only` is True, or the trace result alone
    when the dryrun environment option is on.
    """
    with ir.Context(), ir.Location.unknown():
        # Convert input arguments to MLIR arguments
        exe_args, func_types, adapted_args = self.generate_mlir_function_types(
            funcBody, function_name, args, kwargs, args_spec
        )

        # Generate original ir module and its hash value.
        module, module_hash, result = self.generate_original_ir(
            ir,
            func,
            funcBody,
            kwargs,
            function_name,
            func_types,
            gpu_module_attrs,
            args,
            args_spec,
        )

        # dryrun is used to only generate IR
        if self.envar.dryrun:
            return result

        if (
            no_cache
            or module_hash not in self.jit_cache
            or self.jit_cache[module_hash].capi_func is None
        ):
            # no cache or cache miss, do ir generation/compilation/jit engine
            jit_executor = self.compile_and_cache(
                module, module_hash, function_name, pipeline, args_spec, no_cache
            )
        else:
            # cache hit
            log().info(
                "JIT cache hit IN-MEMORY function=[%s] module_hash=[%s]",
                function_name,
                module_hash,
            )
            jit_executor = self.jit_cache[module_hash]

        # Reset per-compilation state regardless of cache hit/miss.
        self.post_compilation_cleanup()
        # If compile_only is set, bypass execution return the jit_executor directly
        if compile_only:
            return jit_executor
        # Run the compiled program
        jit_executor.run_compiled_program(exe_args)

        return result
1244
+
1245
def run_preprocessor(self, funcBody):
    """Run the AST preprocessor over `funcBody` once.

    Returns the transformed AST, or None when the function has already been
    preprocessed (marked via the `_preprocessed` attribute).
    """
    if hasattr(funcBody, "_preprocessed"):
        # Already transformed by an earlier call: nothing to do.
        return None

    function_name = funcBody.__name__
    self.funcBody = funcBody
    log().info("Started preprocessing [%s]", function_name)

    transformed = self.preprocessor.transform(funcBody, self._get_globals())
    if self.envar.print_after_preprocessor:
        log().info(
            f"# Printing unparsed AST after preprocess of func=`{function_name}` id=`{id(funcBody)}`"
        )
        DSLPreprocessor.print_ast(transformed)

    # Mark the function so repeated calls skip the transform.
    funcBody._preprocessed = True
    return transformed
1260
+
1261
def get_function_ptr(self, original_function):
    """Compile the function's preprocessed AST and return the executable function."""
    # Compile against the original source file so tracebacks still point at
    # the user's code.
    src_file = inspect.getsourcefile(original_function)
    code_obj = compile(
        original_function._transformed_ast, filename=src_file, mode="exec"
    )
    exec_globals = self._get_globals()
    return self.preprocessor.exec(
        original_function.__name__, original_function, code_obj, exec_globals
    )
1272
+
1273
def _get_function_bound_args(self, sig, func_name, *args, **kwargs):
    """
    Binds provided arguments to a function's signature and applies default values.

    E.g. given a function signature `def foo(a, b=2, c=3)`, and at call-site if we do
    `foo(a=1, c=4)`, the returned BoundArguments object will have args = `(1, 2, 4)`
    and kwargs = `{}` — once every positional-or-keyword parameter has a value,
    `BoundArguments.args` reports them all positionally.

    An exception will be raised if binding fails.
    """
    try:
        # bind_partial tolerates missing parameters; completeness is enforced
        # separately (see _check_arg_count).
        bound_args = sig.bind_partial(*args, **kwargs)
        bound_args.apply_defaults()
    except Exception as e:
        raise DSLRuntimeError(
            f"Failed to bind arguments to function `{func_name}` with signature `{sig}`",
            cause=e,
        )
    return bound_args
1292
+
1293
def _canonicalize_args(self, sig, *args, **kwargs):
    """
    Normalize call-site arguments: the returned args tuple holds only
    positional values and the returned kwargs dict only keyword values.
    """
    bound = self._get_function_bound_args(
        sig, self.funcBody.__name__, *args, **kwargs
    )
    return bound.args, bound.kwargs
1303
+
1304
def _check_arg_count(self, *args, **kwargs):
    """
    Validate the call-site arguments against the stored function body.

    Raises DSLRuntimeError when no function body is set, when binding fails,
    or when a parameter without a default value received no argument.
    Returns the inspected signature on success.
    """
    if not self.funcBody:
        raise DSLRuntimeError("Function body is not set.")

    # inspect.signature needs the actual function object.
    sig = inspect.signature(self.funcBody)
    function_name = self.funcBody.__name__
    bound_args = self._get_function_bound_args(sig, function_name, *args, **kwargs)

    # Find the first parameter that has no default and received no value.
    missing = next(
        (
            param.name
            for param in sig.parameters.values()
            if param.default is inspect.Parameter.empty
            and param.name not in bound_args.arguments
        ),
        None,
    )
    if missing is not None:
        raise DSLRuntimeError(
            f"Missing required argument in `{function_name}`: '{missing}'"
        )

    return sig
1326
+
1327
def _func(self, funcBody, *args, **kwargs):
    """Decorator for MLIR functions.
    It cuts the boilerplate code, does the following:
    1. Generates `func.func`
    2. Types translation (numpy arrays -> cute.memref, float -> <f32>, etc.)
    3. Compiles and JITs the MLIR module
    4. Invokes the generated function
    5. Operator overloading (a + b --> arith.addi a, b)
    6. Generates GPU kernel function with GPU module and kernel attributes baked

    Keyword arguments `pipeline`, `gpu_module_attrs`, `no_cache` and
    `compile_only` are consumed here and are NOT forwarded to `funcBody`.
    """
    if ir.Context.current is None:
        pass
    elif ir.InsertionPoint.current is not None:
        # Already inside IR generation: inline the call rather than
        # emitting a new top-level function.
        return funcBody(*args, **kwargs)

    function_name = funcBody.__name__
    self.funcBody = funcBody

    pipeline = kwargs.pop("pipeline", None)
    gpu_module_attrs = kwargs.pop("gpu_module_attrs", {})

    # Disable cache
    no_cache = kwargs.pop("no_cache", False)

    # Always compile(disable cache) and return the result jit_executor
    compile_only = kwargs.pop("compile_only", False)

    if not no_cache and compile_only:
        no_cache = True
        self.print_warning("Cache is disabled as user wants to compile only.")

    # Check the number of arguments
    sig = self._check_arg_count(*args, **kwargs)

    args_spec = inspect.getfullargspec(funcBody)

    # Canonicalize the input arguments
    canonicalized_args, canonicalized_kwargs = self._canonicalize_args(
        sig, *args, **kwargs
    )

    # Simple name mangling
    function_name = self.mangle_name(function_name, canonicalized_args, args_spec)

    # Generate MLIR Context and start generating IR
    log().debug(f"Generating MLIR for function '{function_name}'")
    result = self.generate_mlir(
        funcBody,
        canonicalized_kwargs,
        function_name,
        gpu_module_attrs,
        canonicalized_args,
        args_spec,
        pipeline,
        no_cache,
        compile_only,
    )

    return result
1386
+
1387
class _KernelGenHelper(ABC):
    """Abstract interface for backend-specific kernel IR generation.

    Concrete subclasses supply the ops for declaring a kernel function,
    returning from it, and launching it from the host side.
    """

    def __init__(self):
        # Kernel function op; expected to be populated by generate_func_op.
        self.func_op = None
        # Kernel function type; expected to be populated by subclasses.
        self.func_type = None

    @abstractmethod
    def generate_func_op(self, arg_types, arg_attrs, kernel_name, loc=None):
        """Create the kernel function op; this base just validates inputs."""
        assert arg_types is not None, "Invalid arg_types!"
        assert kernel_name is not None, "kernel name is empty"
        pass

    @abstractmethod
    def generate_func_ret_op(self):
        """Emit the kernel function's return/terminator op."""
        pass

    @abstractmethod
    def generate_launch_op(self, *args, **kwargs):
        """Emit the host-side op that launches the kernel."""
        pass

    @abstractmethod
    def get_func_body_start(self):
        """Return the insertion point at the start of the kernel body."""
        pass

    @abstractmethod
    def enter_gpu_module(module):
        # NOTE(review): declared without `self`/@staticmethod — confirm
        # subclass overrides intentionally use this shape.
        """Compute the insertion point into the given module."""
        pass
1414
+
1415
@lru_cache(maxsize=1)
def _get_default_stream(self):
    """Returns the default stream 0"""
    from .runtime import cuda as cuda_helpers

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the instance alive for the cache's lifetime; with maxsize=1 a second
    # instance would evict the first's cached stream — confirm intended.
    return cuda_helpers.stream_create()
1421
+
1422
def _execute_cuda(
    self, fname_cubin, kernel_name, grid_size, block_size, smem_size, stream=None
):
    """
    Executes a specified CUDA kernel from a cubin file, handling module loading,
    kernel retrieval, stream creation, kernel launch, and synchronization.

    When `stream` is None, the lazily created default stream is used and the
    call blocks until the kernel finishes; with an explicit stream the launch
    is left asynchronous and the caller owns synchronization.
    Kernel arguments are taken from `self.exe_args`.
    """
    from .runtime import cuda as cuda_helpers

    # Step 1. Load CUDA Module
    module = cuda_helpers.load_cubin_module(fname_cubin)
    # Step 2. Find CUDA function
    kernel_ptr = cuda_helpers.get_kernel_function(module, kernel_name)

    sync_execution_default = False
    if stream is None:
        # Step 3. No caller-provided stream: use the default one and
        # remember to synchronize after the launch.
        stream = self._get_default_stream()
        sync_execution_default = True

    # Step 4. Launch the kernel
    cuda_helpers.launch_kernel(
        kernel_ptr,
        grid_size,
        block_size,
        stream,
        smem_size=smem_size,
        kernel_args=self.exe_args,
    )

    if sync_execution_default:
        # Step 5. Optional Sync cuda stream
        cuda_helpers.stream_sync(stream)
1454
+
1455
def _execute_by_cuda_driver(
    self,
    kernel_generator,
    generate_cubin,
    grid_size,
    block_size,
    smem_size,
    stream=None,
):
    """
    This function builds IR and execute the module using cuda driver.
    It doesn't use mlir's cuda runtime

    `kernel_generator` emits the kernel IR and returns (ret, kernel_name);
    `generate_cubin` lowers the finished module to a cubin file path.
    Returns the kernel generator's `ret` value (also when dryrun stops
    before execution).
    """
    ret = None

    # Step 1. Build IR
    with ir.Context(), ir.Location.unknown():
        loc = self.get_location()
        module = ir.Module.create(loc=loc)
        unit_attr = ir.UnitAttr.get()
        # Mark the module as a GPU container so it may hold gpu.module ops.
        module.operation.attributes["gpu.container_module"] = unit_attr
        with ir.InsertionPoint(module.body):
            self._build_gpu_module()
            ret, kernel_name = kernel_generator()
            log().debug(
                f"Kernel generator returned: ret={ret}, kernel_name={kernel_name}"
            )

        module = self.build_module(module, kernel_name)

        # dryrun is used to only generate IR
        if self.envar.dryrun:
            return ret

        # Generate cubin
        fname_cubin = generate_cubin(module, kernel_name)

        # Execute a cuda kernel from cubin
        self._execute_cuda(
            fname_cubin, kernel_name, grid_size, block_size, smem_size, stream
        )

    return ret
1498
+
1499
def generate_kernel_operands_and_types(
    self, kernel_func, kernel_name, args_spec, args, kwargs
):
    """
    Generate the operands and types for the kernel function

    Returns a triple (kernel_operands, kernel_arg_types, kernel_arg_attrs),
    all the same length. In device-only compilation mode the host-side
    operands are not materialized, so three empty lists are returned.
    """

    kernel_operands, kernel_arg_types, kernel_arg_attrs = [], [], []

    log().debug(
        "Processing GPU kernel call in [%s] mode",
        (
            f"Only {self.device_jit_decorator_name}"
            if self.device_compilation_only
            else f"{self.host_jit_decorator_name} + {self.device_jit_decorator_name}"
        ),
    )

    if self.device_compilation_only:
        # No host function exists to supply operands in device-only mode.
        return kernel_operands, kernel_arg_types, kernel_arg_attrs

    kernel_operands, kernel_arg_types, kernel_arg_attrs, _ = (
        self._generate_jit_func_args(
            kernel_func, kernel_name, args, kwargs, args_spec, is_host=False
        )
    )

    log().debug("Final kernel_operands: %s", ", ".join(map(str, kernel_operands)))
    log().debug("Final kernel_arg_types: %s", ", ".join(map(str, kernel_arg_types)))
    log().debug("Final kernel_arg_attrs: %s", ", ".join(map(str, kernel_arg_attrs)))

    assert (
        len(kernel_operands) == len(kernel_arg_types) == len(kernel_arg_attrs)
    ), "Size of kernel_operands, kernel_arg_types and kernel_arg_attrs must be equal"

    return kernel_operands, kernel_arg_types, kernel_arg_attrs
1535
+
1536
def kernel_launcher(self, *dargs, **dkwargs):
    # Decorator factory: usable both as `@kernel_launcher` (bare) and as
    # `@kernel_launcher(...)` with configuration keywords (see docstring).
    def decorator(funcBody):
        @wraps(funcBody)
        def kernel_wrapper(*args, **kwargs):
            """
            Base decorator for generating kernel function

            This decorator provides a template for kernel function generation
            including kernel function header/body and kernel launch op at call site

            Optional arguments (with default value in <>):
            - requiredArgs <[]>: specifies the mandatory arguments that must present in kernel function signature
                the args will be validated and collected as a namedtuple
            - optionalArgs <[]>: specifies the optional arguments that might present in kernel function signature
                the args will be collected (if present) as a namedtuple
            - unitAttrNames <[]>: specifies the name(s) of ir.UnitAttr to be set for kernel function op
            - valueAttrDict <{}>: specifies the name(s) and value(s) of ir.Attribute to be set for kernel function op
            - kernelGenHelper <None>: specifies the mandatory customized kernel generation helper class (derived from _KernelGenHelper)

            Return value:
                A namedtuple "KernelReturns" is returned with following fields:
                - kernel_func_ret: the return of the kernel function
                - launch_op_ret: the return of the launch op
            """

            requiredArgs = dkwargs.get("requiredArgs", [])
            optionalArgs = dkwargs.get("optionalArgs", [])
            unitAttrNames = dkwargs.get("unitAttrNames", [])
            valueAttrDict = dkwargs.get("valueAttrDict", {})
            kernelGenHelper = dkwargs.get("kernelGenHelper", None)

            kernel_name = funcBody.__name__
            args_spec = inspect.getfullargspec(funcBody)
            self.funcBody = funcBody

            # Give each kernel a unique name. (The same kernel may be
            # called multiple times, resulting in multiple kernel traces.)
            # The mangled name of Python function is part of the name to
            # improve readability.
            kernel_name = f"kernel_{self.mangle_name(kernel_name, args, args_spec)}_{self.num_kernels}"
            self.num_kernels += 1

            # Step 0. Preprocess the arguments
            def extract_args(argNames, assertIfNone=False) -> list:
                # Pop launcher-level arguments out of `kwargs` so they are
                # not forwarded to the user's kernel body.
                extracted = []
                for name in argNames:
                    value = kwargs.pop(name, None)
                    if assertIfNone and value is None:
                        raise DSLRuntimeError(
                            f"{name} is required for {kernel_name}"
                        )
                    extracted.append(value)

                return extracted

            RequiredArgs = namedtuple("RequiredArgs", requiredArgs)
            req_args = (
                RequiredArgs._make(extract_args(requiredArgs, assertIfNone=True))
                if requiredArgs
                else None
            )
            OptionalArgs = namedtuple("OptionalArgs", optionalArgs)
            opt_args = (
                OptionalArgs._make(extract_args(optionalArgs))
                if optionalArgs
                else None
            )
            assert (
                kernelGenHelper is not None
            ), "kernelGenHelper should be explicitly specified!"

            # check arguments
            sig = self._check_arg_count(*args, **kwargs)

            # Canonicalize the input arguments
            canonicalized_args, canonicalized_kwargs = self._canonicalize_args(
                sig, *args, **kwargs
            )

            # Step 1. Collect host-side operands/types for the launch op.
            kernel_operands, kernel_types, kernel_arg_attrs = (
                self.generate_kernel_operands_and_types(
                    funcBody,
                    kernel_name,
                    args_spec,
                    canonicalized_args,
                    canonicalized_kwargs,
                )
            )

            # Step 2. Emit the kernel function inside the GPU module.
            with self._enter_gpu_module():
                log().debug("Generating device kernel")
                if self.device_compilation_only:
                    log().debug("Generating cuda-python arguments")
                    # Convert input arguments to MLIR arguments
                    self.exe_args, kernel_types, _ = (
                        self.generate_mlir_function_types(
                            funcBody,
                            kernel_name,
                            canonicalized_args,
                            canonicalized_kwargs,
                            args_spec,
                        )
                    )

                helper = kernelGenHelper()
                loc = self.get_location()
                fop = helper.generate_func_op(
                    kernel_types, kernel_arg_attrs, kernel_name, loc
                )
                log().debug(f"Kernel function op: {fop}")
                for attr in unitAttrNames:
                    fop.attributes[attr] = ir.UnitAttr.get()
                for key, val in valueAttrDict.items():
                    fop.attributes[key] = val

                fop.sym_visibility = ir.StringAttr.get("public")
                with ir.InsertionPoint(helper.get_func_body_start()):
                    ir_args, ir_kwargs = self.generate_execution_arguments(
                        canonicalized_args, canonicalized_kwargs, fop, args_spec
                    )
                    log().debug(
                        f"IR arguments - args: {ir_args} ; kwargs: {ir_kwargs}"
                    )
                    # Call user function body
                    kernel_ret = funcBody(*ir_args, **ir_kwargs)
                    helper.generate_func_ret_op()

            # Step 3. Generate call site `launch_func`
            kernel_sym = ir.SymbolRefAttr.get(["kernels", kernel_name])
            launch_ret = helper.generate_launch_op(
                kernelSym=kernel_sym,
                kernelOperands=kernel_operands,
                requiredArgs=req_args,
                optionalArgs=opt_args,
            )

            KernelReturns = namedtuple(
                "KernelReturns", ["kernel_func_ret", "launch_op_ret"]
            )
            result = KernelReturns(
                kernel_func_ret=kernel_ret, launch_op_ret=launch_ret
            )
            log().debug(f"Kernel result: {result}, kernel name: {kernel_name}")
            return result, kernel_name

        return kernel_wrapper

    if len(dargs) == 1 and callable(dargs[0]):
        # Bare usage: the decorated function is the single positional arg.
        return decorator(dargs[0])
    else:
        return decorator
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/env_manager.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides utilities for the environment variables setup.
14
+
15
+ It provides an EnvironmentVarManager, which reads environment variables for the DSL
16
+ and caches them for efficient access.
17
+
18
+ It also provides utilities to automatically setup a subset of environment variables
19
+ based on heuristics.
20
+ """
21
+
22
+ import os
23
+ import sys
24
+ import shutil
25
+ import glob
26
+ from pathlib import Path
27
+ from functools import lru_cache
28
+ from typing import Any
29
+
30
+ from ..base_dsl.runtime.cuda import get_compute_capability_major_minor
31
+ from .utils.logger import log
32
+
33
+ IS_WINDOWS = sys.platform == "win32"
34
+ CLIB_EXT = ".dll" if IS_WINDOWS else ".so"
35
+
36
+ # =============================================================================
37
+ # Environment Variable Helpers
38
+ # =============================================================================
39
+
40
+
41
@lru_cache(maxsize=None)
def get_str_env_var(var_name, default_value=None):
    """Read an environment variable as a string, or `default_value` when unset.

    Results are memoized, so later changes to the environment are not observed.
    """
    return os.environ.get(var_name, default_value)
45
+
46
+
47
@lru_cache(maxsize=None)
def get_bool_env_var(var_name, default_value=False):
    """Interpret an environment variable as a boolean (memoized).

    Unset -> `default_value`; the literal strings "False", "0" and "" are
    False; every other value is True.
    """
    raw = get_str_env_var(var_name)
    if raw is None:
        return default_value
    falsy_values = {"False", "0", ""}
    return raw not in falsy_values
53
+
54
+
55
@lru_cache(maxsize=None)
def get_int_env_var(var_name, default_value=0):
    """Interpret an environment variable as a non-negative integer (memoized).

    Falls back to `default_value` when the variable is unset, empty, or not a
    plain digit string (note: signed values like "-5" therefore also fall back).
    """
    raw = get_str_env_var(var_name)
    if raw and raw.isdigit():
        return int(raw)
    return default_value
59
+
60
+
61
@lru_cache(maxsize=None)
def has_env_var(var_name):
    """Return True when `var_name` is present in the environment (memoized)."""
    return var_name in os.environ
64
+
65
+
66
def detect_gpu_arch(prefix):
    """
    Attempts to detect the machine's GPU architecture.

    The `prefix` parameter is currently unused here (kept for call-site
    symmetry with the other `{prefix}_*` helpers).

    Returns:
        A string representing the GPU architecture (e.g. "70" for compute capability 7.0),
        or a default value(e.g. "sm_100") if the GPU architecture cannot be determined.
    """
    arch = (None, None)
    try:
        arch = get_compute_capability_major_minor()
    except Exception as e:
        # Detection is best-effort; fall through to the default below.
        log().info(f"Failed to get CUDA compute capability: {e}")

    if arch == (None, None):
        # default to sm_100
        arch = (10, 0)

    major, minor = arch
    suffix = ""
    if major >= 9:
        # Major 9+ targets get the "a" variant (e.g. sm_90a, sm_100a).
        suffix = "a"

    return f"sm_{major}{minor}{suffix}"
90
+
91
+
92
def find_libs_in_ancestors(start, target_libs, lib_folder_guesses):
    """
    Search ancestor directories for a candidate library folder containing all required libraries.

    Starting from the given path, walk up through each parent directory. For
    every ancestor, check each candidate subdirectory (lib_folder_guesses) for
    files with the required library extension (CLIB_EXT). File names are
    canonicalized by stripping the "lib" prefix from the stem. The first
    candidate directory containing every library in `target_libs` wins.

    Parameters:
        start (str or Path): The starting directory from which to begin the search.
        target_libs (iterable of str): Required library names (without the "lib" prefix).
        lib_folder_guesses (iterable of str): Relative paths from an ancestor that may hold the libraries.

    Returns:
        list[str] or None: Resolved paths to the library files if found; otherwise None.
    """
    wanted = set(target_libs)
    for ancestor in Path(start).resolve().parents:
        for rel_path in lib_folder_guesses:
            candidate_dir = ancestor / rel_path
            if not candidate_dir.is_dir():
                continue

            # Map canonical library name -> resolved path, in directory order.
            found = {}
            for entry in candidate_dir.iterdir():
                if entry.suffix != CLIB_EXT:
                    continue
                canonical = entry.stem.removeprefix("lib")
                if canonical in wanted and canonical not in found:
                    found[canonical] = str(entry.resolve())

            # Success only when every required library is in this one folder.
            if len(found) == len(wanted):
                return list(found.values())

    return None
142
+
143
+
144
def _find_cuda_home():
    """Find the CUDA installation path using a series of heuristic methods.
    Methods below are checked in order, and the function returns on first match:
    1. Checking the environment variables CUDA_HOME and CUDA_PATH.
    2. Searching for the 'nvcc' compiler in the system PATH and deriving the path of cuda.
    3. Scanning common installation directories based on the operating system.
        - On Windows systems (when IS_WINDOWS is True), it searches in:
            C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*
        - On Unix-like systems, it searches in:
            /usr/local/cuda*

    Returns:
        Optional[str]: The absolute CUDA installation path if found; otherwise, None.

    Note:
        The variable IS_WINDOWS is defined in the module scope.
        NOTE(review): only the guess-#3 result is existence-checked (the ""
        sentinel for "no glob match" collapses to None there); confirm that
        nvcc-derived paths are intentionally trusted as-is.
    """
    # Guess #1
    cuda_home = get_str_env_var("CUDA_HOME") or get_str_env_var("CUDA_PATH")
    if cuda_home is None:
        # Guess #2
        nvcc_path = shutil.which("nvcc")
        if nvcc_path is not None:
            # nvcc lives in <cuda_home>/bin, so strip two path components.
            cuda_home = os.path.dirname(os.path.dirname(nvcc_path))
        else:
            # Guess #3
            if IS_WINDOWS:
                glob_pat = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*"
            else:
                glob_pat = "/usr/local/cuda*"
            cuda_homes = glob.glob(glob_pat)
            if len(cuda_homes) == 0:
                cuda_home = ""
            else:
                cuda_home = cuda_homes[0]
            # "" (no match) never exists, so it becomes None here.
            if not os.path.exists(cuda_home):
                cuda_home = None
    return cuda_home
182
+
183
+
184
def get_cuda_toolkit_path():
    """
    Get cuda_toolkit_path. It returns get_str_env_var('CUDA_TOOLKIT_PATH') if
    set. Otherwise, attempts to discover a valid CUDA toolkit location and
    return. If not found, return None.
    """
    # Check if the environment variable is already set, if so, return it immediately.
    try:
        cuda_toolkit_path_existing = get_str_env_var("CUDA_TOOLKIT_PATH")
        if cuda_toolkit_path_existing:
            return cuda_toolkit_path_existing

        found_cuda_home = _find_cuda_home()
        if found_cuda_home:
            return found_cuda_home
    except Exception as e:
        # Discovery is best-effort: log and fall through to None.
        # Fix: the original passed `e` as a printf-style argument to a message
        # with no placeholder, which made the logging call itself fail to
        # format; use a lazy %s placeholder instead.
        log().info("default_env: exception on get_cuda_toolkit_path: %s", e)
    return None
202
+
203
+
204
def get_prefix_dsl_libs(prefix: str):
    """
    Returns get_str_env_var('{prefix}_LIBS') if set.
    Otherwise, attempts to discover libs based on heuristics and return
    If not found, return None.

    Discovery looks for the three MLIR runner shared libraries in a "lib"
    folder among the ancestors of this file (install layout) and, failing
    that, of the package parent directory (build layout). Found paths are
    joined with ':'.
    """
    # Check if the environment variable is already set, if so, return it immediately.
    try:
        prefix_libs_existing = get_str_env_var(f"{prefix}_LIBS")
        if prefix_libs_existing:
            return prefix_libs_existing

        def get_libs_cand(start):
            # All three MLIR runtime libraries must be found in one folder.
            target_libs = {
                "mlir_c_runner_utils",
                "mlir_runner_utils",
                "mlir_cuda_runtime",
            }
            lib_folder_guesses = [
                "lib",
            ]

            libs_cand = find_libs_in_ancestors(start, target_libs, lib_folder_guesses)
            if libs_cand:
                return ":".join(libs_cand)
            return None

        # find from install folder
        dsl_libs = get_libs_cand(__file__)

        if not dsl_libs:
            # try to find from build folder structure
            dsl_libs = get_libs_cand(Path(__file__).parent.parent.resolve())

        return dsl_libs

    except Exception as e:
        # Fix: the original passed `e` as a printf-style extra argument to an
        # f-string message with no placeholder, which made the logging call
        # itself fail to format; use a lazy %s placeholder instead.
        log().info("default_env: exception on get_prefix_dsl_libs: %s", e)
        return None
245
+
246
+
247
class EnvironmentVarManager:
    """Manages environment variables for configuration options.

    Printing options:
        - [DSL_NAME]_LOG_TO_CONSOLE: Print logging to stderr (default: False)
        - [DSL_NAME]_PRINT_AFTER_PREPROCESSOR: Print after preprocess (default: False)
        - [DSL_NAME]_PRINT_IR: Print generated IR (default: False)
        - [DSL_NAME]_FILTER_STACKTRACE: Filter internal stacktrace (default: True)
    File options:
        - [DSL_NAME]_KEEP_IR: Save generated IR in a file (default: False)
        - [DSL_NAME]_LOG_TO_FILE: Store all logging into a file, excluding COMPILE_LOGS (default: False)
    Other options:
        - [DSL_NAME]_LOG_LEVEL: Logging level to set, for LOG_TO_CONSOLE or LOG_TO_FILE (default: 1).
        - [DSL_NAME]_DRYRUN: Generates IR only (default: False)
        - [DSL_NAME]_ARCH: GPU architecture (default: "sm_100")
        - [DSL_NAME]_WARNINGS_AS_ERRORS: Enable warnings as error (default: False)
        - [DSL_NAME]_WARNINGS_IGNORE: Ignore warnings (default: False)
        - [DSL_NAME]_ENABLE_OPTIMIZATION_WARNINGS: Enable warnings of optimization warnings (default: False)
        - [DSL_NAME]_JIT_TIME_PROFILING: Whether or not to profile the IR generation/compilation/execution time (default: False)
        - [DSL_NAME]_DISABLE_FILE_CACHING: Disable file caching (default: False)
        - [DSL_NAME]_FILE_CACHING_CAPACITY: Limits the number of the cache save/load files (default: 1000)
        - [DSL_NAME]_LIBS: Path to dependent shared libraries (default: None)
        - [DSL_NAME]_NO_SOURCE_LOCATION: Generate source location (default: False)
    """

    def __init__(self, prefix="DSL"):
        # All variables are namespaced by this prefix, e.g. "CUTE_PRINT_IR".
        self.prefix = prefix  # change if needed

        # Printing options
        self.print_after_preprocessor = get_bool_env_var(
            f"{prefix}_PRINT_AFTER_PREPROCESSOR", False
        )
        self.printIR = get_bool_env_var(f"{prefix}_PRINT_IR", False)
        self.filterStacktrace = get_bool_env_var(f"{prefix}_FILTER_STACKTRACE", True)
        # File options
        self.keepIR = get_bool_env_var(f"{prefix}_KEEP_IR", False)
        # Logging options
        self.log_to_console = get_bool_env_var(f"{prefix}_LOG_TO_CONSOLE", False)
        self.log_to_file = get_bool_env_var(f"{prefix}_LOG_TO_FILE", False)
        # Warn when a log level is set but no logging sink is enabled,
        # since the level would silently have no effect.
        if (
            has_env_var(f"{prefix}_LOG_LEVEL")
            and not self.log_to_console
            and not self.log_to_file
        ):
            log().warning(
                f"Log level was set, but neither logging to file ({prefix}_LOG_TO_FILE) nor logging to console ({prefix}_LOG_TO_CONSOLE) is enabled!"
            )
        self.log_level = get_int_env_var(f"{prefix}_LOG_LEVEL", 1)

        # Other options
        self.dryrun = get_bool_env_var(f"{prefix}_DRYRUN", False)
        # Explicit arch override wins; otherwise probe the local GPU.
        self.arch = get_str_env_var(f"{prefix}_ARCH", detect_gpu_arch(prefix))
        self.warnings_as_errors = get_bool_env_var(
            f"{prefix}_WARNINGS_AS_ERRORS", False
        )
        self.warnings_ignore = get_bool_env_var(f"{prefix}_WARNINGS_IGNORE", False)
        self.enable_optimization_warnings = get_bool_env_var(
            f"{prefix}_ENABLE_OPTIMIZATION_WARNINGS", False
        )
        self.jitTimeProfiling = get_bool_env_var(f"{prefix}_JIT_TIME_PROFILING", False)
        self.disable_file_caching = get_bool_env_var(
            f"{prefix}_DISABLE_FILE_CACHING", False
        )
        self.file_caching_capacity = get_int_env_var(
            f"{prefix}_FILE_CACHING_CAPACITY", 1000
        )
        # Note the inversion: the env var disables, the attribute enables.
        self.generate_source_location = not get_bool_env_var(
            f"{prefix}_NO_SOURCE_LOCATION", False
        )
        # set cuda
        self.cuda_toolkit = get_cuda_toolkit_path()

        # set mlir shared libraries
        self.shared_libs = get_prefix_dsl_libs(prefix)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/jit_executor.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides jit executor related classes
14
+ """
15
+ import ctypes
16
+ import inspect
17
+ import io
18
+ from typing import get_origin
19
+
20
+ import numpy as np
21
+
22
+ # MLIR modules imports
23
+ from .._mlir import ir
24
+
25
+ # Local modules imports
26
+ from . import typing as t
27
+ from .common import DSLRuntimeError
28
+ from .runtime import cuda as cuda_helpers
29
+ from .runtime.jit_arg_adapters import JitArgAdapterRegistry, is_arg_spec_constexpr
30
+ from .typing import get_c_pointers
31
+ from .utils.logger import log
32
+ from .utils.timer import timer
33
+
34
+
35
class CudaSingleModule:
    """A loaded CUDA module paired with the kernel function pointer resolved from it."""

    def __init__(self, cuda_module, kernel_ptr):
        # Kept together so the module can be unloaded once the kernel
        # pointer is no longer needed.
        self.cuda_module, self.kernel_ptr = cuda_module, kernel_ptr
+
41
class CudaModules:
    """The CUDA modules used by one compiled program plus the extra
    kernel-pointer arguments appended at launch time."""

    def __init__(self, modules, args):
        # `modules`: iterable of CudaSingleModule.
        # `args`: extra kernel pointer arguments forwarded at invocation.
        self.modules, self.args = modules, args
+
48
+
49
+ class JitExecutor:
50
+ def __init__(
51
+ self,
52
+ dsl,
53
+ engine,
54
+ capi_func,
55
+ ir_module,
56
+ args_spec,
57
+ function_name,
58
+ cuda_modules: CudaModules = None,
59
+ jit_time_profiling=False,
60
+ ):
61
+ self.dsl = dsl
62
+ self.engine = engine
63
+ self.capi_func = capi_func
64
+ self.ir_module = ir_module
65
+ self.args_spec = args_spec
66
+ self.function_name = function_name
67
+ if args_spec is not None:
68
+ self.original_args_spec = args_spec
69
+ self.args_spec = self.filter_runtime_arg_spec(args_spec)
70
+ # cuda kernels
71
+ self.cuda_modules = cuda_modules
72
+ self.jit_time_profiling = jit_time_profiling
73
+
74
+ def filter_runtime_arg_spec(self, arg_spec: inspect.FullArgSpec):
75
+ runtime_args = []
76
+ runtime_annotations = {}
77
+ runtime_defaults = []
78
+
79
+ # Calculate the offset where defaults start in the original args
80
+ if arg_spec.defaults:
81
+ defaults_start_idx = len(arg_spec.args) - len(arg_spec.defaults)
82
+ else:
83
+ defaults_start_idx = len(arg_spec.args)
84
+
85
+ # Filter arguments and maintain their properties
86
+ for i, arg_name in enumerate(arg_spec.args):
87
+ arg_type = arg_spec.annotations.get(arg_name, None)
88
+
89
+ # Skip compile-time arguments
90
+ if is_arg_spec_constexpr(arg_type, arg_name, i, self.function_name):
91
+ continue
92
+
93
+ # Keep runtime arguments
94
+ runtime_args.append(arg_name)
95
+ if arg_name in arg_spec.annotations:
96
+ runtime_annotations[arg_name] = arg_type
97
+
98
+ # Keep corresponding default if it exists
99
+ if i >= defaults_start_idx:
100
+ default_idx = i - defaults_start_idx
101
+ runtime_defaults.append(arg_spec.defaults[default_idx])
102
+
103
+ # Filter kwonlyargs and their defaults
104
+ runtime_kwonlyargs = []
105
+ runtime_kwonlydefaults = {}
106
+
107
+ if arg_spec.kwonlyargs:
108
+ for kwarg in arg_spec.kwonlyargs:
109
+ arg_type = arg_spec.annotations.get(kwarg, None)
110
+
111
+ # Apply same filtering logic
112
+ if is_arg_spec_constexpr(arg_type, kwarg, i, self.function_name):
113
+ continue
114
+
115
+ runtime_kwonlyargs.append(kwarg)
116
+ if kwarg in arg_spec.annotations:
117
+ runtime_annotations[kwarg] = arg_type
118
+ if arg_spec.kwonlydefaults and kwarg in arg_spec.kwonlydefaults:
119
+ runtime_kwonlydefaults[kwarg] = arg_spec.kwonlydefaults[kwarg]
120
+
121
+ # Convert runtime_defaults to tuple if not empty (as expected by FullArgSpec)
122
+ runtime_defaults = tuple(runtime_defaults) if runtime_defaults else None
123
+
124
+ return inspect.FullArgSpec(
125
+ args=runtime_args,
126
+ varargs=arg_spec.varargs, # Keep original varargs
127
+ varkw=arg_spec.varkw, # Keep original varkw
128
+ defaults=runtime_defaults,
129
+ kwonlyargs=runtime_kwonlyargs,
130
+ kwonlydefaults=runtime_kwonlydefaults if runtime_kwonlydefaults else None,
131
+ annotations=runtime_annotations,
132
+ )
133
+
134
+ def __del__(self):
135
+ if self.cuda_modules:
136
+ cuda_modules = [module.cuda_module for module in self.cuda_modules.modules]
137
+ for module in set(cuda_modules):
138
+ cuda_helpers.unload_cubin_module(module)
139
+
140
+ def get_constexpr_args(self) -> list[dict[str, int | str]]:
141
+ """
142
+ This function returns the constexpr args that have been pruned from the original function signature.
143
+ The return type is a list of dicts, each dict contains the argument index (argument_index) and argument name (argument_name).
144
+
145
+ :return: list of dicts, each dict contains the argument index (argument_index) and argument name (argument_name).
146
+ :rtype: list[dict[str, int | str]]
147
+ """
148
+ if self.original_args_spec is None:
149
+ return list()
150
+ constexpr_args = list()
151
+ for i, arg_name in enumerate(self.original_args_spec.args):
152
+ if arg_name not in self.args_spec.args:
153
+ constexpr_args.append({"argument_index": i, "argument_name": arg_name})
154
+
155
+ if self.original_args_spec.kwonlyargs:
156
+ for kwarg in self.original_args_spec.kwonlyargs:
157
+ if kwarg not in self.args_spec.kwonlyargs:
158
+ constexpr_args.append(
159
+ {"argument_index": None, "argument_name": kwarg}
160
+ )
161
+ return constexpr_args
162
+
163
+ def generate_execution_args(self, args, kwargs, args_spec: inspect.FullArgSpec):
164
+ """
165
+ This function is the prune version of `generate_mlir_function_types` which only generates execution args
166
+ to get rid of mlir context.
167
+ """
168
+
169
+ # Process positional arguments with defaults
170
+ rectified_args = list(args)
171
+ if args_spec.defaults and len(args) < len(args_spec.args):
172
+ rectified_args.extend(args_spec.defaults[len(args) - len(args_spec.args) :])
173
+ for k, v in kwargs.items():
174
+ if k in args_spec.args:
175
+ idx = args_spec.args.index(k)
176
+ if idx < len(rectified_args):
177
+ rectified_args[idx] = v
178
+ else:
179
+ rectified_args.append(v)
180
+
181
+ # Process keyword arguments
182
+ rectified_kwargs = {k: v for k, v in kwargs.items() if k not in args_spec.args}
183
+ if args_spec.kwonlydefaults and len(rectified_kwargs) < len(
184
+ args_spec.kwonlyargs
185
+ ):
186
+ rectified_kwargs.update(args_spec.kwonlydefaults)
187
+
188
+ # args/kwargs must match arg_specs
189
+ if len(rectified_args) != len(args_spec.args) or len(rectified_kwargs) != len(
190
+ args_spec.kwonlyargs
191
+ ):
192
+ raise DSLRuntimeError(
193
+ "input args/kwargs length does not match runtime function signature!",
194
+ context={
195
+ "input args length": len(rectified_args),
196
+ "input kwargs length": len(rectified_kwargs),
197
+ "function signature args length": len(args_spec.args),
198
+ "function signature kwonlyargs length": len(args_spec.kwonlyargs),
199
+ },
200
+ )
201
+
202
+ exe_args = []
203
+ adapted_args = []
204
+ input_args = rectified_args + list(rectified_kwargs.values())
205
+ input_arg_names = args_spec.args + args_spec.kwonlyargs
206
+ for arg, arg_name in zip(input_args, input_arg_names):
207
+ # short-cut for args already converted
208
+ if hasattr(arg, "__c_pointers__"):
209
+ exe_args.extend(arg.__c_pointers__())
210
+ continue
211
+
212
+ arg_type = args_spec.annotations.get(arg_name, None)
213
+
214
+ # Implicit cast to NumericMeta
215
+ if isinstance(arg_type, t.NumericMeta):
216
+ arg = t.cast(arg, arg_type)
217
+ else:
218
+ # If not any known type, try registered adapter to do the conversion
219
+ adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg))
220
+ if adapter:
221
+ arg = adapter(arg)
222
+ adapted_args.append(arg)
223
+
224
+ exe_args.extend(get_c_pointers(arg))
225
+
226
+ return exe_args, adapted_args
227
+
228
+ def __call__(self, *args, **kwargs):
229
+ exe_args, adapted_args = self.generate_execution_args(
230
+ args, kwargs, self.args_spec
231
+ )
232
+
233
+ self.run_compiled_program(exe_args)
234
+
235
+ # Assume each execution args has type `c_void_p` to reduce the overhead of `ctypes.cast`.
236
+ def get_invoke_packed_args(self, exe_args):
237
+ if self.cuda_modules:
238
+ exe_args += self.cuda_modules.args
239
+ packed_args = (ctypes.c_void_p * len(exe_args))()
240
+ for argNum in range(len(exe_args)):
241
+ packed_args[argNum] = exe_args[argNum]
242
+ return packed_args
243
+
244
+ def run_compiled_program(self, exe_args):
245
+ if self.jit_time_profiling:
246
+ profiler = timer(enable=True)
247
+ try:
248
+ packed_args = profiler(self.get_invoke_packed_args)(exe_args)
249
+ profiler(self.capi_func)(packed_args)
250
+ except Exception as e:
251
+ raise DSLRuntimeError(f"💥💥💥 Runtime Crash 💥💥💥", cause=e)
252
+ else:
253
+ try:
254
+ packed_args = self.get_invoke_packed_args(exe_args)
255
+ self.capi_func(packed_args)
256
+ except Exception as e:
257
+ raise DSLRuntimeError(f"💥💥💥 Runtime Crash 💥💥💥", cause=e)
258
+
259
+ def update_jit_cuda_modules(self, kernel_symbols):
260
+ # preload cuda module from compiled cubin in ir and store to jit_executor.kernels.
261
+ if len(kernel_symbols) > 0:
262
+ extra_args = []
263
+ module = self.ir_module
264
+ cuda_kernel_cache = dict()
265
+ cuda_driver_version = cuda_helpers.get_driver_version()
266
+ for sym in kernel_symbols:
267
+ if sym not in cuda_kernel_cache:
268
+ log().debug(f"Loading CUDA module for symbol: {sym}")
269
+
270
+ # load cuda module/get function pointer from module and cache
271
+ def walk_callback(sym, func_sym, cubin_data):
272
+ cubin_module = cuda_helpers.load_cubin_module_data(cubin_data)
273
+ kernel_ptr = cuda_helpers.get_kernel_function(
274
+ cubin_module, func_sym
275
+ )
276
+ # Enable non-portable cluster size for CUDA version 11.8 or higher.
277
+ if cuda_driver_version >= 11080:
278
+ cuda_helpers.set_kernel_attribute(
279
+ kernel_ptr,
280
+ cuda_helpers.cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED,
281
+ 1,
282
+ )
283
+ cuda_kernel_cache[sym] = CudaSingleModule(
284
+ cubin_module, kernel_ptr
285
+ )
286
+
287
+ self.walk_module_and_get_cubin_data(module, sym, walk_callback)
288
+ else:
289
+ log().debug(f"Symbol {sym} already in cache")
290
+ # check if kernel is empty.
291
+ if sym in cuda_kernel_cache:
292
+ extra_args.append(
293
+ ctypes.c_void_p(cuda_kernel_cache[sym].kernel_ptr.getPtr())
294
+ )
295
+ # store to the jit result if jit result is cached.
296
+ self.cuda_modules = CudaModules(cuda_kernel_cache.values(), extra_args)
297
+
298
+ return self
299
+
300
+ def _get_escaped_cubin_bytes(self, cubin_data):
301
+ """This function escapes cubin data from mlir raw bytecode to executable binary bytes"""
302
+
303
+ def ishex(inp):
304
+ return (
305
+ inp in range(0x30, 0x3A)
306
+ or inp in range(0x61, 0x67)
307
+ or inp in range(0x41, 0x47)
308
+ )
309
+
310
+ converted = bytearray()
311
+ idx = 0
312
+ while idx < len(cubin_data):
313
+ # escape the original bytes
314
+ if cubin_data[idx] == 0x5C:
315
+ # if data of idx is b'\\'
316
+ if ishex(cubin_data[idx + 1]) and ishex(cubin_data[idx + 2]):
317
+ converted += bytearray.fromhex(
318
+ cubin_data[idx + 1 : idx + 3].decode()
319
+ )
320
+ idx += 3
321
+ elif cubin_data[idx + 1] == 0x5C:
322
+ converted.append(cubin_data[idx])
323
+ idx += 2
324
+ else:
325
+ # no escape, directly write
326
+ converted.append(cubin_data[idx])
327
+ idx += 1
328
+ return bytes(converted)
329
+
330
+ def walk_module_and_get_cubin_data(self, module, sym, callback):
331
+ """This function is used to walk gpu binary op, extract the cubin inside, and process cubin data with callback."""
332
+
333
+ def walk_gpu_binary_op(op):
334
+ if op.name != "gpu.binary":
335
+ return ir.WalkResult.ADVANCE
336
+ s = io.BytesIO()
337
+ op.write_bytecode(s)
338
+ cubin_data = s.getvalue()
339
+ if sym.encode() not in cubin_data:
340
+ return ir.WalkResult.ADVANCE
341
+
342
+ if (
343
+ "kernels" != op.opview.sym_name.value
344
+ and sym != op.opview.sym_name.value
345
+ ):
346
+ return ir.WalkResult.ADVANCE
347
+ # function symbol of kernel(gpu.launch_func) is equal to sym name in mlir
348
+ func_sym = sym
349
+ if sym == op.opview.sym_name.value and not sym.endswith("_kernel"):
350
+ func_sym = sym.rsplit("_", 1)[0]
351
+
352
+ cubin_data = cubin_data.split(b'bin = "')[1].split(b'">')[0]
353
+ cubin_data = self._get_escaped_cubin_bytes(cubin_data)
354
+ callback(sym, func_sym, cubin_data)
355
+ return ir.WalkResult.ADVANCE
356
+
357
+ module.operation.walk(walk_gpu_binary_op)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides a runtime utility functions that are needed for
14
+ the DSL.
15
+ """
16
+
17
+ from . import dlpack_types
18
+ from . import cuda
19
+ from . import jit_arg_adapters
20
+
21
+ __all__ = [
22
+ "dlpack_types",
23
+ "cuda",
24
+ "jit_arg_adapters",
25
+ ]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/cuda.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides CUDA Python helper functions
14
+ """
15
+
16
+
17
+ from functools import lru_cache
18
+ from dataclasses import dataclass
19
+ from typing import List, Optional
20
+ import numpy as np
21
+ import os
22
+ import ctypes
23
+
24
+ import cuda.bindings.driver as cuda
25
+ import cuda.bindings.nvrtc as nvrtc
26
+
27
+ # MLIR imports
28
+ from ..._mlir import ir
29
+ from ..._mlir.dialects import gpu
30
+
31
+ # Local module imports
32
+ from ..utils.logger import log as _log
33
+ from ..common import *
34
+ from .jit_arg_adapters import JitArgAdapterRegistry
35
+
36
+
37
+ # =============================================================================
38
+ # Utils
39
+ # =============================================================================
40
+
41
+
42
def _cudaGetErrorEnum(error):
    """Map a CUDA driver or NVRTC result code to its symbolic error name."""
    if isinstance(error, cuda.CUresult):
        status, name = cuda.cuGetErrorName(error)
        if status == cuda.CUresult.CUDA_SUCCESS:
            return name
        return "<unknown>"
    if isinstance(error, nvrtc.nvrtcResult):
        return nvrtc.nvrtcGetErrorString(error)[1]
    raise DSLRuntimeError("Unknown error type: {}".format(error))
50
+
51
+
52
+ def _get_gpu_arch_info(major, minor):
53
+ """Get GPU architecture information and compatibility details."""
54
+ gpu_arch_map = {
55
+ (7, 0): ("Volta", "sm_70", ["sm_70"]), # V100
56
+ (7, 5): ("Turing", "sm_75", ["sm_75"]), # RTX 20 Series, Quadro RTX
57
+ (8, 0): ("Ampere", "sm_80", ["sm_80"]), # A100
58
+ (8, 6): ("Ampere", "sm_86", ["sm_86", "sm_80"]), # RTX 30 Series
59
+ (8, 9): ("Ada", "sm_89", ["sm_89", "sm_86"]), # RTX 40 Series
60
+ (8, 7): ("Ampere", "sm_87", ["sm_87", "sm_86", "sm_80"]), # A10, A40
61
+ (9, 0): ("Hopper", "sm_90a", ["sm_90a"]), # H100
62
+ (10, 0): ("Blackwell", "sm_100a", ["sm_100a"]), # B200
63
+ }
64
+ return gpu_arch_map.get(
65
+ (major, minor), ("Unknown", f"sm_{major}{minor}", [f"sm_{major}{minor}"])
66
+ )
67
+
68
+
69
def get_compute_capability_major_minor(device_id: int = 0):
    """
    Returns the compute capability of the CUDA device as a tuple of (major, minor).
    For example: (8, 0) for Ampere, (9, 0) for Hopper, (10, 0) for Blackwell.
    Returns (None, None) on failure.
    """
    try:
        # cuInit is safe to call repeatedly; ensure the driver is initialized
        # before querying device attributes.
        checkCudaErrors(cuda.cuInit(0))
        device = checkCudaErrors(cuda.cuDeviceGet(device_id))
        major = checkCudaErrors(
            cuda.cuDeviceGetAttribute(
                cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                device,
            )
        )
        minor = checkCudaErrors(
            cuda.cuDeviceGetAttribute(
                cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                device,
            )
        )
        return major, minor
    except RuntimeError as e:
        # NOTE(review): only RuntimeError (and subclasses) are swallowed here —
        # confirm DSLCudaRuntimeError raised by checkCudaErrors derives from it.
        _log().info(f"Failed to get CUDA compute capability: {e}")
        return None, None
94
+
95
+
96
@dataclass
class DeviceInfo:
    """Data class to store CUDA device information.

    All fields are best-effort: probes that fail leave the field at its
    default rather than raising.
    """

    # Number of CUDA devices visible to the driver (0 on failure).
    device_count: int = 0
    # Ordinal of the device bound to the current context.
    current_device: int = 0
    # Marketing name reported by the driver, if queryable.
    device_name: Optional[str] = None
    # Compute capability major/minor, e.g. (9, 0) for Hopper.
    major_version: Optional[int] = None
    minor_version: Optional[int] = None
    # Human-readable architecture name ("Hopper", ...).
    arch_name: Optional[str] = None
    # Primary SM arch string ("sm_90a", ...).
    sm_arch: Optional[str] = None
    # SM archs compiled code may target for this device.
    compatible_archs: Optional[List[str]] = None
    # Total device memory in GB, if queryable.
    memory_gb: Optional[float] = None
    # Arch chosen as compilation target, if any.
    target_arch: Optional[str] = None
    # Error text captured while probing, if any.
    error_message: Optional[str] = None
    # True when cuInit itself failed.
    initialization_failed: bool = False

    def pretty_str(self) -> str:
        """
        Convert DeviceInfo to a formatted string for display.

        Error states short-circuit to a single line; otherwise the available
        fields are appended one per line with ANSI color markup.
        """
        info = ""

        if self.initialization_failed:
            return f"{Colors.BOLD}- CUDA initialization failed{Colors.RESET}"

        if self.error_message:
            return f"{Colors.BOLD}- Failed to get GPU info: {self.error_message}{Colors.RESET}"

        if self.device_count > 0:
            info += f"{Colors.BOLD}- CUDA devices available: {self.device_count} (current: {self.current_device})\n"

            if self.major_version is not None and self.minor_version is not None:
                info += f"- Architecture: {Colors.BLUE}{self.arch_name}{Colors.RESET} ({Colors.GREEN}{self.sm_arch}{Colors.RESET})\n"
                info += f"- Compatible SM archs: {Colors.GREEN}{', '.join(self.compatible_archs or [])}{Colors.RESET}\n"

                if self.memory_gb is not None:
                    info += f"- Total Memory: {Colors.BLUE}{self.memory_gb:.2f} GB{Colors.RESET}\n"

            else:
                info += f"- Compute capability: unknown\n"
                info += f"- SM arch: unknown{Colors.RESET}\n"
        else:
            info += f"- No devices available\n"

        return info
142
+
143
+
144
def get_device_info() -> DeviceInfo:
    """
    Get detailed information about CUDA devices.
    Returns a DeviceInfo dataclass with device information.

    Every probe is best-effort: a failing query leaves the corresponding
    DeviceInfo field at its default instead of raising.  Bare ``except:``
    clauses were narrowed to ``except Exception:`` so SystemExit and
    KeyboardInterrupt are no longer swallowed.
    """
    device_info = DeviceInfo()

    # Initialize CUDA if not already initialized
    try:
        result = cuda.cuInit(0)
        if result[0].value:  # Non-zero CUresult => initialization failed
            device_info.initialization_failed = True
            return device_info
    except Exception:
        # Bindings unavailable or raised; let the probes below record
        # whatever they can.
        pass

    try:
        # Get device count
        result = cuda.cuDeviceGetCount()
        device_info.device_count = result[1] if result[0].value == 0 else 0

        if device_info.device_count > 0:
            # Get current device
            try:
                result = cuda.cuCtxGetDevice()
                if result[0].value == 0:
                    device_info.current_device = result[1]
            except Exception:
                pass

            # Get device name
            try:
                name_result = cuda.cuDeviceGetName(100, device_info.current_device)
                if name_result[0].value == 0:
                    device_info.device_name = name_result[1]
            except Exception:
                pass

            # Get compute capability and architecture info
            try:
                major, minor = get_compute_capability_major_minor(
                    device_info.current_device
                )

                # Check if we successfully got the compute capability
                if major is not None and minor is not None:
                    device_info.major_version = major
                    device_info.minor_version = minor

                    arch_name, sm_arch, compatible_archs = _get_gpu_arch_info(
                        device_info.major_version, device_info.minor_version
                    )

                    device_info.arch_name = arch_name
                    device_info.sm_arch = sm_arch
                    device_info.compatible_archs = compatible_archs

                    # Get memory info
                    try:
                        # NOTE(review): queried via cuDeviceGetAttribute with
                        # CU_DEVICE_ATTRIBUTE_TOTAL_MEMORY — confirm this
                        # attribute exists; cuDeviceTotalMem is the usual API.
                        total_mem = cuda.cuDeviceGetAttribute(
                            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TOTAL_MEMORY,
                            device_info.current_device,
                        )
                        if total_mem[0].value == 0:
                            device_info.memory_gb = total_mem[1] / (
                                1024 * 1024 * 1024
                            )  # Convert to GB
                    except Exception:
                        pass

            except Exception:
                pass  # Compute capability info will remain None

    except Exception as e:
        device_info.error_message = str(e)

    return device_info
221
+
222
+
223
def checkCudaErrors(result):
    """Validate a cuda-python call result tuple and unwrap its payload.

    Raises DSLCudaRuntimeError on a non-zero status; otherwise returns None,
    the single payload value, or the payload tuple.
    """
    status = result[0]
    if status.value:
        raise DSLCudaRuntimeError(status.value, _cudaGetErrorEnum(status))

    payload = result[1:]
    if not payload:
        return None
    if len(payload) == 1:
        return payload[0]
    return payload
237
+
238
+
239
+ # =============================================================================
240
+ # Driver Helpers
241
+ # =============================================================================
242
+
243
+
244
@lru_cache(maxsize=1)
def initialize_cuda_context(device_id: int = 0, flags: int = 0):
    """
    Initializes the CUDA context for a specified device.

    The driver is initialized, the device handle fetched, and a context
    created on it.  Cached so repeated calls return the same context.
    """
    _log().info(f"cuInit {flags}")
    checkCudaErrors(cuda.cuInit(flags))

    _log().info(f"cuDeviceGet {device_id}")
    cu_device = checkCudaErrors(cuda.cuDeviceGet(device_id))
    _log().info(f"{cu_device} <-- cuDeviceGet")

    _log().info(f"cuCtxCreate {0} {cu_device}")
    if cuda.CUDA_VERSION >= 13000:
        # Use cuCtxCreate_v4 API with explicit CUctxCreateParams None, since v2
        # and v3 API has been removed from CTK 13.
        # See https://github.com/NVIDIA/cuda-python/pull/792
        context = checkCudaErrors(cuda.cuCtxCreate(None, 0, cu_device))
    else:
        context = checkCudaErrors(cuda.cuCtxCreate(0, cu_device))
    _log().info(f"{context} <-- cuCtxCreate")

    return context
268
+
269
+
270
def load_cubin_module(cubin_file):
    """
    Loads a CUBIN file and returns the module.

    The file contents are staged into a single host buffer that stays alive
    across the cuModuleLoadData call; the previous implementation built two
    separate numpy buffers, logging the address of one while loading from the
    other.
    """
    # Load CUBIN file as binary data
    _log().info(f"read cubin {cubin_file}")
    with open(cubin_file, "rb") as f:
        cubin_data = f.read()
    # Stage once; log and load the same host address.
    host_array = np.char.array(cubin_data)
    _log().info(f"cuModuleLoadData {host_array.ctypes.data}")
    module = checkCudaErrors(cuda.cuModuleLoadData(host_array.ctypes.data))
    return module
284
+
285
+
286
def unload_cubin_module(module):
    """
    Unloads a CUBIN module.

    :param module: CUmodule handle previously returned by load_cubin_module
        or load_cubin_module_data.
    """
    _log().info(f"cuModuleUnload {module}")
    checkCudaErrors(cuda.cuModuleUnload(module))
292
+
293
+
294
def load_cubin_module_data(cubin_data):
    """
    Loads a CUBIN from data and returns the module.

    The bytes are staged into one host buffer kept alive across the
    cuModuleLoadData call; previously a second, distinct numpy buffer was
    created for the load while the logged address belonged to the first.
    """
    host_array = np.char.array(cubin_data)
    _log().info(f"cuModuleLoadData {host_array.ctypes.data}")
    module = checkCudaErrors(cuda.cuModuleLoadData(host_array.ctypes.data))
    return module
304
+
305
+
306
def get_kernel_function(module, kernel_name):
    """
    Retrieves the kernel function from the module.

    :param module: loaded CUmodule handle.
    :param kernel_name: kernel symbol name (str); encoded to UTF-8 for the driver.
    :return: CUfunction handle for the kernel.
    """
    _log().info(f"cuModuleGetFunction {module} {kernel_name}")
    kernel = checkCudaErrors(
        cuda.cuModuleGetFunction(module, bytes(kernel_name, "utf-8"))
    )
    _log().info(f"{kernel} <-- cuModuleGetFunction")
    return kernel
316
+
317
+
318
def launch_kernel(kernel, grid_dims, block_dims, stream, smem_size, kernel_args=None):
    """
    Launches the CUDA kernel.

    :param kernel: CUfunction handle obtained via get_kernel_function.
    :param grid_dims: (x, y, z) grid dimensions.
    :param block_dims: (x, y, z) thread-block dimensions.
    :param stream: CUstream to enqueue the launch on.
    :param smem_size: dynamic shared memory size in bytes.
    :param kernel_args: packed kernel parameter pointers, or None.
    """
    _log().info(
        f"cuLaunchKernel {kernel} grid={grid_dims} blocks={block_dims} smem_size={smem_size} stream={stream} {kernel_args}"
    )
    checkCudaErrors(
        cuda.cuLaunchKernel(
            kernel,
            grid_dims[0],
            grid_dims[1],
            grid_dims[2],
            block_dims[0],
            block_dims[1],
            block_dims[2],
            smem_size,  # Shared memory size
            stream,
            kernel_args,
            0,  # Extra parameters
        )
    )
340
+
341
+
342
def stream_sync(stream):
    """
    Synchronizes the CUDA stream.

    Blocks the host until all work previously enqueued on ``stream`` completes.
    """
    _log().info(f"cuStreamSynchronize {stream}")
    checkCudaErrors(cuda.cuStreamSynchronize(stream))
348
+
349
+
350
def stream_create(id=0):
    """
    Creates the CUDA stream.

    :param id: forwarded to cuStreamCreate; in the driver API this position is
        the stream creation *flags*.
    :return: new CUstream handle.
    """
    # NOTE(review): `id` shadows the builtin and is passed as cuStreamCreate's
    # flags parameter, not a stream ordinal — confirm callers pass flags.
    _log().info(f"cuStreamCreate {id}")
    stream = checkCudaErrors(cuda.cuStreamCreate(id))
    _log().info(f"{stream} <-- cuStreamCreate")
    return stream
358
+
359
+
360
def stream_destroy(stream):
    """
    Destroys the CUDA stream.

    :param stream: CUstream handle previously returned by stream_create.
    """
    _log().info(f"cuStreamDestroy {stream}")
    checkCudaErrors(cuda.cuStreamDestroy(stream))
366
+
367
+
368
def context_destroy(context):
    """
    Destroys the CUDA context.

    :param context: CUcontext handle, e.g. from initialize_cuda_context.
    """
    _log().info(f"cuCtxDestroy {context}")
    checkCudaErrors(cuda.cuCtxDestroy(context))
374
+
375
+
376
def allocate(size_in_bytes: int, stream=None):
    """
    Allocate device memory based on numpy host array size.

    :param size_in_bytes: number of bytes to allocate.
    :param stream: if given, uses the stream-ordered cuMemAllocAsync path.
    :return: device pointer to the allocation.
    """
    _log().info("Allocate size_in_bytes=[%s] stream=[%s]", size_in_bytes, stream)
    if stream is None:
        device_memory = checkCudaErrors(cuda.cuMemAlloc(size_in_bytes))
    else:
        device_memory = checkCudaErrors(cuda.cuMemAllocAsync(size_in_bytes, stream))
    _log().info("Allocated [%s]", device_memory)
    return device_memory
387
+
388
+
389
def deallocate(device_pointer, stream=None):
    """
    Deallocate the specified device memory pointer.

    :param device_pointer: device pointer previously returned by allocate.
    :param stream: if given, uses the stream-ordered cuMemFreeAsync path.
    """
    _log().info(
        "Deallocate device_pointer=[%s] stream=[%s]", hex(int(device_pointer)), stream
    )
    if stream is None:
        checkCudaErrors(cuda.cuMemFree(device_pointer))
    else:
        checkCudaErrors(cuda.cuMemFreeAsync(device_pointer, stream))
400
+
401
+
402
def memcpy_h2d(host_pointer, device_pointer, size_in_bytes, stream=None):
    """
    Copy data from host to device memory.

    :param host_pointer: source host address (int).
    :param device_pointer: destination device pointer.
    :param size_in_bytes: number of bytes to copy.
    :param stream: if given, copies asynchronously on that stream.
    """
    _log().info(
        "Copy host-to-device host_pointer[%s] device_ptr=[%s] size_in_bytes=[%s] stream=[%s]",
        hex(host_pointer),
        hex(int(device_pointer)),
        size_in_bytes,
        stream,
    )
    if stream is None:
        checkCudaErrors(cuda.cuMemcpyHtoD(device_pointer, host_pointer, size_in_bytes))
    else:
        checkCudaErrors(
            cuda.cuMemcpyHtoDAsync(device_pointer, host_pointer, size_in_bytes, stream)
        )
419
+
420
+
421
def memcpy_d2h(host_pointer, device_pointer, size_in_bytes, stream=None):
    """
    Copy data from device to host memory.

    :param host_pointer: destination host address (int).
    :param device_pointer: source device pointer.
    :param size_in_bytes: number of bytes to copy.
    :param stream: if given, copies asynchronously on that stream.
    """
    # NOTE(review): the log label "device-host-to" looks garbled; likely
    # intended as "device-to-host".
    _log().info(
        "Copy device-host-to device_pointer=[%s] host_pointer[%s] size_in_bytes=[%s] stream=[%s]",
        hex(int(device_pointer)),
        hex(host_pointer),
        size_in_bytes,
        stream,
    )
    if stream is None:
        checkCudaErrors(cuda.cuMemcpyDtoH(host_pointer, device_pointer, size_in_bytes))
    else:
        checkCudaErrors(
            cuda.cuMemcpyDtoHAsync(host_pointer, device_pointer, size_in_bytes, stream)
        )
438
+
439
+
440
def default_stream():
    """Return the default CUDA stream (handle 0)."""
    return cuda.CUstream(0)
442
+
443
+
444
def get_driver_version():
    """
    Returns the CUDA driver version.

    :return: integer driver version (e.g. 11080 for CUDA 11.8).
    """
    return checkCudaErrors(cuda.cuDriverGetVersion())
449
+
450
+
451
def set_kernel_attribute(kernel, attribute, value):
    """
    Sets a CUDA kernel attribute.

    :param kernel: CUfunction handle.
    :param attribute: a cuda.CUfunction_attribute enum value.
    :param value: integer attribute value to set.
    """
    return checkCudaErrors(cuda.cuFuncSetAttribute(kernel, attribute, value))
456
+
457
+
458
@JitArgAdapterRegistry.register_jit_arg_adapter(cuda.CUstream)
class StreamAdapter:
    """
    Convert a CUDA stream to a stream representation for JIT arg generation.
    """

    def __init__(self, arg):
        # The wrapped cuda.CUstream.
        self._arg = arg
        # Raw pointer handed to the compiled program.
        self._c_pointer = self._arg.getPtr()

    def __new_from_mlir_values__(self, values):
        # Rebuild the Python-side value from MLIR values; streams map 1:1.
        assert len(values) == 1
        return values[0]

    def __c_pointers__(self):
        # C pointers this argument contributes to the packed argument list.
        return [self._c_pointer]

    def __get_mlir_types__(self):
        # A stream is modeled as a GPU async token in the IR.
        return [gpu.AsyncTokenType.get()]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/device_tensor.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import copy
13
+
14
+ from . import cuda as cuda_helpers
15
+ from .tensor_descriptor import *
16
+ from ..common import *
17
+
18
+
19
def allocate(tensor: TensorDescriptor, stream=None):
    """
    Allocates GPU memory
    """
    # Framework-managed (already on-GPU) tensors must not be re-allocated.
    if tensor._check_is_managed_by_framework():
        raise DSLRuntimeError(
            "GPU tensors are managed by the framework and cannot be modified."
        )
    if tensor.device_pointer is not None:
        raise DSLRuntimeError("Tensor is already allocated on the device.")

    device_ptr = cuda_helpers.allocate(tensor.size_in_bytes, stream)
    tensor.device_pointer = device_ptr

    log().info("Allocate done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer)
33
+
34
+
35
def deallocate(tensor: TensorDescriptor, stream=None):
    """
    Deallocates GPU memory
    """
    # Framework-managed (already on-GPU) tensors must not be freed here.
    if tensor._check_is_managed_by_framework():
        raise DSLRuntimeError(
            "GPU tensors are managed by the framework and cannot be modified."
        )
    if tensor.device_pointer is None:
        raise DSLRuntimeError("Tensor is not allocated on the device.")

    log().info(
        "Deallocating done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer
    )

    device_ptr = tensor.device_pointer
    cuda_helpers.deallocate(device_ptr, stream)
    # Clear the pointer so the descriptor reads as host-resident again.
    tensor.device_pointer = None
52
+
53
+
54
def copy_to_gpu(tensor: TensorDescriptor, do_allocate=True, stream=None):
    """
    Copies data from host memory to the GPU memory.
    If do_allocate is True, it first calls allocate
    """
    log().info("copyin tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer)

    # Optionally allocate device memory first; allocate() validates state.
    if do_allocate:
        allocate(tensor, stream)

    source = tensor.data_ptr
    destination = tensor.device_pointer
    cuda_helpers.memcpy_h2d(source, destination, tensor.size_in_bytes, stream)

    log().info("copyin done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer)
    return tensor
67
+
68
+
69
def copy_from_gpu(tensor: TensorDescriptor, do_deallocate=True, stream=None):
    """
    Copies data from GPU memory back to the host.
    If do_deallocate is True, it calls deallocate
    """
    log().info("copyout tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer)

    # Framework-managed tensors and unallocated tensors cannot be copied out.
    if tensor._check_is_managed_by_framework():
        raise DSLRuntimeError(
            "GPU tensors are managed by the framework and cannot be modified."
        )
    if tensor.device_pointer is None:
        raise DSLRuntimeError("Tensor is not allocated on the device.")

    destination = tensor.data_ptr
    source = tensor.device_pointer
    cuda_helpers.memcpy_d2h(destination, source, tensor.size_in_bytes, stream)

    if do_deallocate:
        deallocate(tensor, stream)

    log().info("copyout done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer)
88
+
89
+
90
def to_gpu(tensor, stream=None) -> TensorDescriptor:
    """
    Copies the tensor to the GPU memory from Host memory
    """
    # Already a descriptor: shallow-copy so the caller's object is untouched.
    if isinstance(tensor, TensorDescriptor):
        descriptor = copy.copy(tensor)
        copy_to_gpu(descriptor, stream=stream)
        return descriptor

    # Any DLPack-capable object: wrap it in a fresh descriptor first.
    if TensorDescriptor.can_transformed_to_dlpack(tensor):
        descriptor = TensorDescriptor(tensor)
        copy_to_gpu(descriptor, stream=stream)
        return descriptor

    raise DSLRuntimeError("Unsupported type")
105
+
106
+
107
def from_gpu(tensor, stream=None) -> TensorDescriptor:
    """
    Copies the tensor from GPU memory back to Host memory.

    Accepts either a TensorDescriptor or any object implementing the DLPack
    protocol; returns a TensorDescriptor whose host buffer holds the data
    copied back from the device.
    """
    if isinstance(tensor, TensorDescriptor):
        new_tensor = copy.copy(tensor)
        copy_from_gpu(new_tensor, stream=stream)
        return new_tensor

    if TensorDescriptor.can_transformed_to_dlpack(tensor):
        new_tensor = TensorDescriptor(tensor)
        copy_from_gpu(new_tensor, stream=stream)
        return new_tensor

    raise DSLRuntimeError("Unsupported type")
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/dlpack_types.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides helper structs for dlpack.
14
+ DLPack is an open standard for in-memory tensor structures, enabling
15
+ seamless sharing of tensors across different frameworks.
16
+ Learn more at: https://github.com/dmlc/dlpack
17
+ """
18
+
19
+ import ctypes
20
+ import enum
21
+
22
+
23
class DLDeviceType(enum.IntEnum):
    """Enums for device types based on the DLPack specification.

    Values match dlpack.h (kDLCPU=1, kDLCUDA=2, kDLCUDAHost=3); this module
    uses the legacy GPU/CPUPinned member names for the same codes.
    """

    kDLCPU = 1  # Host (CPU) memory
    kDLGPU = 2  # CUDA device memory (kDLCUDA in current DLPack headers)
    kDLCPUPinned = 3  # Page-locked host memory accessible by the device
29
+
30
+
31
class DLDataTypeCode(enum.IntEnum):
    """Enums for data type codes based on the DLPack specification.

    Declared as IntEnum (matching DLDeviceType above) so members compare
    equal to the raw integer codes stored in a DLTensor's dtype field.

    see https://github.com/dmlc/dlpack/blob/main/include/dlpack/dlpack.h
    """

    kDLInt = 0
    kDLUInt = 1
    kDLFloat = 2
    kDLOpaqueHandle = 3
    kDLBfloat = 4
    kDLComplex = 5
    kDLBool = 6
44
+
45
+
46
class DLDevice(ctypes.Structure):
    """Structure representing the device information in DLPack.

    Field order and types mirror the C struct DLDevice in dlpack.h and must
    not be changed (ABI).
    """

    _fields_ = [
        ("device_type", ctypes.c_int),  # kDLCPU, kDLGPU, etc.
        ("device_id", ctypes.c_int),  # Device ID (e.g., GPU ID)
    ]
53
+
54
+
55
class DLDataType(ctypes.Structure):
    """Structure representing the data type in DLPack.

    Field order and types mirror the C struct DLDataType in dlpack.h and must
    not be changed (ABI).
    """

    _fields_ = [
        ("code", ctypes.c_uint8),  # Data type code (e.g., kDLFloat)
        ("bits", ctypes.c_uint8),  # Number of bits per value
        ("lanes", ctypes.c_uint16),  # Number of lanes
    ]
63
+
64
+
65
class DLTensor(ctypes.Structure):
    """Structure representing the DLTensor in DLPack.

    Field order and types mirror the C struct DLTensor in dlpack.h and must
    not be changed (ABI). `strides` may be a NULL pointer for compact
    row-major tensors per the DLPack spec.
    """

    _fields_ = [
        ("data", ctypes.c_void_p),  # Pointer to tensor data
        ("device", DLDevice),  # Device info
        ("ndim", ctypes.c_int),  # Number of dimensions
        ("dtype", DLDataType),  # Data type
        ("shape", ctypes.POINTER(ctypes.c_int64)),  # Shape of tensor
        ("strides", ctypes.POINTER(ctypes.c_int64)),  # Strides of tensor
        ("byte_offset", ctypes.c_uint64),  # Byte offset to tensor data
    ]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides runtime utilities for JIT argument conversion in DSL.
14
+ """
15
+
16
+ from functools import wraps
17
+ from typing import get_origin
18
+
19
+ # Local modules imports
20
+ from ..common import DSLRuntimeError
21
+ from ..typing import (
22
+ Constexpr,
23
+ Int32,
24
+ Float32,
25
+ Boolean,
26
+ )
27
+
28
+
29
def is_arg_spec_constexpr(arg_spec, arg_name, arg_index, owning_func):
    """
    Check if the argument spec is a constexpr.

    True when the argument is a reserved first parameter (`self`/`cls`), is
    annotated with the Constexpr type, or is a Constexpr generic alias.
    """

    def _is_reserved_python_func_arg(arg_index, arg_name, func):
        """
        Check if the argument is a reserved python function argument.
        """
        if arg_index != 0:
            return False
        if arg_name == "self":
            return True
        underlying = getattr(func, "__func__", None)
        is_classmethod = isinstance(func, classmethod) or isinstance(
            underlying, classmethod
        )
        return arg_name == "cls" and is_classmethod

    if _is_reserved_python_func_arg(arg_index, arg_name, owning_func):
        return True
    if isinstance(arg_spec, type) and issubclass(arg_spec, Constexpr):
        return True
    return get_origin(arg_spec) is Constexpr
55
+
56
+
57
def is_argument_constexpr(arg, arg_spec, arg_name, arg_index, owning_func):
    """
    Check if the argument is a constexpr.

    An argument is treated as constexpr when its spec is constexpr, when the
    value itself is a type (e.g. passed as Type[X]), or when it is None.
    """

    def _is_type_argument(candidate, annotation):
        """
        Check if the argument is a type argument like Type[X]
        """
        if not isinstance(candidate, type):
            return False
        return annotation is None or get_origin(annotation) is type

    if is_arg_spec_constexpr(arg_spec, arg_name, arg_index, owning_func):
        return True
    if _is_type_argument(arg, arg_spec):
        return True
    return arg is None
76
+
77
+
78
class JitArgAdapterRegistry:
    """
    A registry to keep track of the JIT argument adapters.

    An adapter is a callable that converts a Python type to a type with following protocols supported:
    - JitArgument
    - DynamicExpression
    The converted type can then be further processed by DSL to generate arguments for JIT functions.
    """

    # A dictionary with key=type and value=callable
    jit_arg_adapter_registry = {}

    @classmethod
    def register_jit_arg_adapter(cls, *dargs, **dkwargs):
        """
        Register a JIT argument adapter callable

        This can be used as a decorator on any callable like:

            @register_jit_arg_adapter(my_py_type)
            def my_adapter_for_my_py_type(arg):
                ...

            @register_jit_arg_adapter(my_py_type)
            class MyAdapterForMyPythonType:
                ...

        The adapters are registered per type. If a type is already registered, an error will be raised.

        :raises DSLRuntimeError: if no type is given, the decorated object is
            not callable, or an adapter for the type is already registered.
        """
        # Fixed: the previous implementation wrapped the returned decorator
        # with functools.wraps(<the registered type>), copying a type's
        # metadata onto the decorator for no benefit, behind an extra layer
        # of indirection. The decorator is now built directly.
        if len(dargs) == 0:
            raise DSLRuntimeError(
                "a Python type must be provided for registering JIT argument adapter"
            )
        darg_python_ty = dargs[0]

        def wrapper(*args, **kwargs):
            # The decorated object (function or class) is the adapter itself.
            if len(args) != 1 or not callable(args[0]):
                raise DSLRuntimeError(
                    "a callable must be provided for registering JIT argument adapter"
                )
            adapter = args[0]

            if darg_python_ty in cls.jit_arg_adapter_registry:
                raise DSLRuntimeError(
                    f"JIT argument adapter for {darg_python_ty} is already registered!",
                    context={
                        "Registered adapter": cls.jit_arg_adapter_registry[
                            darg_python_ty
                        ],
                        "Adapter to be registered": adapter,
                    },
                )
            cls.jit_arg_adapter_registry[darg_python_ty] = adapter
            # Return the adapter unchanged so the decorated name is preserved.
            return adapter

        return wrapper

    @classmethod
    def get_registered_adapter(cls, ty):
        """
        Get the registered JIT argument adapter for the given type.

        :return: the adapter callable, or None when no adapter is registered.
        """
        return cls.jit_arg_adapter_registry.get(ty, None)
148
+
149
+
150
+ # =============================================================================
151
+ # JIT Argument Adapters
152
+ # =============================================================================
153
+
154
+
155
@JitArgAdapterRegistry.register_jit_arg_adapter(int)
@JitArgAdapterRegistry.register_jit_arg_adapter(float)
@JitArgAdapterRegistry.register_jit_arg_adapter(bool)
def _convert_python_scalar(arg):
    """
    Convert a Python scalar to a DSL type.
    """
    # Exact-type lookup (type(arg), not isinstance) so bool — a subclass of
    # int — maps to Boolean rather than Int32.
    scalar_to_dsl_type = {
        bool: Boolean,
        int: Int32,
        float: Float32,
    }
    dsl_type = scalar_to_dsl_type.get(type(arg))
    return dsl_type(arg)
168
+
169
+
170
@JitArgAdapterRegistry.register_jit_arg_adapter(tuple)
@JitArgAdapterRegistry.register_jit_arg_adapter(list)
def _convert_python_sequence(arg):
    """
    Go through each element in the sequence and convert it to a type that can be
    further processed by DSL to generate the corresponding JIT argument(s).
    """

    def _convert_one(element):
        # Elements without a registered adapter pass through unchanged.
        adapter = JitArgAdapterRegistry.get_registered_adapter(type(element))
        return element if adapter is None else adapter(element)

    adapted_elements = [_convert_one(element) for element in arg]
    assert len(adapted_elements) == len(arg)
    # Rebuild with the original container type (tuple stays tuple, etc.).
    return type(arg)(adapted_elements)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ # Helpers
13
+ import itertools, operator
14
+ import ctypes
15
+ from . import dlpack_types as _dpack
16
+ from .dlpack_runtime import (
17
+ dlpack_to_tensor_desc,
18
+ get_tensor_desc_data_ptr,
19
+ get_tensor_desc_is_in_device,
20
+ get_tensor_desc_element_type,
21
+ get_tensor_desc_shape,
22
+ get_tensor_desc_stride,
23
+ get_tensor_desc_element_size_in_bytes,
24
+ get_tensor_desc_ndim,
25
+ get_tensor_desc_dtype_code,
26
+ get_tensor_desc_dtype_bits,
27
+ get_tensor_desc_device_type,
28
+ get_tensor_desc_device_id,
29
+ )
30
+
31
+ from ..utils.logger import log
32
+ from ..common import *
33
+ from ..typing import (
34
+ Boolean,
35
+ Float8E5M2,
36
+ Int64,
37
+ Int32,
38
+ Int16,
39
+ Int8,
40
+ Uint64,
41
+ Uint32,
42
+ Uint16,
43
+ Uint8,
44
+ Float64,
45
+ Float32,
46
+ Float16,
47
+ BFloat16,
48
+ )
49
+
50
+
51
class TensorDescriptor:
    """Describes a tensor obtained through the DLPack protocol.

    Wraps any DLPack-capable object and exposes its metadata (data pointer,
    shape, strides, element type, device) via the DSL capsule helpers.
    """

    def __init__(self, tensor):
        """Initialize with a tensor that supports the DLPack protocol.

        Args:
            tensor: Any tensor object that implements __dlpack__ and __dlpack_device__
        """

        self.tensor = tensor
        self._capsule = dlpack_to_tensor_desc(tensor)

        self.data_ptr = get_tensor_desc_data_ptr(self._capsule)
        self.device_type = get_tensor_desc_device_type(self._capsule)
        self.device_type = _dpack.DLDeviceType(self.device_type)

        if self.device_type == _dpack.DLDeviceType.kDLGPU:
            # Tensor already lives on the device; its data pointer is the
            # device pointer.
            self.device_pointer = self.data_ptr
        elif self.device_type == _dpack.DLDeviceType.kDLCPU:
            self.device_pointer = None
        else:
            # Fixed: previously referenced the non-existent attribute
            # `self.dl_tensor`, which raised AttributeError here instead of
            # the intended DSLRuntimeError.
            raise DSLRuntimeError(
                f"DLPack device type is not supported {self.device_type}"
            )

        log().info("TensorDescriptor is created = [%s]", self)

    @staticmethod
    def can_transformed_to_dlpack(dl_tensor):
        """Return True if the object implements the DLPack protocol."""
        if not hasattr(dl_tensor, "__dlpack__") or not hasattr(
            dl_tensor, "__dlpack_device__"
        ):
            return False
        return True

    @property
    def is_in_device(self):
        """Check if the tensor is stored on a device."""
        return self.device_pointer is not None

    @property
    def device_id(self):
        """Return device id where tensor resides (-1 for host tensors)."""
        if self.is_in_device:
            return get_tensor_desc_device_id(self._capsule)
        return -1

    @property
    def element_type(self):
        """Return the corresponding Python type based on DLPack dtype metadata.

        :raises KeyError: if the DLPack element type has no DSL equivalent.
        """
        str_element_type = get_tensor_desc_element_type(self._capsule)
        dtype_map = {
            # bool is 8bit from numpy and torch
            "Bool": Boolean,
            "Int64": Int64,
            "Int32": Int32,
            "Int16": Int16,
            "Int8": Int8,
            "UInt64": Uint64,
            "UInt32": Uint32,
            "UInt16": Uint16,
            "UInt8": Uint8,
            "Float64": Float64,
            "Float32": Float32,
            "Float16": Float16,
            "BFloat16": BFloat16,
            "Float8E5M2": Float8E5M2,
        }

        if str_element_type not in dtype_map:
            raise KeyError(
                f"Unsupported element type in dlpack: '{str_element_type}'. Supported types are: {list(dtype_map.keys())}"
            )

        return dtype_map[str_element_type]

    @property
    def shape(self):
        """Return the shape of the tensor."""
        return get_tensor_desc_shape(self._capsule)

    @property
    def rank(self):
        """Return the rank of the tensor."""
        return get_tensor_desc_ndim(self._capsule)

    @property
    def strides(self):
        """Return the strides of the tensor."""
        # Fixed docstring: this property returns strides, not the rank.
        return get_tensor_desc_stride(self._capsule)

    @property
    def element_size_in_bytes(self):
        """Calculate the element size in bytes of the DLPack tensor."""
        return get_tensor_desc_element_size_in_bytes(self._capsule)

    @property
    def size_in_bytes(self):
        """Calculate the total size in bytes of the DLPack tensor."""
        # Calculate the number of elements using the shape
        ndim = get_tensor_desc_ndim(self._capsule)
        shape = get_tensor_desc_shape(self._capsule)
        num_elements = 1
        for i in range(ndim):
            num_elements *= shape[i]

        # Total bytes
        total_bytes = self.element_size_in_bytes * num_elements
        return total_bytes

    def __str__(self):
        """Return a compact string representation of the device_tensor with a tensor prefix."""
        # Extract shape
        shape = "x".join(map(str, self.shape))

        # Extract dtype
        dtype_code = get_tensor_desc_dtype_code(self._capsule)
        dtype_bits = get_tensor_desc_dtype_bits(self._capsule)
        # NOTE(review): non-int codes (uint, bfloat, bool, ...) all render as
        # "f<bits>" here; acceptable for logging, not a full dtype spelling.
        dtype = (
            f"i{dtype_bits}"
            if dtype_code == _dpack.DLDataTypeCode.kDLInt
            else f"f{dtype_bits}"
        )

        # Extract device
        device_type = "cpu" if not self.is_in_device else "gpu"

        return f"tensor<{shape}x{dtype}>_{device_type}"

    def _check_is_managed_by_framework(self):
        """
        Return True if the tensor is managed by the framework (GPU tensor).

        Fixed docstring: this predicate returns a bool; it does not raise.
        Callers raise DSLRuntimeError based on its result.
        """
        return self.device_type == _dpack.DLDeviceType.kDLGPU

    @staticmethod
    def is_compatible(maybe_tensor_descriptor) -> bool:
        """Check if the object is a TensorDescriptor or can be converted to one."""
        return isinstance(
            maybe_tensor_descriptor, TensorDescriptor
        ) or TensorDescriptor.can_transformed_to_dlpack(maybe_tensor_descriptor)
192
+
193
+
194
def from_tensor(tensor) -> TensorDescriptor:
    """Create a TensorDescriptor from a tensor object."""
    descriptor = TensorDescriptor(tensor)
    return descriptor
197
+
198
+
199
def to_tensor(tensor_descriptor: TensorDescriptor):
    """Return tensor object from tensor descriptor."""
    underlying = tensor_descriptor.tensor
    return underlying
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/typing.py ADDED
@@ -0,0 +1,1962 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import ctypes
13
+ import numpy as np
14
+ import operator
15
+ from typing_extensions import deprecated
16
+ from functools import reduce
17
+ from typing import (
18
+ Generic,
19
+ Protocol,
20
+ Union,
21
+ Any,
22
+ List,
23
+ Type,
24
+ TypeVar,
25
+ overload,
26
+ runtime_checkable,
27
+ get_origin,
28
+ )
29
+ from types import FunctionType
30
+ from dataclasses import dataclass
31
+ from abc import ABC, abstractmethod
32
+
33
+ from .common import *
34
+ from .ast_helpers import const_expr
35
+ from ._mlir_helpers import arith as arith_helper, lru_cache_ir
36
+ from ._mlir_helpers.arith import ArithValue
37
+
38
+ from .._mlir import ir
39
+ from .._mlir.extras import types as T
40
+ from .._mlir.dialects import arith, math
41
+
42
+ # =============================================================================
43
+ # Dynamic Expression Protocol
44
+ # =============================================================================
45
+
46
+
47
@runtime_checkable
class DynamicExpression(Protocol):
    """Protocol for objects that hold dynamic (MLIR-backed) values in the DSL.

    Implementers can be passed to and returned from JIT-compiled functions
    and participate in traced control flow (if-else, while-loop, etc.). The
    DSL uses the two methods below to decompose an object into its MLIR
    values and to rebuild an equivalent object from traced MLIR values, e.g.
    when crossing a JIT function-call boundary.

    Example implementation:

    .. code-block:: python

        class CustomData(metaclass=DslType):
            def __init__(self, int_value):
                self.int_value = int_value

            def __extract_mlir_values__(self):
                return [self.int_value]

            def __new_from_mlir_values__(self, values):
                return CustomData(values[0])

    When such an object is used inside a ``@jit`` function, the DSL extracts
    its MLIR values for the generated call and reconstructs the object from
    the callee's block arguments during tracing.
    """

    def __extract_mlir_values__(self):
        """Extract MLIR values from this object.

        :return: List of MLIR values representing this object's data
        :rtype: List[ir.Value]
        """
        raise NotImplementedError

    def __new_from_mlir_values__(self, values):
        """Create a new instance from MLIR values.

        :param values: List of MLIR values to construct the object from
        :type values: List[ir.Value]
        :return: New instance of the implementing class
        :rtype: Any
        """
        raise NotImplementedError
121
+
122
+
123
@runtime_checkable
class JitArgument(Protocol):
    """Protocol for objects that can be passed directly as JIT function arguments.

    The JIT compiler uses ``__get_mlir_types__`` to emit the MLIR function
    signature, ``__new_from_mlir_values__`` to rebuild the object from the
    function's block arguments while tracing the body (values defined in
    Python are not visible inside the generated IR), and ``__c_pointers__``
    to pass pointers to the underlying data when the compiled function is
    invoked from the Python runtime.

    Example:

    .. code-block:: python

        class CustomData:
            def __c_pointers__(self):
                return [ctypes.pointer(ctypes.c_int32(self.int_value)), ...]

            def __get_mlir_types__(self):
                return [ir.IntegerType.get(32), ...]

            def __new_from_mlir_values__(self, values):
                return CustomData(values[0], ...)

        @jit
        def foo(x: CustomData):
            a = x.int_value + 1

        foo(CustomData(1, ...))
    """

    def __c_pointers__(self):
        """
        Generate a list of ctypes pointers for the current object.

        :return: List of ctypes pointers
        :rtype: List[ctypes.c_void_p]
        """
        raise NotImplementedError

    def __get_mlir_types__(self):
        """
        Generate a list of MLIR types for the current object.

        :return: List of MLIR types
        :rtype: List[ir.Type]
        """
        raise NotImplementedError

    def __new_from_mlir_values__(self, values):
        """
        Create a new object from MLIR values.

        :param values: List of MLIR values
        :type values: List[ir.Value]
        :return: A new object that represents the given MLIR values
        :rtype: Any
        """
        raise NotImplementedError
223
+
224
+
225
def get_c_pointers(obj):
    """
    Given the `obj`, recursively go through it to extract all contained C pointers
    """
    if hasattr(obj, "__c_pointers__"):
        return obj.__c_pointers__()
    if isinstance(obj, set):
        # Sets have no stable iteration order, so argument order would be
        # unpredictable; reject them outright.
        raise DSLRuntimeError(
            "Sets are not supported in get_c_pointers to ensure order preservation",
            context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.",
            suggestion="Consider using a list or tuple instead",
        )
    if isinstance(obj, (tuple, list)):
        pointers = []
        for item in obj:
            pointers.extend(get_c_pointers(item))
        return pointers
    # Objects without __c_pointers__ contribute no runtime arguments.
    return []
240
+
241
+
242
def get_mlir_types(obj):
    """
    Recursively walk `obj` and collect every MLIR type it contains.

    Resolution order: an explicit ``__get_mlir_types__`` hook wins; otherwise
    ``__extract_mlir_values__`` provides values whose ``.type`` is taken; a
    bare ``ir.Value`` contributes its own type; tuples/lists are flattened;
    sets are rejected to preserve argument ordering.
    """
    if hasattr(obj, "__get_mlir_types__"):
        return obj.__get_mlir_types__()
    if hasattr(obj, "__extract_mlir_values__"):
        return [value.type for value in obj.__extract_mlir_values__()]
    if isinstance(obj, ir.Value):
        return [obj.type]
    if isinstance(obj, (tuple, list)):
        types = []
        for item in obj:
            types.extend(get_mlir_types(item))
        return types
    if isinstance(obj, set):
        raise DSLRuntimeError(
            "Sets are not supported in get_mlir_types to ensure order preservation",
            context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.",
            suggestion="Consider using a list or tuple instead",
        )
    return []
261
+
262
+
263
class DslType(type):
    """Metaclass shared by every type in the DSL.

    Records whether a type is abstract and exposes that flag through the
    ``is_abstract`` class property. Classes built with this metaclass are
    expected to implement the DSL value protocol:

    * ``__str__`` (classmethod): string representation of the type
    * ``__c_pointers__`` (optional): ctypes pointers used to invoke the JIT function
    * ``__get_mlir_types__``: MLIR types of the contained MLIR values
    * ``__extract_mlir_values__``: MLIR values contained in the instance
    * ``__new_from_mlir_values__``: rebuild an instance from MLIR values

    :param is_abstract: Whether this type is abstract, defaults to False
    :type is_abstract: bool, optional
    """

    _is_abstract: bool

    def __new__(cls, name, bases, attrs, is_abstract=False, **kwargs):
        # Extra class keywords are consumed here; type.__init__ tolerates them.
        created = super().__new__(cls, name, bases, attrs)
        created._is_abstract = is_abstract
        return created

    @property
    def is_abstract(cls):
        return cls._is_abstract
310
+
311
+
312
class NumericMeta(DslType):
    """Metaclass for numeric types providing width and numpy dtype information.

    :param width: Bit width of the numeric type, defaults to 8
    :type width: int
    :param np_dtype: Corresponding NumPy dtype
    :type np_dtype: numpy.dtype, optional
    :param mlir_type: Corresponding MLIR type
    :type mlir_type: Any, optional
    :param is_abstract: Whether the type is abstract, defaults to False
    :type is_abstract: bool, optional

    :ivar width: Bit width of the numeric type
    :type width: int
    :ivar _np_dtype: Corresponding NumPy dtype
    :type _np_dtype: Union[numpy.dtype, None]

    :property numpy_dtype: Returns the corresponding NumPy dtype
    :rtype numpy_dtype: numpy.dtype
    """

    width: int

    # Placeholder type
    _mlir_type = Any
    _np_dtype: Union[np.dtype, None]

    def __new__(
        cls,
        name,
        bases,
        attrs,
        width=8,
        np_dtype=None,
        mlir_type=None,
        is_abstract=False,
        **kwargs,
    ):
        # Default MLIR-value protocol implementations; a numeric wraps exactly
        # one MLIR value.
        def _extract_mlir_values(self):
            return [self.ir_value()]

        def _new_from_mlir_values(self, values: list) -> "Numeric":
            res_ty = type(self)
            return res_ty(values[0])

        new_attrs = {
            "__extract_mlir_values__": _extract_mlir_values,
            "__new_from_mlir_values__": _new_from_mlir_values,
        }
        # `new_attrs | attrs`: explicit definitions in the class body override
        # the generated defaults (right operand wins).
        new_cls = super().__new__(
            cls,
            name,
            bases,
            new_attrs | attrs,
            is_abstract=is_abstract,
            **kwargs,
        )

        if mlir_type is not None:
            # staticmethod so the builder is not bound as an instance method.
            new_cls._mlir_type = staticmethod(mlir_type)

        new_cls.width = width
        new_cls._np_dtype = np_dtype
        return new_cls

    @property
    def numpy_dtype(cls):
        return cls._np_dtype

    @property
    def is_integer(cls) -> bool: ...

    @property
    def is_float(cls) -> bool: ...

    def is_same_kind(cls, other: Type) -> bool:
        return cls.is_integer == other.is_integer or cls.is_float == other.is_float

    @staticmethod
    def from_python(value: Any) -> Type["Numeric"]:
        """
        Deduce the DSL type from a Python value.
        """
        # bool must be tested before int: bool is a subclass of int, so the
        # previous int-first ordering mapped True/False to Int32 instead of
        # Boolean (matching Numeric._from_python_value, which checks bool first).
        if isinstance(value, bool):
            return Boolean
        elif isinstance(value, int):
            return Int32
        elif isinstance(value, float):
            return Float32
        raise DSLRuntimeError(
            f"Could not deduce Type[Numeric] from python value: {value} :{type(value)}"
        )

    @property
    def mlir_type(cls):
        return cls._mlir_type()  # type: ignore
408
+
409
+
410
# Generic placeholder for a backend value handle (e.g. an MLIR ir.Value)
# used in DSL function signatures.
Value = TypeVar("Value")
411
+
412
+
413
def cast(obj: Union[bool, int, float, Value], type_: Type["Numeric"]) -> "Numeric":
    """Cast an object to the specified numeric type.

    A concrete target type performs the conversion via its constructor. An
    abstract target type cannot construct values, so the object is accepted
    unchanged when it is already an instance, and rejected otherwise.

    :param obj: Object to be cast
    :type obj: Union[bool, int, float, Value]
    :param type_: Target numeric type
    :type type_: Type[Numeric]
    :raises TypeError: If casting to an abstract type the object is not an
        instance of
    :return: Object cast to the target numeric type
    :rtype: Numeric

    Example::
        >>> x = cast(5, Int32)      # Cast integer to Int32
        >>> y = cast(3.14, Float32) # Cast float to Float32
    """
    if not type_.is_abstract:
        # Implicit cast based on using annotation type
        return type_(obj)
    if isinstance(obj, type_):
        # Abstract target but obj already satisfies it: pass through as is.
        return obj
    raise TypeError(
        f"can't cast {obj} to {type_}. Pass in concrete type instead, "
        "e.g. Int32, Float32, etc."
    )
440
+
441
+
442
# Option 1: use ir.Value as base
# class IntegerMeta(DslType, type(ir.Value)):
class IntegerMeta(NumericMeta):
    """Metaclass for integer types providing signedness information.

    :param width: Bit width of the integer type, defaults to 32
    :type width: int
    :param signed: Whether the integer type is signed, defaults to True
    :type signed: bool
    :param mlir_type: Corresponding MLIR type, defaults to None
    :type mlir_type: Any, optional

    :ivar signed: Whether the integer type is signed
    :vartype signed: bool
    """

    signed: bool

    def __new__(
        cls,
        name,
        bases,
        attrs,
        width=32,
        signed=True,
        mlir_type=None,
        is_abstract=False,
    ):
        # Select the numpy dtype matching width/signedness.
        # width 1 maps to numpy bool; 128-bit has no numpy counterpart.
        if width == 1:
            np_dtype = np.bool_
        elif width == 128:
            np_dtype = None
        elif signed:
            np_dtype = getattr(np, f"int{width}")
        else:
            np_dtype = getattr(np, f"uint{width}")

        def _c_pointers(self):
            # Box self.value in the matching ctypes scalar and hand back a
            # void* to it for invoking the JIT-compiled function.
            # NOTE(review): the ctypes scalar is a local; presumably the JIT
            # engine copies the pointed-to data during invoke — confirm.
            if width == 1:
                c_value = ctypes.c_bool(self.value)
            elif signed:
                c_value = getattr(ctypes, f"c_int{width}")(self.value)
            else:
                c_value = getattr(ctypes, f"c_uint{width}")(self.value)

            return [ctypes.cast(ctypes.pointer(c_value), ctypes.c_void_p)]

        # `attrs | new_attrs`: the generated __c_pointers__ wins over any
        # definition in the class body (right operand wins).
        new_attrs = {
            "__c_pointers__": _c_pointers,
        }
        new_cls = super().__new__(
            cls, name, bases, attrs | new_attrs, width, np_dtype, mlir_type, is_abstract
        )
        new_cls.signed = signed
        return new_cls

    def __str__(cls):
        return f"{cls.__name__}"

    @property
    def is_integer(cls) -> bool:
        return True

    @property
    def is_float(cls) -> bool:
        return False

    @property
    def zero(cls) -> int:
        return 0

    @property
    def min(cls) -> int:
        # Two's-complement lower bound for signed; 0 for unsigned.
        if cls.signed:
            return -(2 ** (cls.width - 1))
        else:
            return 0

    @property
    def max(cls) -> int:
        if cls.signed:
            return 2 ** (cls.width - 1) - 1
        else:
            return 2**cls.width - 1

    def recast_width(cls, width):
        # Map a bit width to the (signed) integer type of that width.
        type_map = {
            8: Int8,
            16: Int16,
            32: Int32,
            64: Int64,
            128: Int128,
        }
        if width not in type_map:
            raise TypeError(f"Unsupported width: {width}")
        return type_map[width]
540
+
541
+
542
class FloatMeta(NumericMeta):
    """Metaclass for floating-point types.

    This metaclass provides type system infrastructure for floating-point types in the DSL,
    handling MLIR type mappings and NumPy type conversions.

    :param width: Bit width of the float type, defaults to 32
    :type width: int
    :param mlir_type: Corresponding MLIR type, defaults to None
    :type mlir_type: Any, optional
    :param is_abstract: Whether this is an abstract base class, defaults to False
    :type is_abstract: bool, optional
    """

    _exponent_width: int
    _mantissa_width: int

    def __new__(cls, name, bases, attrs, width=32, mlir_type=None, is_abstract=False):
        # Look up the numpy dtype by lowercased class name (Float32 -> np.float32);
        # narrow-precision names without a numpy counterpart resolve to None.
        np_dtype = getattr(np, name.lower(), None)
        new_cls = super().__new__(
            cls, name, bases, attrs, width, np_dtype, mlir_type, is_abstract
        )
        # Extract exponent and mantissa bits from class name if it follows Float<E><M> pattern
        # For example: Float8E4M3 -> exponent_width=4, mantissa_width=3
        import re

        if not is_abstract:
            match = re.match(r"Float(\d+)E(\d+)M(\d+)(?:.*)", name)
            if match:
                # group(1) is the total width, already passed in explicitly.
                exp_bits = int(match.group(2))
                mant_bits = int(match.group(3))

                # Store extracted values as class attributes
                new_cls._exponent_width = exp_bits
                new_cls._mantissa_width = mant_bits
        # Don't have 1-to-1 mapping of narrow precision types like bfloat16, tfloat32, etc.
        return new_cls

    def __str__(cls):
        return f"{cls.__name__}"

    @property
    def is_integer(cls) -> bool:
        return False

    @property
    def is_float(cls) -> bool:
        return True

    @property
    def zero(cls) -> float:
        return 0.0

    @property
    def inf(cls) -> float:
        return float("inf")

    @property
    def nan(cls) -> float:
        return float("nan")

    @property
    def exponent_width(cls) -> int:
        # Only set for non-abstract Float<W>E<E>M<M>-named types; accessing it
        # on other float types raises AttributeError.
        return cls._exponent_width

    @property
    def mantissa_width(cls) -> int:
        return cls._mantissa_width

    def recast_width(cls, width):
        # Map a bit width to the standard IEEE float type of that width.
        type_map = {
            16: Float16,
            32: Float32,
            64: Float64,
        }
        if width not in type_map:
            raise TypeError(f"Unsupported width: {width}")
        return type_map[width]
623
+
624
+
625
def _arith_signless_to_int(a, target_type):
    """Resize a signless MLIR integer value to the width of `target_type`.

    Widening sign-extends for signed targets (except from i1, which is always
    zero-extended), zero-extends otherwise; narrowing truncates; equal widths
    return the value unchanged.
    """
    src_width = a.type.width
    if target_type.width > src_width:
        # arith dialect consider `1` in `i1` as `-1`, treat it as unsigned for DSL
        use_sign_extend = target_type.signed and src_width > 1
        if use_sign_extend:
            return arith.extsi(target_type.mlir_type, a)
        return arith.extui(target_type.mlir_type, a)
    if target_type.width < src_width:
        return arith.trunci(target_type.mlir_type, a)
    return a
637
+
638
+
639
def _binary_op_type_promote(a, b, promote_bool: bool = False):
    """Promote two numeric operands following type promotion rules.

    :param a: First numeric operand
    :type a: Numeric
    :param b: Second numeric operand
    :type b: Numeric
    :param promote_bool: Whether to promote boolean types to Int32 for arithmetic operations, defaults to False
    :type promote_bool: bool, optional
    :raises ValueError: If implicit float promotion is not supported between the given types
    :return: Tuple containing promoted operands and their resulting type
    :rtype: tuple[Numeric, Numeric, Type[Numeric]]

    Type promotion rules:
    1. If operands are same type and not bools needing promotion:
       - No promotion needed, return original types
    2. If either operand is float:
       a. If one is float and one is int:
          - Convert int to the float type
       b. If both are float:
          - Promote to higher precision float if width >= 16
          - For same width, promote to more general type (Float32 over TFloat32)
          - Otherwise raise ValueError for unsupported promotion
    3. Otherwise, both operands are integers. Integer promotion rules:
       a. If promote_bool is True and either operand is bool:
          - Promote bool to Int32 for arithmetic operations

    Exceptions for numpy dtype casting:
    - array(dtype=np.bool_) + array(dtype=np.bool_) -> array(dtype=np.bool_)

    What is not supported:
    - promotion with narrow precision float types which requires explicit cast by user
    """
    a_type = a.dtype
    b_type = b.dtype

    # Early return for same types (except when they're bools that need promotion)
    if a_type == b_type and not (promote_bool and a_type is Boolean):
        return a, b, a_type

    # Handle floating point promotions
    if a_type.is_float or b_type.is_float:
        # Capture widths BEFORE any int->float recast below; the width
        # comparisons that follow deliberately use these original values.
        a_width = getattr(a_type, "width", 0)
        b_width = getattr(b_type, "width", 0)

        # If one type is integer, convert it to the float type
        if a_type.is_float and not b_type.is_float:
            b_type = a_type.recast_width(max(a_width, b_width))
        elif b_type.is_float and not a_type.is_float:
            a_type = b_type.recast_width(max(a_width, b_width))

        # Both are float types - handle precision promotion
        if a_width > b_width and a_width >= 16:
            res_type = a_type
        elif b_width > a_width and b_width >= 16:
            res_type = b_type
        elif a_width == b_width:
            # Same bitwidth - handle special cases like TFloat32 -> Float32 and BFloat16 -> Float16
            if a_type is Float64 or b_type is Float64:
                res_type = Float64
            elif a_type is Float32 or b_type is Float32:
                res_type = Float32
            elif a_type is Float16 or b_type is Float16:
                res_type = Float16
            else:
                raise ValueError(
                    f"implicit float promotion of {a_type} or {b_type} is not supported, cast explicitly"
                )
        else:
            # Wider operand is a sub-16-bit float: require an explicit cast.
            raise ValueError(
                f"implicit float promotion of {a_type} or {b_type} is not supported, cast explicitly"
            )

        # Only convert if type is different
        new_a = a.to(res_type) if a.dtype != res_type else a
        new_b = b.to(res_type) if b.dtype != res_type else b
        return new_a, new_b, res_type

    # Handle bool promotion for arithmetic operations
    if promote_bool:
        if a_type is Boolean and b_type is Boolean:
            # Only promote to Int32 when both are bool
            a = a.to(Int32)
            b = b.to(Int32)
            a_type = b_type = a.dtype

        # If both were bools, they're now same type (Int32)
        if a_type == b_type:
            return a, b, a_type

    # Same type, no promotion needed
    if a_type == b_type:
        return a, b, a_type

    a_signed = a_type.signed
    b_signed = b_type.signed
    a_width = a_type.width
    b_width = b_type.width

    # Mixed signedness case
    if a_signed != b_signed:
        unsigned_type = a_type if not a_signed else b_type
        signed_type = a_type if a_signed else b_type
        unsigned_width = a_width if not a_signed else b_width

        if unsigned_width >= signed_type.width:
            # Promote both to unsigned of larger width
            res_type = unsigned_type
        else:
            # Promote both to signed of larger width
            res_type = signed_type

        new_a = a.to(res_type) if a.dtype != res_type else a
        new_b = b.to(res_type) if b.dtype != res_type else b
        return new_a, new_b, res_type

    # Same signedness, different width - promote to larger width
    if a_width >= b_width:
        return a, b.to(a.dtype), a.dtype
    else:
        return a.to(b.dtype), b, b.dtype
761
+
762
+
763
def _binary_op(op, promote_operand=True, promote_bool=False, flip=False):
    """Wrapper for binary operations on Numeric types.

    This wrapper handles type promotion, operation execution, and result type determination
    for binary operations between Numeric types.

    :param op: The binary operation to perform (e.g., operator.add, operator.sub)
    :type op: callable
    :param promote_operand: Whether to promote operands to the same type, defaults to True
    :type promote_operand: bool, optional
    :param promote_bool: Whether to promote boolean results to Boolean type, defaults to False
    :type promote_bool: bool, optional
    :param flip: Whether to flip the operands when calling the operation (used
        for reflected operators such as ``__rsub__``), defaults to False
    :type flip: bool, optional

    :raises TypeError: When an unsupported operation is attempted on specific numeric types

    .. note::
        Not all operations are supported for all numeric types. In particular:

        - Subtraction is not fully supported for Integer types
        - Multiplication, floor division, and modulo operations may have limited support
        - Division (truediv) with integer types is not fully supported and converts to Float32
    """

    def wrapper(lhs, rhs, *, loc=None, ip=None):
        orig_lhs_type = type(lhs)
        orig_rhs_type = type(rhs)

        # When called directly with self and other
        ty = type(lhs)
        # Canonicalize to Numeric type for promotion
        if not isinstance(rhs, Numeric):
            if not isinstance(rhs, (ArithValue, int, float, bool)):
                # This allows rhs class to implement __rmul__
                return NotImplemented

            if isinstance(rhs, ArithValue):
                if isinstance(rhs.type, ir.VectorType):
                    # Vector operands are handled by the vector side's
                    # reflected operator, not by scalar Numeric.
                    return NotImplemented

            rhs = as_numeric(rhs)

        # default result type to left-hand-side
        res_type = ty

        if promote_operand:
            lhs, rhs, res_type = _binary_op_type_promote(lhs, rhs, promote_bool)
        else:
            rhs = ty(rhs)

        # Result-type overrides: comparisons always yield Boolean; integer
        # true-division yields Float32; bool op bool stays Boolean.
        if op in (
            operator.lt,
            operator.le,
            operator.gt,
            operator.ge,
            operator.eq,
            operator.ne,
        ):
            res_type = Boolean
        elif op == operator.truediv and isinstance(lhs, Integer):
            res_type = Float32
        elif promote_bool and orig_lhs_type == Boolean and orig_rhs_type == Boolean:
            res_type = Boolean

        # Tag dynamic integer MLIR values with their signedness so the arith
        # helpers emit the correct (signed/unsigned) op.
        if isinstance(lhs.value, ArithValue) and isinstance(lhs, Integer):
            lhs_val = lhs.value.with_signedness(lhs.signed)
        else:
            lhs_val = lhs.value

        if isinstance(rhs.value, ArithValue) and isinstance(rhs, Integer):
            rhs_val = rhs.value.with_signedness(rhs.signed)
        else:
            rhs_val = rhs.value

        if flip:
            lhs_val, rhs_val = rhs_val, lhs_val

        # Check if the operation is supported by the operands
        res_val = op(lhs_val, rhs_val)
        return res_type(res_val, loc=loc, ip=ip)

    return wrapper
848
+
849
+
850
+ class Numeric(metaclass=NumericMeta, is_abstract=True):
851
+ """Base class for all numeric types in the DSL.
852
+
853
+ This class provides the foundation for both Integer and Float types,
854
+ implementing basic arithmetic operations.
855
+
856
+ :param value: The value to store in the numeric type
857
+ :type value: Union[bool, int, float, Value]
858
+
859
+ :ivar value: The stored numeric value
860
+ :vartype value: Union[bool, int, float, Value]
861
+ """
862
+
863
+ def __init__(self, value: Union[bool, int, float, Value], *, loc=None, ip=None):
864
+ self.value = value
865
+
866
+ def __str__(self) -> str:
867
+ # Use member's pretty-str method if member object has method.
868
+ # This can be extended in future to have better support for IDE, jupyter notebook, etc.
869
+ pretty_str = getattr(self.value, "pretty_str", None)
870
+ if pretty_str is not None:
871
+ return pretty_str()
872
+ else:
873
+ return "?"
874
+
875
+ def __repr__(self) -> str:
876
+ return f"{self.__class__.__name__}({repr(self.value)})"
877
+
878
+ def __hash__(self):
879
+ return hash(type(self).__class__) ^ hash(self.value)
880
+
881
+ @property
882
+ def dtype(self) -> Type["Numeric"]:
883
+ return type(self)
884
+
885
+ @overload
886
+ def to(self, dtype: Type["Numeric"], *, loc=None, ip=None) -> "Numeric": ...
887
+
888
+ @overload
889
+ def to(self, dtype: Type[int], *, loc=None, ip=None) -> int: ...
890
+
891
+ @overload
892
+ def to(self, dtype: Type[float], *, loc=None, ip=None) -> float: ...
893
+
894
+ @overload
895
+ def to(self, dtype: Type[bool], *, loc=None, ip=None) -> bool: ...
896
+
897
+ @overload
898
+ def to(self, dtype: Type[ir.Value], *, loc=None, ip=None) -> ir.Value: ...
899
+
900
+ def to(self, dtype: Type, *, loc=None, ip=None):
901
+ """Convert this numeric value to another numeric type.
902
+
903
+ If the target type is the same as the current type, returns self.
904
+ Otherwise, creates a new instance of the target type with the same value.
905
+
906
+ :param dtype: The target numeric type to convert to
907
+ :type dtype: Union[Type["Numeric"], Type[int], Type[float], Type[bool]]
908
+ :return: A new instance of the target type, or self if types match
909
+ :rtype: Numeric
910
+ :raises TypeError: If trying to convert an MLIR value to a static Python type
911
+ :raises TypeError: If trying to convert to unsupported float types like Float8E4M3,
912
+ Float8E4M3B11FNUZ, Float4E2M1FN, Float6E3M2FN, or Float6E2M3FN
913
+
914
+ .. note::
915
+
916
+ Unsupported destination float types:
917
+ - Float8E4M3
918
+ - Float8E4M3B11FNUZ
919
+ - Float4E2M1FN
920
+ - Float6E3M2FN
921
+ - Float6E2M3FN
922
+
923
+ Example::
924
+
925
+ .. code-block:: python
926
+
927
+ # Convert between DSL numeric types
928
+ x = Int32(5)
929
+ y = x.to(Float32) # Converts to Float32(5.0)
930
+
931
+ # Convert to Python primitive types
932
+ # They are considered as static values at JIT time
933
+ z = x.to(int) # Returns Python int 5
934
+ w = y.to(float) # Returns Python float 5.0
935
+
936
+ # This will raise a ValueError
937
+ mlir_val = arith.constant(T.i32(), 42)
938
+ num = Int32(mlir_val)
939
+ num.to(int) # ValueError: unable to convert MLIR value to static type: <class 'int'>
940
+ """
941
+ if dtype in _unsupported_dst_float_types:
942
+ raise TypeError(f"Unsupported destination float type: {dtype}")
943
+
944
+ if isinstance(dtype, type(self)):
945
+ return self
946
+ elif isinstance(dtype, NumericMeta):
947
+ return dtype(self)
948
+ elif dtype is ir.Value:
949
+ if isinstance(self.value, (int, float, bool)):
950
+ res = arith_helper.const(
951
+ self.value, self.dtype.mlir_type, loc=loc, ip=ip
952
+ )
953
+ elif isinstance(self.value, ir.Value):
954
+ res = self.value
955
+ else:
956
+ raise ValueError(
957
+ f"cannot convert {type(self)} to {dtype}, "
958
+ f"self.value is {self.value.type}"
959
+ )
960
+
961
+ if not isinstance(res, ArithValue):
962
+ raise ValueError(f"Expected ArithValue, got {type(res)} as {res.type}")
963
+
964
+ return res.with_signedness(getattr(type(self), "signed", None))
965
+ elif dtype in (int, float, bool):
966
+ if isinstance(self.value, ir.Value):
967
+ raise ValueError(
968
+ f"unable to convert {self.value} to static type: {dtype}"
969
+ )
970
+ return dtype(self.value)
971
+ else:
972
+ raise ValueError(f"unable to convert {type(self)} to {dtype}")
973
+
974
+ def ir_value(self, *, loc=None, ip=None) -> ir.Value:
975
+ return self.to(ir.Value, loc=loc, ip=ip)
976
+
977
    @property
    def zero(self) -> "Numeric": ...
    # Stub: concrete types expose `zero` through their metaclass property.

    def __dsl_not__(self, *, loc=None, ip=None):
        """DSL implementation of Python's `not` operator.

        Returns True if the value is equal to zero, False otherwise.
        This matches Python's behavior where any non-zero number is considered True.

        :param loc: The source location information, defaults to None
        :type loc: Optional[Location]
        :param ip: The insertion point for the operation, defaults to None
        :type ip: Optional[InsertionPoint]
        :return: The result of the logical not operation
        :rtype: Boolean
        """
        if isinstance(self.value, (int, float, bool)):
            # Static value: fold at trace time.
            return not self.value
        else:
            # Dynamic value: emit `self == 0` in IR.
            ty = type(self)
            zero_val = arith.constant(ty.mlir_type, ty.zero)
            return self.__eq__(ty(zero_val), loc=loc, ip=ip)
999
+
1000
    def __dsl_and__(self, other, *, loc=None, ip=None):
        """DSL implementation of Python's `and` operator.

        Returns the second operand if the first is truthy, otherwise returns the first operand.
        A numeric value is considered truthy if it is non-zero.

        :param other: The right-hand operand
        :type other: Numeric
        :param loc: The source location information, defaults to None
        :type loc: Optional[Location]
        :param ip: The insertion point for the operation, defaults to None
        :type ip: Optional[InsertionPoint]
        :return: The result of the logical and operation
        :rtype: Boolean

        Example::

            5 and 3 -> 3
            0 and 3 -> 0
            3 and 0 and ... -> 0
        """
        # Evaluate truthiness of the left operand once, up front.
        is_true = self.__dsl_bool__(loc=loc, ip=ip)

        def and_op(lhs, rhs):
            # Static x static folds in Python; otherwise materialize the
            # static side as a constant and select(lhs_truthy, rhs, lhs),
            # matching Python's short-circuit result.
            if isinstance(lhs, (int, float, bool)):
                if isinstance(rhs, (int, float, bool)):
                    return lhs and rhs
                else:
                    lhs = arith.constant(rhs.type, lhs)
                    return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip)
            else:
                if isinstance(rhs, (int, float, bool)):
                    rhs = arith.constant(lhs.type, rhs)
                    return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip)
                else:
                    return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip)

        return _binary_op(and_op, promote_bool=True)(self, other, loc=loc, ip=ip)
1038
+
1039
    def __dsl_or__(self, other, *, loc=None, ip=None):
        """DSL implementation of Python's `or` operator.

        Returns the first operand if it is truthy, otherwise returns the second operand.
        A numeric value is considered truthy if it is non-zero.

        :param other: The right-hand operand
        :type other: Numeric
        :param loc: The source location information, defaults to None
        :type loc: Optional[Location]
        :param ip: The insertion point for the operation, defaults to None
        :type ip: Optional[InsertionPoint]
        :return: The result of the logical or operation
        :rtype: Boolean

        Example::

            5 or 3 -> 5
            0 or 3 -> 3
            3 or 0 -> 3
        """
        # Evaluate truthiness of the left operand once, up front.
        is_true = self.__dsl_bool__(loc=loc, ip=ip)

        def or_op(lhs, rhs):
            # Static x static folds in Python; otherwise materialize the
            # static side as a constant and select(lhs_truthy, lhs, rhs) —
            # note the operand order is the mirror of `and`.
            if isinstance(lhs, (int, float, bool)):
                if isinstance(rhs, (int, float, bool)):
                    return lhs or rhs
                else:
                    lhs = arith.constant(rhs.type, lhs)
                    return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip)
            else:
                if isinstance(rhs, (int, float, bool)):
                    rhs = arith.constant(lhs.type, rhs)
                    return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip)
                else:
                    return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip)

        return _binary_op(or_op, promote_bool=True)(self, other, loc=loc, ip=ip)
1077
+
1078
+ def __dsl_bool__(self, *, loc=None, ip=None) -> "Boolean":
1079
+ """DSL implementation of Python's __bool__ method.
1080
+
1081
+ Returns a Boolean indicating whether this value is considered truthy.
1082
+ For numeric types, returns True if the value is non-zero.
1083
+
1084
+ :param loc: The source location information, defaults to None
1085
+ :type loc: Optional[Location]
1086
+ :param ip: The insertion point for the operation, defaults to None
1087
+ :type ip: Optional[InsertionPoint]
1088
+ :return: True if this value is truthy (non-zero), False otherwise
1089
+ :rtype: Boolean
1090
+ """
1091
+ zero = type(self).zero
1092
+ return self.__ne__(zero, loc=loc, ip=ip)
1093
+
1094
+ def __bool__(self):
1095
+ if isinstance(self.value, (int, float, bool)):
1096
+ return bool(self.value)
1097
+ else:
1098
+ raise DSLRuntimeError(
1099
+ f"Unable to convert dynamic `{type(self).__name__}` value to bool at compile time.",
1100
+ suggestion=[
1101
+ "Decorate the parent function with `jit` decorator and with `preprocess` enabled.",
1102
+ "Ensure not using patterns that DSL does not support.",
1103
+ "Otherwise, please file a bug report.",
1104
+ ],
1105
+ )
1106
+
1107
+ def __index__(self):
1108
+ if isinstance(self.value, (int, float, bool)):
1109
+ return self.value
1110
+ else:
1111
+ raise DSLRuntimeError(
1112
+ f"'{type(self.value)}' object cannot be interpreted as an integer",
1113
+ suggestion="Mark the loop as dynamic with `dynamic_expr` or `range_dynamic` and decorate the parent function with `jit` decorator",
1114
+ )
1115
+
1116
+ def __neg__(self, *, loc=None, ip=None):
1117
+ if isinstance(self, (bool, int, float)):
1118
+ return type(self)(-self.value) # type: ignore
1119
+ else:
1120
+ return type(self)(-self.value, loc=loc, ip=ip) # type: ignore
1121
+
1122
+ @staticmethod
1123
+ def _from_python_value(value):
1124
+ if isinstance(value, Numeric):
1125
+ return value
1126
+
1127
+ if isinstance(value, bool):
1128
+ res_type = Boolean
1129
+ elif isinstance(value, int):
1130
+ res_type = Int32
1131
+ elif isinstance(value, float):
1132
+ res_type = Float32
1133
+ elif isinstance(value, ArithValue):
1134
+ res_type = Numeric.from_mlir_type(value.type)
1135
+ else:
1136
+ raise ValueError(
1137
+ f"unable to convert {value} in type {type(value)} to Numeric"
1138
+ )
1139
+ return res_type(value)
1140
+
1141
    # -- Arithmetic operators ------------------------------------------------
    # All arithmetic delegates to `_binary_op` (defined elsewhere), which
    # handles operand conversion/promotion.  `promote_bool=True` presumably
    # promotes Boolean operands to an integer type before arithmetic, and the
    # reflected variants pass `flip=True` so the operation is applied as
    # `other <op> self` — confirm against `_binary_op`'s definition.

    def __add__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.add, promote_bool=True)(self, other, loc=loc, ip=ip)

    def __sub__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.sub, promote_bool=True)(self, other, loc=loc, ip=ip)

    def __mul__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.mul, promote_bool=True)(self, other, loc=loc, ip=ip)

    def __floordiv__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.floordiv, promote_bool=True)(
            self, other, loc=loc, ip=ip
        )

    def __truediv__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.truediv, promote_bool=True)(
            self, other, loc=loc, ip=ip
        )

    def __mod__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.mod, promote_bool=True)(self, other, loc=loc, ip=ip)

    def __radd__(self, other, *, loc=None, ip=None) -> "Numeric":
        # Addition is commutative: reuse __add__ directly.
        return self.__add__(other, loc=loc, ip=ip)

    def __rsub__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.sub, promote_bool=True, flip=True)(
            self, other, loc=loc, ip=ip
        )

    def __rmul__(self, other, *, loc=None, ip=None) -> "Numeric":
        # Multiplication is commutative: reuse __mul__ directly.
        return self.__mul__(other, loc=loc, ip=ip)

    def __rfloordiv__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.floordiv, promote_bool=True, flip=True)(
            self, other, loc=loc, ip=ip
        )

    def __rtruediv__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.truediv, promote_bool=True, flip=True)(
            self, other, loc=loc, ip=ip
        )

    def __rmod__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.mod, promote_bool=True, flip=True)(
            self, other, loc=loc, ip=ip
        )
1188
+
1189
    # -- Comparison operators and power ---------------------------------------
    # Comparisons also go through `_binary_op` (no bool promotion); their
    # annotated result type is Boolean.

    def __eq__(self, other, *, loc=None, ip=None) -> "Boolean":
        return _binary_op(operator.eq)(self, other, loc=loc, ip=ip)  # type: ignore

    def __ne__(self, other, *, loc=None, ip=None) -> "Boolean":
        return _binary_op(operator.ne)(self, other, loc=loc, ip=ip)  # type: ignore

    def __lt__(self, other, *, loc=None, ip=None) -> "Boolean":
        return _binary_op(operator.lt)(self, other, loc=loc, ip=ip)  # type: ignore

    def __le__(self, other, *, loc=None, ip=None) -> "Boolean":
        return _binary_op(operator.le)(self, other, loc=loc, ip=ip)  # type: ignore

    def __gt__(self, other, *, loc=None, ip=None) -> "Boolean":
        return _binary_op(operator.gt)(self, other, loc=loc, ip=ip)  # type: ignore

    def __ge__(self, other, *, loc=None, ip=None) -> "Boolean":
        return _binary_op(operator.ge)(self, other, loc=loc, ip=ip)  # type: ignore

    def __pow__(self, other, *, loc=None, ip=None) -> "Numeric":
        return _binary_op(operator.pow)(self, other, loc=loc, ip=ip)  # type: ignore
1209
+
1210
+ def __c_pointers__(self):
1211
+ raise ValueError(
1212
+ f"only support built-in types: bool, (u)int{8, 16, 32, 64}, float{32, 64}, but got {type(self)}"
1213
+ )
1214
+
1215
    def __get_mlir_types__(self):
        """Return the MLIR type(s) this value contributes as JIT arguments."""
        return [type(self).mlir_type]
1217
+
1218
    @staticmethod
    def from_mlir_type(mlir_type):
        """Map an MLIR scalar type to the corresponding Numeric subclass.

        Both signless (``T.i32``) and signed (``T.si32``) MLIR integers map to
        the signed DSL types; only explicitly-unsigned (``T.ui32``) map to the
        unsigned ones.  The table is rebuilt on every call — presumably
        because the ``T.*`` constructors need a live MLIR context; confirm
        before attempting to cache it.

        :raises DSLRuntimeError: if ``mlir_type`` has no DSL equivalent.
        """
        type_map = {
            T.bool(): Boolean,
            T.f64(): Float64,
            T.f32(): Float32,
            T.tf32(): TFloat32,
            T.f16(): Float16,
            T.bf16(): BFloat16,
            T.i(128): Int128,
            T.i64(): Int64,
            T.i32(): Int32,
            T.i16(): Int16,
            T.i8(): Int8,
            T.si(128): Int128,
            T.si64(): Int64,
            T.si32(): Int32,
            T.si16(): Int16,
            T.si8(): Int8,
            T.ui(128): Uint128,
            T.ui64(): Uint64,
            T.ui32(): Uint32,
            T.ui16(): Uint16,
            T.ui8(): Uint8,
            T.f8E5M2(): Float8E5M2,
            T.f8E4M3(): Float8E4M3,
            T.f8E4M3FN(): Float8E4M3FN,
            T.f8E4M3B11FNUZ(): Float8E4M3B11FNUZ,
            T.f4E2M1FN(): Float4E2M1FN,
            T.f6E2M3FN(): Float6E2M3FN,
            T.f6E3M2FN(): Float6E3M2FN,
            T.f8E8M0FNU(): Float8E8M0FNU,
        }
        if mlir_type not in type_map:
            raise DSLRuntimeError(f"Unsupported DSL type: {mlir_type}")
        return type_map[mlir_type]
1254
+
1255
+
1256
def as_numeric(obj: Union[bool, int, float, ir.Value, Numeric]) -> Numeric:
    """Coerce a Python primitive (or MLIR value) into a :class:`Numeric`.

    Inputs that are already ``Numeric`` are returned untouched; everything
    else is routed through ``Numeric._from_python_value``.

    :param obj: value to convert
    :type obj: Union[bool, int, float]
    :return: the converted Numeric object
    :rtype: Numeric

    Example::

    .. code-block:: python

        x = as_numeric(5)      # Converts to Int32
        y = as_numeric(3.14)   # Converts to Float32
        z = as_numeric(True)   # Converts to Boolean
    """
    return obj if isinstance(obj, Numeric) else Numeric._from_python_value(obj)
1275
+
1276
+
1277
class Integer(Numeric, metaclass=IntegerMeta, mlir_type=T.i32, is_abstract=True):
    """A class representing integer values with specific width and signedness.

    Supports conversion from Python scalars, MLIR Values, and other numeric
    types.

    :param x: The input value to convert to this integer type
    :type x: Union[bool, int, float, ir.Value, Integer, Float]
    :return: A new Integer instance with the converted value
    :rtype: Integer

    :raises AssertionError: If the type's numpy_dtype is None
    :raises ValueError: If the input type is not supported, or for float NaN
    :raises OverflowError: If converting float infinity to integer

    Type conversion behavior:

    * Python scalars (bool, int, float): converted through numpy dtype
      casting; NaN and infinity are rejected.  Overflow wraps C-style
      (e.g. ``Int8(256)``).
    * MLIR Value with IntegerType: width mismatches are handled by
      signless -> signed/unsigned conversion.
    * MLIR Value with FloatType: uses MLIR float-to-int conversion; NaN and
      infinity are undefined behavior.
    * Integer: MLIR int-to-int conversion (dynamic values) or numpy dtype
      casting (host values).
    * Float: converted via the wrapped value (MLIR float-to-int or Python).

    Example usage:

    .. code-block:: python

        x = Int32(5)       # From integer
        y = Int32(True)    # From boolean
        z = Int32(3.7)     # From float (truncates)
        w = Int32(x)       # From same Integer type
        c5 = arith.constant(5, T.i32())
        a = Int32(c5)      # Treat c5 as int32 bitwise
    """

    def __init__(self, x, *, loc=None, ip=None):
        ty = type(self)

        if isinstance(x, (bool, int, float)):
            # Reject float values with no integer counterpart before handing
            # off to numpy (whose handling of them is dtype-dependent).
            if isinstance(x, float):
                if np.isnan(x):
                    raise ValueError("Cannot convert float NaN to integer")
                elif np.isinf(x):
                    raise OverflowError("Cannot convert float infinity to integer")

            np_dtype = ty.numpy_dtype
            assert np_dtype is not None, f"expects numpy.dtype, but got {np_dtype}"
            # numpy casting gives C-style wraparound on overflow.
            x_val = int(np.array(x).astype(np_dtype))
        elif type(x) == ty:
            # Same concrete type: reuse the wrapped value as-is.
            x_val = x.value
        elif isinstance(x, ir.Value):  # type: ignore
            x_val = x
            if isinstance(x.type, ir.IntegerType):  # type: ignore
                if x.type.width != ty.width:
                    # signless -> (u)int
                    x_val = _arith_signless_to_int(x, ty)
            elif isinstance(x.type, ir.FloatType):  # type: ignore
                # float -> (u)int
                x_val = arith_helper.fptoi(x, ty.signed, ty.mlir_type, loc=loc, ip=ip)
        elif isinstance(x, Integer):
            if isinstance(x.value, ir.Value):
                # Dynamic value: emit an MLIR int-to-int conversion.
                x_val = arith_helper.int_to_int(x.ir_value(), ty)
            else:
                # For non-MLIR (host) values, use numpy casting.
                src_val = np.array(x.value, dtype=type(x).numpy_dtype)
                x_val = int(src_val.astype(ty.numpy_dtype))
        elif isinstance(x, Float):
            # float -> int is handled by Integer.__init__ recursively on the
            # wrapped value (a Python float or an MLIR float Value).
            Integer.__init__(self, x.value)
            return
        else:
            raise DSLRuntimeError(f"{x} to integer conversion is not supported")

        super().__init__(x_val)

    def __invert__(self, *, loc=None, ip=None):
        """Bitwise NOT: materialize as IR and invert, preserving the type."""
        res_type = type(self)
        return res_type(self.ir_value(loc=loc, ip=ip).__invert__(loc=loc, ip=ip))

    def __lshift__(self, other, *, loc=None, ip=None):
        return _binary_op(operator.lshift)(self, other, loc=loc, ip=ip)

    def __rlshift__(self, other, *, loc=None, ip=None):
        # `other << self`: only integer left-hand operands make sense.
        other_ = as_numeric(other)
        if not isinstance(other_, Integer):
            raise ValueError(f"Cannot left shift {other_} with {self}")
        return other_.__lshift__(self, loc=loc, ip=ip)

    def __rshift__(self, other, *, loc=None, ip=None):
        return _binary_op(operator.rshift)(self, other, loc=loc, ip=ip)

    def __rrshift__(self, other, *, loc=None, ip=None):
        # `other >> self`: only integer left-hand operands make sense.
        other_ = as_numeric(other)
        if not isinstance(other_, Integer):
            raise ValueError(f"Cannot right shift {other_} with {self}")
        return other_.__rshift__(self, loc=loc, ip=ip)

    def __and__(self, other, *, loc=None, ip=None):
        return _binary_op(operator.and_)(self, other, loc=loc, ip=ip)

    def __rand__(self, other, *, loc=None, ip=None):
        # Bitwise AND is commutative.
        return self.__and__(other, loc=loc, ip=ip)

    def __or__(self, other, *, loc=None, ip=None):
        return _binary_op(operator.or_)(self, other, loc=loc, ip=ip)

    def __ror__(self, other, *, loc=None, ip=None):
        # Bitwise OR is commutative.
        return self.__or__(other, loc=loc, ip=ip)

    def __xor__(self, other, *, loc=None, ip=None):
        return _binary_op(operator.xor)(self, other, loc=loc, ip=ip)

    def __rxor__(self, other, *, loc=None, ip=None):
        # Bitwise XOR is commutative.
        return self.__xor__(other, loc=loc, ip=ip)
1412
+
1413
class Float(Numeric, metaclass=FloatMeta, mlir_type=T.f32, is_abstract=True):
    """A class representing floating-point values.

    :param x: The input value to convert to this float type.
    :type x: Union[bool, int, float, ir.Value, Integer, Float]

    Type conversion behavior:

    1. Python scalars (bool, int, float): stored as plain Python floats.
    2. MLIR Value with FloatType: converted between float types when they
       differ (e.g. f16 -> f32).
    3. MLIR Value with IntegerType: not supported, raises.
    4. Integer: MLIR int-to-float (dynamic values) or Python ``float()``
       (host values).
    5. Float: converted via the wrapped value.

    .. note::
        The narrow precision types (Float8E5M2, Float8E4M3, Float8E4M3FN,
        Float8E8M0FNU, Float8E4M3B11FNUZ, Float6E3M2FN, Float6E2M3FN,
        Float4E2M1FN) are only supported in device code.

    :raises DSLRuntimeError: If conversion from the input type is not supported
    """

    def __init__(self, x, *, loc=None, ip=None):
        ty = type(self)

        if isinstance(x, (bool, int, float)):  # type: ignore
            # Host scalars are stored as plain Python floats; no numpy
            # round-trip through the target dtype is performed here.
            super().__init__(float(x))
        elif isinstance(x, ir.Value):  # type: ignore
            if isinstance(x.type, ir.IntegerType):  # type: ignore
                raise DSLRuntimeError("signless to float conversion is not implemented")
            elif isinstance(x.type, ir.FloatType):  # type: ignore
                if x.type != ty.mlir_type:
                    # Convert between MLIR float types (extend/truncate).
                    x = arith_helper.cvtf(x, ty.mlir_type, loc=loc, ip=ip)
                super().__init__(x)
            # NOTE(review): an ir.Value of any other MLIR type falls through
            # without initializing the instance — confirm this is intended.
        elif isinstance(x, Integer):
            if isinstance(x.value, ir.Value):  # type: ignore
                x = arith_helper.itofp(
                    x.value, type(x).signed, ty.mlir_type, loc=loc, ip=ip
                )
            else:
                x = float(x.value)
            super().__init__(x)
        elif isinstance(x, Float):
            # Re-dispatch on the wrapped value (handles host and IR values).
            Float.__init__(self, x.value)
        else:
            raise DSLRuntimeError(f"{x} to Float conversion is not supported")
1491
+
1492
+
1493
class Boolean(Integer, metaclass=IntegerMeta, width=1, signed=True, mlir_type=T.bool):
    """Boolean type representation in the DSL (a 1-bit integer).

    :param a: Value to convert to Boolean
    :type a: Union[bool, int, float, "Value", Numeric]
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location], optional
    :param ip: Insertion point for MLIR operations, defaults to None
    :type ip: Optional[InsertionPoint], optional
    :raises DSLRuntimeError: If the input value cannot be converted to Boolean

    Conversion rules:

    1. Python bool/int/float: converted with Python's ``bool()``.
    2. Numeric: unwrapped via ``.value`` and converted recursively.
    3. ArithValue of type i1: used directly.
    4. Any other ArithValue: compared against a zero constant of its type.
    """

    def __init__(
        self, a: Union[bool, int, float, ir.Value, Numeric], *, loc=None, ip=None
    ):
        value = None
        if isinstance(a, (bool, int, float)):
            value = bool(a)
        elif isinstance(a, Numeric):
            # Recurse on the wrapped value; the recursive call completes the
            # full initialization, so return immediately.
            Boolean.__init__(self, a.value, loc=loc, ip=ip)
            return
        elif isinstance(a, ArithValue):
            if a.type == T.bool():
                value = a
            else:
                # Non-i1 value: truthiness is "compares unequal to zero".
                value = a != arith_helper.const(0, a.type, loc=loc, ip=ip)
        if value is None:
            # NOTE(review): a plain ir.Value that is not an ArithValue also
            # lands here, despite being admitted by the annotation — confirm.
            raise DSLRuntimeError(f"Cannot convert {a} to Boolean")
        super().__init__(value, loc=loc, ip=ip)
        # Lazily populated cache for the int8 representation (ir_value_int8).
        self._value_int8 = None

    def ir_value_int8(self, *, loc=None, ip=None):
        """
        Returns int8 ir value of Boolean.
        When we need to store Boolean tensor element, use ir_value_int8().

        :param loc: Source location information, defaults to None
        :type loc: Optional[Location], optional
        :param ip: Insertion point for MLIR operations, defaults to None
        :type ip: Optional[InsertionPoint], optional
        :return: The int8 value of this Boolean
        :rtype: ir.Value
        """
        # Cache: converting to int8 emits IR, so do it at most once.
        if self._value_int8 is not None:
            return self._value_int8
        self._value_int8 = Int8(self.value, loc=loc, ip=ip).ir_value()
        return self._value_int8

    def __neg__(self, *, loc=None, ip=None):
        """Negation operator is not supported for boolean type.

        :raises TypeError: Always raises this error as negation is not supported
        """
        raise TypeError("Negation, the operator `-` is not supported for boolean type")
1571
+
1572
+
1573
# ---------------------------------------------------------------------------
# Concrete integer types.
# Unsigned variants share the signless MLIR types (T.i8/T.i16/...), so MLIR
# signedness is tracked on the DSL side via the `signed` keyword.
# The 128-bit types build their MLIR type lazily (lambda) since `T` exposes
# no i128 shorthand.
# ---------------------------------------------------------------------------


class Int8(Integer, metaclass=IntegerMeta, width=8, signed=True, mlir_type=T.i8): ...


class Int16(Integer, metaclass=IntegerMeta, width=16, signed=True, mlir_type=T.i16): ...


class Int32(Integer, metaclass=IntegerMeta, width=32, signed=True, mlir_type=T.i32): ...


class Int64(Integer, metaclass=IntegerMeta, width=64, signed=True, mlir_type=T.i64): ...


class Int128(
    Integer, metaclass=IntegerMeta, width=128, signed=True, mlir_type=lambda: T.i(128)
): ...


class Uint8(Integer, metaclass=IntegerMeta, width=8, signed=False, mlir_type=T.i8): ...


class Uint16(
    Integer, metaclass=IntegerMeta, width=16, signed=False, mlir_type=T.i16
): ...


class Uint32(
    Integer, metaclass=IntegerMeta, width=32, signed=False, mlir_type=T.i32
): ...


class Uint64(
    Integer, metaclass=IntegerMeta, width=64, signed=False, mlir_type=T.i64
): ...


class Uint128(
    Integer, metaclass=IntegerMeta, width=128, signed=False, mlir_type=lambda: T.i(128)
): ...
1611
+
1612
+
1613
class Float64(Float, metaclass=FloatMeta, width=64, mlir_type=T.f64):
    """64-bit IEEE-754 float."""

    def __c_pointers__(self):
        """Box the host value in a C double and return it as a void pointer."""
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        boxed = ctypes.c_double(self.value)
        return [ctypes.cast(ctypes.pointer(boxed), ctypes.c_void_p)]
1621
+
1622
+
1623
class Float32(Float, metaclass=FloatMeta, width=32, mlir_type=T.f32):
    """32-bit IEEE-754 float."""

    @staticmethod
    def _get_c_pointer(value: float):
        """Box *value* in a C float and return it as a void pointer."""
        boxed = ctypes.c_float(value)
        return ctypes.cast(ctypes.pointer(boxed), ctypes.c_void_p)

    def __c_pointers__(self):
        """Return the ctypes pointer list used to marshal this host value."""
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        return [Float32._get_c_pointer(self.value)]
1633
+
1634
+
1635
class TFloat32(Float, metaclass=FloatMeta, width=32, mlir_type=T.tf32):
    """TensorFloat-32; host-side scalars are carried as ordinary C floats."""

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")
        # TF32 has no distinct host representation; reuse Float32's boxing.
        return [Float32._get_c_pointer(self.value)]
1640
+
1641
+
1642
class Float16(Float, metaclass=FloatMeta, width=16, mlir_type=T.f16):
    """16-bit IEEE-754 half-precision float."""

    @staticmethod
    def _get_c_pointer(value: float):
        # Convert float to float16 binary representation.
        # First convert to numpy float16 to handle rounding/truncation.
        f16_val = np.float16(value)
        # Reinterpret the raw bits as a 16-bit integer.
        bits = f16_val.view(np.uint16)
        # Box the bits in a C short; ctypes integer types do no overflow
        # checking, so the unsigned bit pattern is stored verbatim.
        c_val = ctypes.c_short(bits)
        return ctypes.cast(ctypes.pointer(c_val), ctypes.c_void_p)

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")
        return [Float16._get_c_pointer(self.value)]
1658
+
1659
+
1660
class BFloat16(Float, metaclass=FloatMeta, width=16, mlir_type=T.bf16):
    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        # NOTE(review): `Float` defines no `__c_pointers__` of its own, so this
        # resolves to `Numeric.__c_pointers__`, which always raises ValueError.
        # Presumably host-side passing of bf16 scalars is intentionally
        # unsupported — confirm.
        return Float.__c_pointers__(self)
1666
+
1667
+
1668
# Narrow-precision float types (device-code only; see Float's class docstring).


class Float8E5M2(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E5M2): ...


class Float8E4M3FN(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3FN): ...


class Float8E4M3B11FNUZ(
    Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3B11FNUZ
): ...


class Float8E4M3(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3): ...


class Float8E8M0FNU(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E8M0FNU): ...


class Float4E2M1FN(Float, metaclass=FloatMeta, width=4, mlir_type=T.f4E2M1FN): ...


class Float6E3M2FN(Float, metaclass=FloatMeta, width=6, mlir_type=T.f6E3M2FN): ...


class Float6E2M3FN(Float, metaclass=FloatMeta, width=6, mlir_type=T.f6E2M3FN): ...


# Float formats that conversions may not target.
# NOTE(review): presumably consumed by conversion helpers elsewhere — confirm.
_unsupported_dst_float_types = [
    Float8E4M3,
    Float8E4M3B11FNUZ,
    Float4E2M1FN,
    Float6E3M2FN,
    Float6E2M3FN,
]
1703
+
1704
+
1705
# Registry of every concrete DSL scalar type, plus the name -> type lookup
# table used by `dtype()` below.
ALL_DTYPES = {
    Int8,
    Int16,
    Int32,
    Int64,
    Int128,
    Uint8,
    Uint16,
    Uint32,
    Uint64,
    Uint128,
    BFloat16,
    Float16,
    Float32,
    TFloat32,
    Float64,
    Float8E5M2,
    Float8E4M3,
    Float8E4M3FN,
    Float8E8M0FNU,
    Float8E4M3B11FNUZ,
    Float4E2M1FN,
    Float6E2M3FN,
    Float6E3M2FN,
}
__STR_TO_DTYPE__ = {dt.__name__: dt for dt in ALL_DTYPES}
1731
+
1732
+
1733
def dtype(dtype_) -> Type[Numeric]:
    """Resolve a dtype specifier to a :class:`Numeric` subclass.

    :param dtype_: either a ``Numeric`` subclass (returned unchanged) or the
        name of a DSL dtype (e.g. ``"Float32"``, see ``ALL_DTYPES``).
    :return: the resolved dtype class
    :rtype: Type[Numeric]
    :raises TypeError: if ``dtype_`` cannot be interpreted as a data type.
    """
    # Generalization: a Numeric subclass already is a dtype — pass it through
    # (previously this raised). Backward compatible for all string inputs.
    if isinstance(dtype_, type) and issubclass(dtype_, Numeric):
        return dtype_
    if const_expr(isinstance(dtype_, str) and dtype_ in __STR_TO_DTYPE__):
        return __STR_TO_DTYPE__[dtype_]
    raise TypeError(f"can't interpret {dtype_} as data type")
1741
+
1742
+
1743
+ ##############################################################
1744
+ # Tensor
1745
+ ##############################################################
1746
+
1747
+
1748
class TensorMeta(DslType):
    """Metaclass for tensor types parameterized by element type and shape.

    Examples:
        >>> Tensor[Int32, (3,)]
        >>> Tensor[Float32, (3, 4)]
        >>> T = TypeVar("T")
        >>> Tensor[T, (3, 4, 5)]
    """

    # Bug fix: this docstring previously appeared *after* the attribute
    # assignments, making it a dead string literal instead of the class
    # docstring; it is now the first statement in the class body.

    # Defaults for unparameterized tensor types.
    _element_type = Any
    _shape = Any

    def __new__(cls, name, bases, attrs, element_type=Any, shape=Any):
        new_cls = super().__new__(cls, name, bases, attrs)
        new_cls._element_type = element_type
        new_cls._shape = shape
        return new_cls
1765
+
1766
+
1767
# Generic type parameter used by the annotation helper classes below.
TY = TypeVar("TY")


class Constexpr(Generic[TY]):
    """Annotation marker: value is passed and computed by the Python interpreter."""

    pass
1775
+
1776
+
1777
class align:
    """Pointer-alignment annotation; the value must be a positive power of two."""

    def __init__(self, value: int):
        # A positive power of two has exactly one bit set.
        is_pow2 = value > 0 and (value & (value - 1)) == 0
        if not is_pow2:
            raise DSLRuntimeError("expects align be power of 2 as positive value")
        self._value = value

    def __str__(self):
        return f"align({self._value})"
1786
+
1787
class PointerMeta(DslType):
    """Metaclass that parameterizes `Pointer` by pointee type and alignment."""

    def __new__(cls, name, bases, attrs, value_type=Int32, align_=align(1)):
        # The MLIR type is built lazily (lambda) from an unranked memref of
        # the pointee type — presumably so it is only constructed under a
        # live MLIR context; confirm against DslType.
        new_cls = super().__new__(
            cls,
            name,
            bases,
            attrs,
            mlir_type=lambda: getattr(ir, "UnrankedMemRefType").get(
                value_type.mlir_type, getattr(ir, "Attribute").parse("0")
            ),
        )
        new_cls._value_type = value_type
        new_cls._align = align_
        return new_cls

    def __eq__(cls, other):
        # Two pointer types are equal iff pointee type and alignment agree.
        if not isinstance(other, PointerMeta):
            return False
        return (
            cls._value_type == other._value_type
            and cls._align._value == other._align._value
        )  # Compare alignment values

    def __hash__(cls):
        # Must stay consistent with __eq__ above.
        return hash((cls._value_type, cls._align._value))  # Hash alignment value

    def __getitem__(cls, params) -> Type["Pointer"]:
        # Supports the `Pointer[value_type, align(n)]` subscription syntax.
        value_type, align_ = params

        if not isinstance(align_, align):
            raise DSLRuntimeError(f"expects align but got {align_}")

        # Create new class with proper name and parameters
        new_cls = type(
            f"Pointer[{value_type.__name__}, {align_}]",
            (Pointer,),
            {},
            value_type=value_type,
            align_=align_,  # Pass alignment to __new__
        )
        return new_cls

    def __str__(cls):
        return f"ptr<{cls._value_type}, {cls._align}>"
1831
+
1832
+
1833
class Pointer(metaclass=PointerMeta):
    """
    A pointer to a memory location, parameterized by pointee type and
    alignment via subscription.

    Examples:

        def foo(a : Pointer[Int32, align=8]):
            ...

    """

    def __init__(self, value):
        # The raw value being pointed at / wrapped.
        self.value = value

    def __str__(self):
        return "{} : {}".format(self.value, type(self))
1849
+
1850
+
1851
class IRConst(Generic[TY]):
    """Annotation marker: value is passed as an MLIR constant (arith.constant)."""

    def __init__(self, ty: TY):
        self.ty = ty


class IRValue(Generic[TY]):
    """Annotation marker: value is passed as an MLIR dynamic value."""

    def __init__(self, ty: TY):
        self.ty = ty
1863
+
1864
+
1865
class IRVariadic:
    """Helper carrying a variadic number of operands into a function."""

    def __init__(self, operands):
        """Wrap ``operands``; each entry must be a dynamic (MLIR) value."""
        self.operands = operands

    def block_arg_types(self):
        """Return the block-argument type of every wrapped operand, in order."""
        return [op.type for op in self.operands]

    def set_func_args(self, block_args):
        """Hook called after entering a function with the block arguments that
        correspond to the passed operands.

        Derived classes may override this to provide convenience getters for
        the block arguments; the base implementation intentionally does
        nothing.
        """
        pass

    def __len__(self):
        """Number of wrapped operands."""
        return len(self.operands)
1896
+
1897
+
1898
class FuncArgWithAttr(IRValue):
    """
    An IRValue for a func-op argument that additionally carries an MLIR
    attribute (name plus type and/or value).
    """

    def __init__(self, ty, attr_name, attr_ty, attr_value=None):
        """
        :param ty: the argument's MLIR type (forwarded to IRValue)
        :param attr_name: name of the attribute; must not be None
        :param attr_ty: attribute type; may be None only if attr_value is given
        :param attr_value: attribute value; may be None only if attr_ty is given
        :raises ValueError: if attr_name is None, or both attr_ty and
            attr_value are None
        """
        super().__init__(ty)
        # Explicit raise instead of `assert`: asserts are stripped under -O,
        # which would silently admit invalid arguments.
        if attr_name is None or (attr_ty is None and attr_value is None):
            raise ValueError(
                "Invalid attr_name and/or attr_ty and/or attr_value for FuncArgWithAttr"
            )
        self.attr_name = attr_name
        self.attr_ty = attr_ty
        self.attr_value = attr_value
1911
+
1912
+
1913
+
1914
def implicitDowncastNumericType(value):
    """Lower a ``Numeric`` wrapper to its underlying IR value; anything else
    is passed through unchanged."""
    return value.ir_value() if isinstance(value, Numeric) else value
1918
+
1919
+
1920
# Public API of this module.
# Bug fix: "Float8E4M3" was previously listed twice.
__all__ = [
    "DslType",
    "Numeric",
    "NumericMeta",
    "IntegerMeta",
    "FloatMeta",
    "Boolean",
    "Integer",
    "Int16",
    "Int32",
    "Int64",
    "Int128",
    "Int8",
    "Uint8",
    "Uint16",
    "Uint32",
    "Uint64",
    "Uint128",
    "Float",
    "Float16",
    "BFloat16",
    "TFloat32",
    "Float32",
    "Float64",
    "Float8E5M2",
    "Float8E4M3",
    "Float8E4M3FN",
    "Float8E4M3B11FNUZ",
    "Float8E8M0FNU",
    "Float4E2M1FN",
    "Float6E2M3FN",
    "Float6E3M2FN",
    "as_numeric",
    "align",
    "Pointer",
    "dtype",
    "Constexpr",
    "IRConst",
    "IRValue",
    "IRVariadic",
    "implicitDowncastNumericType",
]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/utils/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
# Re-export the utility submodules (stacktrace, logger, timer) so callers can
# write e.g. `from base_dsl.utils import logger`.
from . import stacktrace
from . import logger
from . import timer

__all__ = [
    "logger",
    "timer",
    "stacktrace",
]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/utils/logger.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides logging helper functions
14
+ """
15
+
16
+ import logging
17
+
18
# Module-level logger; replaced by setup_log() and initialized to a quiet
# "generic" logger at the bottom of this module.
logger = None


def log():
    """Return the current module-level logger instance."""
    return logger
23
+
24
+
25
+ def setup_log(
26
+ name, log_to_console=False, log_to_file=False, log_file_path=None, log_level=1
27
+ ):
28
+ """Set up and configure a logger with console and/or file handlers.
29
+
30
+ :param name: Name of the logger to create
31
+ :type name: str
32
+ :param log_to_console: Whether to enable logging to console, defaults to False
33
+ :type log_to_console: bool, optional
34
+ :param log_to_file: Whether to enable logging to file, defaults to False
35
+ :type log_to_file: bool, optional
36
+ :param log_file_path: Path to the log file, required if log_to_file is True
37
+ :type log_file_path: str, optional
38
+ :param log_level: Logging level to set, defaults to 1
39
+ :type log_level: int, optional
40
+ :raises ValueError: If log_to_file is True but log_file_path is not provided
41
+ :return: Configured logger instance
42
+ :rtype: logging.Logger
43
+ """
44
+ # Create a custom logger
45
+ global logger
46
+ logger = logging.getLogger(name)
47
+ if log_to_console or log_to_file:
48
+ logger.setLevel(log_level)
49
+ else:
50
+ # Makes sure logging is OFF
51
+ logger.setLevel(logging.CRITICAL + 1)
52
+
53
+ # Clear existing handlers to prevent duplicate logs
54
+ if logger.hasHandlers():
55
+ logger.handlers.clear()
56
+
57
+ # Define formatter
58
+ formatter = logging.Formatter(
59
+ f"%(asctime)s - %(name)s - %(levelname)s - [%(funcName)s] - %(message)s"
60
+ )
61
+
62
+ # Add console handler if enabled
63
+ if log_to_console:
64
+ console_handler = logging.StreamHandler()
65
+ console_handler.setLevel(log_level)
66
+ console_handler.setFormatter(formatter)
67
+ logger.addHandler(console_handler)
68
+
69
+ # Add file handler if enabled
70
+ if log_to_file:
71
+ if not log_file_path:
72
+ raise ValueError("log_file_path must be provided when enable_file is True")
73
+ file_handler = logging.FileHandler(log_file_path)
74
+ file_handler.setLevel(log_level)
75
+ file_handler.setFormatter(formatter)
76
+ logger.addHandler(file_handler)
77
+
78
+ return logger
79
+
80
+
81
+ logger = setup_log("generic")
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/utils/stacktrace.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides stacktrace helper functions
14
+ """
15
+
16
+ import os
17
+ import re
18
+
19
+
20
def walk_to_top_module(start_path):
    """
    Walk up from the start_path to find the top-level Python module.

    Ascends while the current directory contains an ``__init__.py`` and stops
    at the first directory that does not (or at the filesystem root).

    :param start_path: The path to start from.
    :return: The path of the top-level module, or None if the walk reached the
        filesystem root without leaving a package.
    """
    path = start_path
    while os.path.dirname(path) != path:  # not yet at the filesystem root
        if not os.path.isfile(os.path.join(path, "__init__.py")):
            # Not (or no longer) inside a package: stop here.
            break
        path = os.path.dirname(path)

    # Reached the root without finding a non-package directory: no module.
    if os.path.dirname(path) == path and not os.path.isfile(
        os.path.join(path, "__init__.py")
    ):
        return None

    return path
51
+
52
+
53
def _filter_internal_frames(traceback, internal_path):
    """
    Filter out stack frames from the traceback that belong to the specified module path.

    Frames whose absolute file path starts with ``internal_path`` are unlinked
    from the ``tb_next`` chain, hiding internal implementation details from the
    error traceback shown to users.

    :param traceback: head of a traceback chain (linked via ``tb_next``)
    :param internal_path: absolute path prefix identifying internal frames
    :return: the (possibly new) head of the filtered traceback
    """
    iter_prev = None  # last frame that was kept
    iter_tb = traceback
    while iter_tb is not None:
        if os.path.abspath(iter_tb.tb_frame.f_code.co_filename).startswith(
            internal_path
        ):
            # Internal frame: splice it out of the chain.  The final frame
            # (tb_next is None) is left in place so the raise site stays
            # visible.
            if iter_tb.tb_next:
                if iter_prev:
                    iter_prev.tb_next = iter_tb.tb_next
                else:
                    # Removing the head frame: advance the returned head.
                    traceback = iter_tb.tb_next
        else:
            iter_prev = iter_tb
        iter_tb = iter_tb.tb_next
    return traceback
76
+
77
+
78
# Names of functions synthesized by the DSL's AST pre-processor for
# control-flow regions (loop bodies, while/if/elif regions and their blocks).
# Frames bearing these names are treated as generated when de-duplicating
# tracebacks below.
_generated_function_names = re.compile(
    r"^(loop_body|while_region|while_before_block|while_after_block|if_region|then_block|else_block|elif_region)_\d+$"
)
81
+
82
+
83
def _filter_duplicated_frames(traceback):
    """
    Filter out duplicated stack frames from the traceback.
    The function filters out consecutive frames that are in the same file and have the same line number.
    In a sequence of consecutive frames, the logic prefers to keep the non-generated frame or the last frame.

    :param traceback: The traceback object to filter (mutated in place).
    :return: The (possibly new) head of the de-duplicated traceback chain.
    """
    iter_prev = None
    iter_tb = traceback
    while iter_tb is not None:
        skip_current = False
        skip_next = False
        if iter_tb.tb_next:
            current_filename = os.path.abspath(iter_tb.tb_frame.f_code.co_filename)
            next_filename = os.path.abspath(iter_tb.tb_next.tb_frame.f_code.co_filename)
            # if in the same file, check if the line number is the same
            if current_filename == next_filename:
                current_lineno = iter_tb.tb_lineno
                next_lineno = iter_tb.tb_next.tb_lineno
                if current_lineno == next_lineno:
                    # Same file and line number, check name, if current is generated, skip current, otherwise skip next
                    name = iter_tb.tb_frame.f_code.co_name
                    is_generated = bool(_generated_function_names.match(name))
                    if is_generated:
                        # Skip current
                        skip_current = True
                    else:
                        # Skip next if it's generated, otherwise keep both
                        next_name = iter_tb.tb_next.tb_frame.f_code.co_name
                        skip_next = bool(_generated_function_names.match(next_name))
        if skip_current:
            # Unlink the current frame; re-root the chain when it was the head.
            if iter_prev:
                iter_prev.tb_next = iter_tb.tb_next
            else:
                traceback = iter_tb.tb_next
        elif skip_next:
            # if next is last frame, don't skip
            if iter_tb.tb_next.tb_next:
                iter_tb.tb_next = iter_tb.tb_next.tb_next
            iter_prev = iter_tb
        else:
            iter_prev = iter_tb
        iter_tb = iter_tb.tb_next

    return traceback
127
+
128
+
129
def filter_stackframe(traceback, prefix_path):
    """
    Filter out stack frames from the traceback that belong to the specified module path.

    Frames whose file path starts with ``prefix_path`` are removed first,
    hiding internal implementation details from the error traceback shown to
    users. Consecutive frames pointing at the same file and line are then
    consolidated so generated wrapper frames do not clutter the output.

    :param traceback: The traceback object to filter.
    :param prefix_path: The path prefix to filter out from the traceback.
    :return: The filtered traceback with internal frames removed.
    """
    # Step 1: drop frames originating from the internal module path.
    without_internal = _filter_internal_frames(traceback, prefix_path)
    # Step 2: collapse consecutive duplicated frames left behind.
    return _filter_duplicated_frames(without_internal)
146
+
147
+
148
def filter_exception(value, module_dir):
    """
    Filter out internal implementation details from exception traceback.

    This function recursively processes an exception and its cause chain,
    removing stack frames that belong to the specified module directory.
    This helps to present cleaner error messages to users by hiding
    implementation details.

    :param value: The exception object to filter (modified in place).
    :param module_dir: The module directory path to filter out from tracebacks.
    :return: None. The exception's tracebacks are updated in place.
    """
    # Clean the cause chain first so chained exceptions are filtered as well.
    if hasattr(value, "__cause__") and value.__cause__:
        filter_exception(value.__cause__, module_dir)

    if hasattr(value, "__traceback__"):
        # filter_stackframe returns a (possibly new) head when leading frames
        # are dropped; re-attach it so internal frames at the head of the
        # chain are not silently kept (previously the return value was
        # discarded, losing head-of-chain filtering).
        value.__traceback__ = filter_stackframe(value.__traceback__, module_dir)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/base_dsl/utils/timer.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ """
13
+ This module provides a timing helper functions
14
+ """
15
+ from functools import wraps
16
+
17
+ from .logger import log
18
+
19
+
20
# TODO: revisit this part when mlir timing manager is ready for pybind.
def timer(*dargs, **kwargs):
    """
    Decorator that logs the execution time of the wrapped callable.

    Supports both usage forms::

        @timer
        def f(): ...

        @timer(enable=False)
        def g(): ...

    :param enable: Keyword flag; when False the wrapped callable is invoked
                   directly with no timing overhead (default True).
    :return: The decorated callable, or a decorator when called with arguments.
    """
    enable = kwargs.get("enable", True)

    def decorator(func):
        @wraps(func)
        def func_wrapper(*args, **kwargs):
            if not enable:
                return func(*args, **kwargs)
            # perf_counter is monotonic and high-resolution; time.time() is
            # wall-clock and can jump with system clock adjustments, which
            # would corrupt the measured interval.
            from time import perf_counter

            start = perf_counter()
            result = func(*args, **kwargs)
            end = perf_counter()

            # Convert time from seconds to us
            spend_us = (end - start) * 1e6

            # Determine the function type and format the log message
            if hasattr(func, "__name__"):
                func_name = func.__name__
                log_message = f"[JIT-TIMER] Function: {func_name} | Execution Time: {spend_us:.2f} µs"
            elif "CFunctionType" in str(type(func)):
                log_message = f"[JIT-TIMER] C API Function: {str(func)} | Execution Time: {spend_us:.2f} µs"
            else:
                log_message = f"[JIT-TIMER] Anonymous Function | Execution Time: {spend_us:.2f} µs"

            log().info(log_message)

            return result

        return func_wrapper

    # Bare @timer form: the function itself arrives as the single positional
    # argument and is decorated directly.
    if len(dargs) == 1 and callable(dargs[0]):
        return decorator(dargs[0])
    else:
        return decorator
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/__init__.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
# NOTE: `all_` and `any_` were previously listed twice in this import; the
# duplicates were redundant and have been removed.
from .cutlass_dsl import (
    Constexpr,
    as_numeric,
    min,
    max,
    and_,
    or_,
    all_,
    any_,
    not_,
    select_,
    # Control-flow without AST pre-processor
    if_generate,
    for_generate,
    LoopUnroll,
    while_generate,
    yield_out,
    # Control-flow with AST pre-processor
    range_constexpr,
    range_dynamic,
    const_expr,
    dynamic_expr,
    # Data types
    dtype,  # Provides conversions to types inheriting from NumericType
    DSLRuntimeError,
    JitArgAdapterRegistry,
    # Construction utilities for user-defined classes
    extract_mlir_values,
    new_from_mlir_values,
)
44
+
45
+ from .cute.typing import *
46
+
47
+ # Utilities not belonging to CuTe
48
+ from . import utils as utils
49
+
50
+ # Used as internal symbol
51
+ from . import cutlass_dsl as _dsl
52
+
53
+ # Aliases
54
+ LaunchConfig = _dsl.BaseDSL.LaunchConfig
55
+ register_jit_arg_adapter = _dsl.JitArgAdapterRegistry.register_jit_arg_adapter
56
+ gpu = _dsl.cutlass_gpu
57
+ cuda = _dsl.cuda_helpers
58
+
59
+ CACHE_FILE = "compiled_cache.db"
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/__init__.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ # Use the auto-generated enum AddressSpace
13
+ from cutlass._mlir.dialects.cute import AddressSpace
14
+
15
+ # Explicitly import types that might be directly used by other modules.
16
+ # This is a fix for using Sphinx to generate documentation
17
+ # Because Sphinx processes each module in isolation, it won't be able to rely
18
+ # on re-exported symbols via wildcard imports (from .typing import *) in the
19
+ # same way that Python does at runtime.
20
+ from .typing import (
21
+ Shape,
22
+ Stride,
23
+ IntTuple,
24
+ Coord,
25
+ Tile,
26
+ XTuple,
27
+ Tiler,
28
+ Layout,
29
+ Pointer,
30
+ Tensor,
31
+ )
32
+
33
+ # Import everything else
34
+ from .typing import *
35
+
36
+ from .core import (
37
+ assume,
38
+ is_integer,
39
+ is_int_tuple,
40
+ is_static,
41
+ size,
42
+ has_underscore,
43
+ slice_,
44
+ make_ptr,
45
+ make_layout,
46
+ recast_layout,
47
+ make_fragment_like,
48
+ depth,
49
+ rank,
50
+ flatten_to_tuple,
51
+ flatten,
52
+ unflatten,
53
+ product,
54
+ product_like,
55
+ shape,
56
+ size_in_bytes,
57
+ make_identity_layout,
58
+ make_ordered_layout,
59
+ make_composed_layout,
60
+ make_layout_tv,
61
+ make_swizzle,
62
+ recast_ptr,
63
+ make_tensor,
64
+ make_identity_tensor,
65
+ make_fragment,
66
+ recast_tensor,
67
+ get,
68
+ select,
69
+ front,
70
+ is_major,
71
+ leading_dim,
72
+ find,
73
+ find_if,
74
+ coalesce,
75
+ group_modes,
76
+ cosize,
77
+ dice,
78
+ product_each,
79
+ prepend,
80
+ append,
81
+ prepend_ones,
82
+ append_ones,
83
+ ceil_div,
84
+ slice_and_offset,
85
+ crd2idx,
86
+ domain_offset,
87
+ elem_less,
88
+ transform_leaf,
89
+ filter_zeros,
90
+ filter,
91
+ tile_to_shape,
92
+ shape_div,
93
+ composition,
94
+ complement,
95
+ right_inverse,
96
+ left_inverse,
97
+ max_common_layout,
98
+ max_common_vector,
99
+ logical_product,
100
+ zipped_product,
101
+ tiled_product,
102
+ flat_product,
103
+ raked_product,
104
+ blocked_product,
105
+ flat_divide,
106
+ logical_divide,
107
+ zipped_divide,
108
+ tiled_divide,
109
+ local_partition,
110
+ local_tile,
111
+ printf,
112
+ print_tensor,
113
+ # tiled mma/tiled copy
114
+ make_mma_atom,
115
+ make_tiled_mma,
116
+ make_copy_atom,
117
+ make_tiled_copy_tv,
118
+ make_tiled_copy,
119
+ make_tiled_copy_S,
120
+ make_tiled_copy_D,
121
+ make_tiled_copy_A,
122
+ make_tiled_copy_B,
123
+ make_tiled_copy_C,
124
+ make_tiled_copy_C_atom,
125
+ basic_copy,
126
+ basic_copy_if,
127
+ autovec_copy,
128
+ copy,
129
+ copy_atom_call,
130
+ gemm,
131
+ # Wrapper classes
132
+ ComposedLayout,
133
+ Swizzle,
134
+ E,
135
+ Atom,
136
+ MmaAtom,
137
+ CopyAtom,
138
+ TiledCopy,
139
+ TiledMma,
140
+ TensorSSA,
141
+ ReductionOp,
142
+ full,
143
+ full_like,
144
+ empty_like,
145
+ ones_like,
146
+ zeros_like,
147
+ where,
148
+ any_,
149
+ all_,
150
+ # User defined struct
151
+ struct,
152
+ pretty_str,
153
+ make_layout_image_mask,
154
+ repeat_like,
155
+ round_up,
156
+ is_congruent,
157
+ is_weakly_congruent,
158
+ ScaledBasis,
159
+ get_divisibility,
160
+ Ratio,
161
+ )
162
+
163
+ from . import arch
164
+ from . import nvgpu
165
+ from . import testing
166
+ from . import runtime
167
+
168
+ # Export all math ops without "math."
169
+ from .math import *
170
+
171
+ # Used as internal symbol
172
+ from .. import cutlass_dsl as _dsl
173
+
174
+ # Aliases
175
+ jit = _dsl.CuTeDSL.jit
176
+ kernel = _dsl.CuTeDSL.kernel
177
+ register_jit_arg_adapter = _dsl.JitArgAdapterRegistry.register_jit_arg_adapter
178
+ compile = _dsl.compile
179
+
180
+ # Explicitly export all symbols for documentation generation
181
+ __all__ = [
182
+ # Core types
183
+ "AddressSpace",
184
+ "Tensor",
185
+ "Layout",
186
+ "ComposedLayout",
187
+ "Swizzle",
188
+ "E",
189
+ "Atom",
190
+ "MmaAtom",
191
+ "CopyAtom",
192
+ "TiledCopy",
193
+ "TiledMma",
194
+ "TensorSSA",
195
+ # Basic utility functions
196
+ "assume",
197
+ "is_integer",
198
+ "is_int_tuple",
199
+ "is_static",
200
+ "size",
201
+ "has_underscore",
202
+ "slice_",
203
+ "depth",
204
+ "rank",
205
+ "shape",
206
+ "printf",
207
+ "print_tensor",
208
+ "pretty_str",
209
+ # Layout functions
210
+ "make_layout",
211
+ "recast_layout",
212
+ "make_identity_layout",
213
+ "make_ordered_layout",
214
+ "make_composed_layout",
215
+ "make_layout_tv",
216
+ "make_layout_image_mask",
217
+ # Tensor functions
218
+ "make_ptr",
219
+ "make_tensor",
220
+ "make_identity_tensor",
221
+ "make_fragment",
222
+ "make_fragment_like",
223
+ "recast_ptr",
224
+ "recast_tensor",
225
+ # Tensor manipulation
226
+ "get",
227
+ "select",
228
+ "front",
229
+ "is_major",
230
+ "leading_dim",
231
+ "find",
232
+ "find_if",
233
+ "coalesce",
234
+ "group_modes",
235
+ "cosize",
236
+ "size_in_bytes",
237
+ # Tuple operations
238
+ "flatten_to_tuple",
239
+ "flatten",
240
+ "product",
241
+ "product_like",
242
+ "product_each",
243
+ "prepend",
244
+ "append",
245
+ "prepend_ones",
246
+ "append_ones",
247
+ # Math operations
248
+ "ceil_div",
249
+ "round_up",
250
+ # Layout operations
251
+ "slice_and_offset",
252
+ "crd2idx",
253
+ "domain_offset",
254
+ "elem_less",
255
+ "filter_zeros",
256
+ "filter",
257
+ "tile_to_shape",
258
+ "shape_div",
259
+ "dice",
260
+ # Layout algebra
261
+ "composition",
262
+ "complement",
263
+ "right_inverse",
264
+ "left_inverse",
265
+ "max_common_layout",
266
+ "max_common_vector",
267
+ "is_congruent",
268
+ "is_weakly_congruent",
269
+ # Product operations
270
+ "logical_product",
271
+ "zipped_product",
272
+ "tiled_product",
273
+ "flat_product",
274
+ "raked_product",
275
+ "blocked_product",
276
+ # Division operations
277
+ "flat_divide",
278
+ "logical_divide",
279
+ "zipped_divide",
280
+ "tiled_divide",
281
+ "local_partition",
282
+ "local_tile",
283
+ # MMA and Copy operations
284
+ "make_mma_atom",
285
+ "make_tiled_mma",
286
+ "make_copy_atom",
287
+ "make_tiled_copy_tv",
288
+ "make_tiled_copy",
289
+ "make_tiled_copy_C_atom",
290
+ "basic_copy",
291
+ "basic_copy_if",
292
+ "autovec_copy",
293
+ "copy",
294
+ "copy_atom_call",
295
+ "gemm",
296
+ # Tensor creation
297
+ "full",
298
+ "full_like",
299
+ "empty_like",
300
+ "ones_like",
301
+ "zeros_like",
302
+ "where",
303
+ "any_",
304
+ "all_",
305
+ "repeat_like",
306
+ "ScaledBasis",
307
+ # User defined struct
308
+ "struct",
309
+ # Modules
310
+ "arch",
311
+ "nvgpu",
312
+ "testing",
313
+ "runtime",
314
+ # Decorators and code generation
315
+ "jit",
316
+ "kernel",
317
+ "register_jit_arg_adapter",
318
+ "compile",
319
+ ]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/__init__.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from .elect import *
13
+ from .mbar import *
14
+ from .nvvm_wrappers import *
15
+ from .smem import *
16
+ from .tmem import *
17
+
18
+ # __all__ is required here for documentation generation
19
+ __all__ = [
20
+ #
21
+ # elect.py
22
+ #
23
+ "make_warp_uniform",
24
+ "elect_one",
25
+ #
26
+ # mbar.py
27
+ #
28
+ "mbarrier_init",
29
+ "mbarrier_init_fence",
30
+ "mbarrier_arrive_and_expect_tx",
31
+ "mbarrier_expect_tx",
32
+ "mbarrier_wait",
33
+ "mbarrier_try_wait",
34
+ "mbarrier_conditional_try_wait",
35
+ "mbarrier_arrive",
36
+ #
37
+ # nvvm_wrappers.py
38
+ #
39
+ "lane_idx",
40
+ "warp_idx",
41
+ "thread_idx",
42
+ "block_dim",
43
+ "block_idx",
44
+ "grid_dim",
45
+ "cluster_idx",
46
+ "cluster_dim",
47
+ "block_in_cluster_idx",
48
+ "block_in_cluster_dim",
49
+ "block_idx_in_cluster",
50
+ "shuffle_sync",
51
+ "shuffle_sync_up",
52
+ "shuffle_sync_down",
53
+ "shuffle_sync_bfly",
54
+ "barrier",
55
+ "barrier_arrive",
56
+ "sync_threads",
57
+ "sync_warp",
58
+ "fence_acq_rel_cta",
59
+ "fence_acq_rel_cluster",
60
+ "fence_acq_rel_gpu",
61
+ "fence_acq_rel_sys",
62
+ "cp_async_commit_group",
63
+ "cp_async_wait_group",
64
+ "cp_async_bulk_commit_group",
65
+ "cp_async_bulk_wait_group",
66
+ "cluster_wait",
67
+ "cluster_arrive",
68
+ "cluster_arrive_relaxed",
69
+ "fence_proxy",
70
+ "vote_ballot_sync",
71
+ "popc",
72
+ "fence_view_async_tmem_load",
73
+ "fence_view_async_tmem_store",
74
+ "warpgroup_reg_alloc",
75
+ "warpgroup_reg_dealloc",
76
+ "fma_packed_f32x2",
77
+ "mul_packed_f32x2",
78
+ "add_packed_f32x2",
79
+ "fmax",
80
+ "rcp_approx",
81
+ "exp2",
82
+ # Constants
83
+ "WARP_SIZE",
84
+ # Forward from auto-generated nvvm python
85
+ "ProxyKind",
86
+ "SharedSpace",
87
+ "RoundingModeKind",
88
+ #
89
+ # smem.py
90
+ #
91
+ "alloc_smem",
92
+ "get_dyn_smem",
93
+ "get_dyn_smem_size",
94
+ #
95
+ # tmem.py
96
+ #
97
+ "retrieve_tmem_ptr",
98
+ "alloc_tmem",
99
+ "relinquish_tmem_alloc_permit",
100
+ "dealloc_tmem",
101
+ ]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/elect.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from cutlass.cutlass_dsl import CuTeDSL, T, dsl_user_op
13
+
14
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
15
+ from cutlass._mlir.dialects import nvvm, scf
16
+ from cutlass._mlir import ir
17
+
18
+ from ..typing import Int, Int32
19
+ from ...impl_utils import check_value_in
20
+
21
+
22
@dsl_user_op
def make_warp_uniform(value: Int, *, loc=None, ip=None) -> Int32:
    """
    Creates a warp-uniform value from the given integer input.

    :param value: The integer to make warp uniform.
    :type value: Int
    :return: The warp-uniform value equal to the input.
    :rtype: Int32
    """
    # Normalize the input to an Int32 IR value before handing it to the op,
    # then wrap the resulting IR value back into the DSL's Int32 type.
    ir_input = Int32(value).ir_value(loc=loc, ip=ip)
    uniform = _cute_nvgpu_ir.arch_make_warp_uniform(ir_input, loc=loc, ip=ip)
    return Int32(uniform)
37
+
38
+
39
class IfOpRegion:
    """
    A context manager for if Op.
    Automatically inserts `scf.yield([])` when exiting the context.
    """

    def __init__(self, block, *, loc=None, ip=None):
        # The MLIR block forming the body of the region being populated.
        self.block = block
        # Insertion point placed at the block so ops created inside the
        # `with` body are emitted into the region.
        self.insert_point = ir.InsertionPoint(self.block)
        self.loc = loc
        self.ip = ip

    def __enter__(self):
        self.insert_point.__enter__()
        # Expose the block arguments for `with ... as args` bindings.
        return self.block.arguments

    def __exit__(self, exc_type, exc_value, traceback):
        # Terminate the region with an empty yield before restoring the
        # previous insertion point.
        scf.yield_([], loc=self.loc, ip=self.ip)
        self.insert_point.__exit__(exc_type, exc_value, traceback)
58
+
59
+
60
@dsl_user_op
def elect_one(*, loc=None, ip=None) -> IfOpRegion:
    """
    Elects one thread within a warp.

    .. code-block:: python

        with elect_one():
            # Only one thread in the warp executes the code in this context
            pass
    """
    # This intrinsic is only available on Hopper/Blackwell targets.
    arch = CuTeDSL._get_dsl().envar.arch
    supported_archs = ["sm_90", "sm_90a", "sm_100a", "sm_100f"]
    check_value_in(arch, supported_archs, "arch")

    # elect.sync yields true on exactly one active lane; guard the region
    # body with an scf.if predicated on that leader flag.
    is_thread_leader = nvvm.elect_sync(T.bool())
    if_op = scf.IfOp(is_thread_leader, loc=loc, ip=ip)
    return IfOpRegion(if_op.then_block, loc=loc, ip=ip)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/mbar.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+ from typing import Optional
12
+
13
+ from cutlass.cutlass_dsl import CuTeDSL, T, if_generate, dsl_user_op
14
+
15
+ from cutlass._mlir.dialects import nvvm
16
+ from cutlass._mlir import ir
17
+
18
+ from ..typing import Pointer, Int, Boolean, Int32
19
+ from ...impl_utils import check_value_in
20
+
21
+
22
+ ####################################################################################################
23
+ #
24
+ # Mbarrier management utilities
25
+ #
26
+ ####################################################################################################
27
+
28
+
29
@dsl_user_op
def mbarrier_init(mbar_ptr: Pointer, cnt: Int, *, loc=None, ip=None) -> None:
    """
    Initializes a mbarrier with the specified thread arrival count.

    :param mbar_ptr: A pointer to the mbarrier in SMEM
    :type mbar_ptr: Pointer
    :param cnt: The arrival count of the mbarrier
    :type cnt: Int
    """
    # Lower directly to the NVVM shared-memory mbarrier init op; the count
    # is normalized to an Int32 IR value first.
    nvvm.mbarrier_init_shared(
        mbar_ptr.llvm_ptr, Int32(cnt).ir_value(loc=loc, ip=ip), loc=loc, ip=ip
    )
42
+
43
+
44
@dsl_user_op
def mbarrier_init_fence(*, loc=None, ip=None) -> None:
    """
    A fence operation that applies to the mbarrier initializations.
    """
    # mbarrier fencing is only supported on these architectures.
    arch = CuTeDSL._get_dsl().envar.arch
    check_value_in(
        arch,
        [
            "sm_90",
            "sm_90a",
            "sm_100a",
            "sm_100f",
        ],
        "arch",
    )
    nvvm.fence_mbarrier_init(loc=loc, ip=ip)
61
+
62
+
63
@dsl_user_op
def mbarrier_arrive_and_expect_tx(
    mbar_ptr: Pointer, bytes: Int, peer_cta_rank_in_cluster=None, *, loc=None, ip=None
) -> None:
    """
    Arrives on a mbarrier and expects a specified number of transaction bytes.

    :param mbar_ptr: A pointer to the mbarrier in SMEM
    :type mbar_ptr: Pointer
    :param bytes: The number of transaction bytes
    :type bytes: Int
    :param peer_cta_rank_in_cluster: An optional CTA rank in cluster. If provided, the pointer to
                                     the mbarrier is converted to a remote address in the peer CTA's
                                     SMEM.
    """
    # mbarrier transactions are only supported on these architectures.
    arch = CuTeDSL._get_dsl().envar.arch
    check_value_in(
        arch,
        [
            "sm_90",
            "sm_90a",
            "sm_100a",
            "sm_100f",
        ],
        "arch",
    )

    mbar_llvm_ptr = mbar_ptr.llvm_ptr
    if peer_cta_rank_in_cluster is not None:
        # Map the local SMEM address to the peer CTA's address space so the
        # transaction lands on the remote barrier; scope becomes CLUSTER.
        mbar_llvm_ptr = nvvm.mapa_shared_cluster(
            mbar_llvm_ptr.type,
            mbar_llvm_ptr,
            Int32(peer_cta_rank_in_cluster).ir_value(loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )
        space = nvvm.MBarrierSpaceKind.CLUSTER
    else:
        space = nvvm.MBarrierSpaceKind.CTA

    nvvm.mbarrier_txn(
        mbar_llvm_ptr,
        Int32(bytes).ir_value(loc=loc, ip=ip),
        kind=nvvm.MBarrierTxnKind.ARRIVE_EXPECT_TX,
        space=space,
        loc=loc,
        ip=ip,
    )
111
+
112
+
113
@dsl_user_op
def mbarrier_expect_tx(
    mbar_ptr: Pointer, bytes: Int, peer_cta_rank_in_cluster=None, *, loc=None, ip=None
) -> None:
    """
    Expects a specified number of transaction bytes without an arrive.

    :param mbar_ptr: A pointer to the mbarrier in SMEM
    :type mbar_ptr: Pointer
    :param bytes: The number of transaction bytes
    :type bytes: Int
    :param peer_cta_rank_in_cluster: An optional CTA rank in cluster. If provided, the pointer to
                                     the mbarrier is converted to a remote address in the peer CTA's
                                     SMEM.
    """
    # mbarrier transactions are only supported on these architectures.
    arch = CuTeDSL._get_dsl().envar.arch
    check_value_in(
        arch,
        [
            "sm_90",
            "sm_90a",
            "sm_100a",
            "sm_100f",
        ],
        "arch",
    )

    mbar_llvm_ptr = mbar_ptr.llvm_ptr
    if peer_cta_rank_in_cluster is not None:
        # Map the local SMEM address to the peer CTA's address space, matching
        # the sibling functions mbarrier_arrive_and_expect_tx / mbarrier_arrive
        # which use mapa_shared_cluster for this identical mapping (this
        # previously used plain nvvm.mapa despite setting CLUSTER space).
        mbar_llvm_ptr = nvvm.mapa_shared_cluster(
            mbar_llvm_ptr.type,
            mbar_llvm_ptr,
            Int32(peer_cta_rank_in_cluster).ir_value(loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )
        space = nvvm.MBarrierSpaceKind.CLUSTER
    else:
        space = nvvm.MBarrierSpaceKind.CTA

    nvvm.mbarrier_txn(
        mbar_llvm_ptr,
        Int32(bytes).ir_value(loc=loc, ip=ip),
        kind=nvvm.MBarrierTxnKind.EXPECT_TX,
        space=space,
        loc=loc,
        ip=ip,
    )
161
+
162
+
163
@dsl_user_op
def mbarrier_wait(mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None) -> None:
    """
    Waits on a mbarrier with a specified phase.

    :param mbar_ptr: A pointer to the mbarrier in SMEM
    :type mbar_ptr: Pointer
    :param phase: The phase to wait for (either 0 or 1)
    :type phase: Int
    """
    # mbarrier waits are only supported on these architectures.
    arch = CuTeDSL._get_dsl().envar.arch
    check_value_in(
        arch,
        [
            "sm_90",
            "sm_90a",
            "sm_100a",
            "sm_100f",
        ],
        "arch",
    )

    timeout_ns = 10000000
    # This NVVM Op is a spin-loop wrapping the mbarrier.try_wait.parity.shared.b64 PTX
    # The timeout in ns only applies to the latter and this call is truly blocking
    nvvm.mbarrier_try_wait_parity_shared(
        mbar_ptr.llvm_ptr,
        Int32(phase).ir_value(loc=loc, ip=ip),
        Int32(timeout_ns).ir_value(loc=loc, ip=ip),
        loc=loc,
        ip=ip,
    )
195
+
196
+
197
@dsl_user_op
def mbarrier_try_wait(mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None) -> Boolean:
    """
    Attempts to wait on a mbarrier with a specified phase in a non-blocking fashion.

    :param mbar_ptr: A pointer to the mbarrier in SMEM
    :type mbar_ptr: Pointer
    :param phase: The phase to wait for (either 0 or 1)
    :type phase: Int
    :return: A boolean value indicating whether the wait operation was successful
    :rtype: Boolean
    """
    # mbarrier waits are only supported on these architectures.
    arch = CuTeDSL._get_dsl().envar.arch
    check_value_in(
        arch,
        [
            "sm_90",
            "sm_90a",
            "sm_100a",
            "sm_100f",
        ],
        "arch",
    )

    # TRY kind returns immediately with a success flag instead of spinning.
    return Boolean(
        nvvm.mbarrier_wait_parity(
            T.bool(),
            mbar_ptr.llvm_ptr,
            Int32(phase).ir_value(loc=loc, ip=ip),
            nvvm.MBarrierWaitKind.TRY,
            loc=loc,
            ip=ip,
        )
    )
231
+
232
+
233
@dsl_user_op
def mbarrier_conditional_try_wait(
    cond, mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None
) -> Boolean:
    """
    Conditionally attempts to wait on a mbarrier with a specified phase in a non-blocking fashion.

    :param cond: A boolean predicate
    :param mbar_ptr: A pointer to the mbarrier in SMEM
    :type mbar_ptr: Pointer
    :param phase: The phase to wait for (either 0 or 1)
    :type phase: Int
    :return: A boolean value indicating whether the wait operation was successful
    :rtype: Boolean
    """
    # mbarrier waits are only supported on these architectures.
    arch = CuTeDSL._get_dsl().envar.arch
    check_value_in(
        arch,
        [
            "sm_90",
            "sm_90a",
            "sm_100a",
            "sm_100f",
        ],
        "arch",
    )
    # Emit an IR-level conditional: when cond is false the wait is skipped
    # and the result defaults to True (treated as "already satisfied").
    return if_generate(
        cond,
        lambda: mbarrier_try_wait(mbar_ptr, phase, loc=loc, ip=ip),
        lambda: Boolean(True).ir_value(loc=loc, ip=ip),
        None,
        [Boolean],
    )
266
+
267
+
268
@dsl_user_op
def mbarrier_arrive(
    mbar_ptr: Pointer,
    peer_cta_rank_in_cluster: Optional[Int] = None,
    *,
    loc=None,
    ip=None,
) -> None:
    """
    Arrives on an mbarrier.

    :param mbar_ptr: A pointer to the mbarrier in SMEM
    :type mbar_ptr: Pointer
    :param peer_cta_rank_in_cluster: An optional CTA rank in cluster. If provided, the pointer to
                                     the mbarrier is converted to a remote address in the peer CTA's
                                     SMEM.
    """
    mbar_llvm_ptr = mbar_ptr.llvm_ptr
    if peer_cta_rank_in_cluster is not None:
        # Cluster-scoped arrives are only supported on these architectures;
        # the arch check is done lazily, only when a peer rank is given.
        arch = CuTeDSL._get_dsl().envar.arch
        check_value_in(
            arch,
            [
                "sm_90",
                "sm_90a",
                "sm_100a",
                "sm_100f",
            ],
            "arch",
        )

        # Map the local SMEM address to the peer CTA's address space so the
        # arrive lands on the remote barrier; scope becomes CLUSTER.
        mbar_llvm_ptr = nvvm.mapa_shared_cluster(
            mbar_llvm_ptr.type,
            mbar_llvm_ptr,
            Int32(peer_cta_rank_in_cluster).ir_value(loc=loc, ip=ip),
            loc=loc,
            ip=ip,
        )
        space = nvvm.MBarrierSpaceKind.CLUSTER
    else:
        space = nvvm.MBarrierSpaceKind.CTA

    # A plain arrive is a transaction with a count contribution of 1.
    nvvm.mbarrier_txn(
        mbar_llvm_ptr,
        Int32(1).ir_value(loc=loc, ip=ip),
        kind=nvvm.MBarrierTxnKind.ARRIVE,
        space=space,
        loc=loc,
        ip=ip,
    )
318
+
319
+
320
@dsl_user_op
def cp_async_mbarrier_arrive_noinc(mbar_ptr: Pointer, *, loc=None, ip=None) -> None:
    """
    Arrives on an mbarrier for async load **without incrementing** the arrival count
    (`cp.async.mbarrier.arrive.shared ..., noinc=1`).
    Used in the warp-specialized kernel when the non-TMA load warp(producer) is not the same
    as the math/epilogue warp(consumer).

    :param mbar_ptr: A pointer to the mbarrier in SMEM
    :type mbar_ptr: Pointer
    """
    # cp.async mbarrier arrives are only supported on these architectures.
    arch = CuTeDSL._get_dsl().envar.arch
    check_value_in(
        arch,
        [
            "sm_90",
            "sm_90a",
            "sm_100a",
            "sm_100f",
        ],
        "arch",
    )

    mbar_llvm_ptr = mbar_ptr.llvm_ptr
    # noinc=True: signal completion of outstanding cp.async ops without
    # contributing to the barrier's arrival count.
    nvvm.cp_async_mbarrier_arrive_shared(
        mbar_llvm_ptr,
        noinc=True,
        loc=loc,
        ip=ip,
    )
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py ADDED
@@ -0,0 +1,681 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from functools import partial
13
+ from typing import Optional, Tuple, Union, Callable
14
+ from typing_extensions import deprecated
15
+
16
+ from cutlass.cutlass_dsl import T, dsl_user_op
17
+
18
+ from cutlass._mlir import ir
19
+ from cutlass._mlir.dialects import llvm, nvvm, vector
20
+
21
+ # Forward nvvm enums
22
+ from cutlass._mlir.dialects.nvvm import (
23
+ ProxyKind,
24
+ SharedSpace,
25
+ Tcgen05WaitKind,
26
+ SetMaxRegisterAction,
27
+ RoundingModeKind,
28
+ )
29
+
30
+ from ..typing import (
31
+ Int,
32
+ Boolean,
33
+ Int16,
34
+ Uint16,
35
+ Int32,
36
+ Uint32,
37
+ Int64,
38
+ Float32,
39
+ BFloat16,
40
+ Numeric,
41
+ as_numeric,
42
+ )
43
+
44
# Number of threads in a hardware warp.
WARP_SIZE = 32
# All-lanes membership mask for warp-synchronous operations (32 set bits).
FULL_MASK = 0xFFFFFFFF
46
+
47
+
48
@dsl_user_op
def lane_idx(*, loc=None, ip=None) -> Int32:
    """Return the lane index of the calling thread within its warp (``laneid`` register)."""
    raw = nvvm.read_ptx_sreg_laneid(T.i32(), loc=loc, ip=ip)
    return Int32(raw)
54
+
55
+
56
@dsl_user_op
def warp_idx(*, loc=None, ip=None) -> Int32:
    """
    Returns the warp index within a CTA.

    The 3-D thread index is linearized (x fastest-varying) and divided by the
    warp size.
    """
    tid_x = Int32(nvvm.read_ptx_sreg_tid_x(T.i32(), loc=loc, ip=ip))
    tid_y = Int32(nvvm.read_ptx_sreg_tid_y(T.i32(), loc=loc, ip=ip))
    tid_z = Int32(nvvm.read_ptx_sreg_tid_z(T.i32(), loc=loc, ip=ip))
    ntid_x = Int32(nvvm.read_ptx_sreg_ntid_x(T.i32(), loc=loc, ip=ip))
    ntid_y = Int32(nvvm.read_ptx_sreg_ntid_y(T.i32(), loc=loc, ip=ip))
    tid = tid_x + tid_y * ntid_x + tid_z * ntid_x * ntid_y
    # Use the module-level WARP_SIZE constant instead of a local magic number,
    # for consistency with the rest of this file.
    return tid // WARP_SIZE
69
+
70
+
71
@dsl_user_op
def thread_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]:
    """Return the (x, y, z) thread index of the calling thread within its CTA."""
    readers = (
        nvvm.read_ptx_sreg_tid_x,
        nvvm.read_ptx_sreg_tid_y,
        nvvm.read_ptx_sreg_tid_z,
    )
    x, y, z = (Int32(read(T.i32(), loc=loc, ip=ip)) for read in readers)
    return x, y, z
81
+
82
+
83
@dsl_user_op
def block_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]:
    """Return the number of threads in each (x, y, z) dimension of the CTA."""
    readers = (
        nvvm.read_ptx_sreg_ntid_x,
        nvvm.read_ptx_sreg_ntid_y,
        nvvm.read_ptx_sreg_ntid_z,
    )
    x, y, z = (Int32(read(T.i32(), loc=loc, ip=ip)) for read in readers)
    return x, y, z
93
+
94
+
95
@dsl_user_op
def block_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]:
    """Return the (x, y, z) CTA identifier within the grid."""
    readers = (
        nvvm.read_ptx_sreg_ctaid_x,
        nvvm.read_ptx_sreg_ctaid_y,
        nvvm.read_ptx_sreg_ctaid_z,
    )
    x, y, z = (Int32(read(T.i32(), loc=loc, ip=ip)) for read in readers)
    return x, y, z
105
+
106
+
107
@dsl_user_op
def grid_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]:
    """Return the number of CTAs in each (x, y, z) dimension of the grid."""
    readers = (
        nvvm.read_ptx_sreg_nctaid_x,
        nvvm.read_ptx_sreg_nctaid_y,
        nvvm.read_ptx_sreg_nctaid_z,
    )
    x, y, z = (Int32(read(T.i32(), loc=loc, ip=ip)) for read in readers)
    return x, y, z
117
+
118
+
119
@dsl_user_op
def cluster_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]:
    """Return the (x, y, z) cluster identifier within the grid."""
    readers = (
        nvvm.read_ptx_sreg_clusterid_x,
        nvvm.read_ptx_sreg_clusterid_y,
        nvvm.read_ptx_sreg_clusterid_z,
    )
    x, y, z = (Int32(read(T.i32(), loc=loc, ip=ip)) for read in readers)
    return x, y, z
129
+
130
+
131
@dsl_user_op
def cluster_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]:
    """Return the number of clusters in each (x, y, z) dimension of the grid."""
    readers = (
        nvvm.read_ptx_sreg_nclusterid_x,
        nvvm.read_ptx_sreg_nclusterid_y,
        nvvm.read_ptx_sreg_nclusterid_z,
    )
    x, y, z = (Int32(read(T.i32(), loc=loc, ip=ip)) for read in readers)
    return x, y, z
141
+
142
+
143
@dsl_user_op
def block_in_cluster_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]:
    """Return the (x, y, z) CTA index within its cluster."""
    readers = (
        nvvm.read_ptx_sreg_cluster_ctaid_x,
        nvvm.read_ptx_sreg_cluster_ctaid_y,
        nvvm.read_ptx_sreg_cluster_ctaid_z,
    )
    x, y, z = (Int32(read(T.i32(), loc=loc, ip=ip)) for read in readers)
    return x, y, z
153
+
154
+
155
@dsl_user_op
def block_in_cluster_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]:
    """Return the (x, y, z) dimensions of the cluster in CTAs."""
    readers = (
        nvvm.read_ptx_sreg_cluster_nctaid_x,
        nvvm.read_ptx_sreg_cluster_nctaid_y,
        nvvm.read_ptx_sreg_cluster_nctaid_z,
    )
    x, y, z = (Int32(read(T.i32(), loc=loc, ip=ip)) for read in readers)
    return x, y, z
165
+
166
+
167
@dsl_user_op
def block_idx_in_cluster(*, loc=None, ip=None) -> Int32:
    """Return the linearized identifier (rank) of the CTA within its cluster."""
    rank = nvvm.read_ptx_sreg_cluster_ctarank(T.i32(), loc=loc, ip=ip)
    return Int32(rank)
173
+
174
+
175
@dsl_user_op
def shuffle_sync_op(
    value: Numeric,
    offset: Int,
    mask: Int = FULL_MASK,
    mask_and_clamp: Int = WARP_SIZE - 1,
    kind: nvvm.ShflKind = nvvm.ShflKind.idx,
    *,
    loc=None,
    ip=None,
) -> Numeric:
    """
    Shuffles a value within the threads of a warp.

    :param value: The value to shuffle
    :type value: Numeric
    :param mask: A mask describing the threads participating in this operation
    :type mask: Int
    :param offset: A source lane or a source lane offset depending on kind
    :type offset: Int
    :param mask_and_clamp: An integer containing two packed values specifying a mask for logically
                           splitting warps into sub-segments and an upper bound for clamping the
                           source lane index.
    :type mask_and_clamp: Int
    :param kind: The kind of shuffle, can be idx, up, down, or bfly
    :type kind: ShflKind
    :return: The shuffled value
    :rtype: Numeric
    """
    if not isinstance(value, Numeric):
        value = as_numeric(value)
    if value.width > 64:
        raise ValueError("shuffle_sync only supports values up to 64 bits")

    orig_type = type(value)
    if value.width <= 32:
        # Sub-32-bit values are widened to a 32-bit type of the same kind so a
        # single shfl.sync handles them; the result is narrowed back to the
        # original type below. (Previously the <32 and ==32 paths duplicated
        # the identical shfl_sync call; they are merged here.)
        if value.width < 32:
            if value.dtype.is_float:
                value = value.to(Float32)
            elif value.signed:
                value = value.to(Int32)
            else:
                value = value.to(Uint32)
        return orig_type(
            nvvm.shfl_sync(
                type(value).mlir_type,
                Int32(mask).ir_value(loc=loc, ip=ip),
                value.ir_value(loc=loc, ip=ip),
                Int32(offset).ir_value(loc=loc, ip=ip),
                Int32(mask_and_clamp).ir_value(loc=loc, ip=ip),
                kind,
                loc=loc,
                ip=ip,
            )
        )

    if value.width != 64:
        raise ValueError(
            "shuffle_sync only supports 64 bits values when the bit width is larger than 32"
        )
    # 64-bit values are shuffled as two independent 32-bit halves.
    value = llvm.bitcast(
        T.i64(), value.to(ir.Value, loc=loc, ip=ip), loc=loc, ip=ip
    )
    # extract low 32 bits
    low_32_bits = llvm.trunc(
        T.i32(), value, llvm.IntegerOverflowFlags.none, loc=loc, ip=ip
    )
    # extract high 32 bits
    high_32_bits = llvm.lshr(
        value, Int64(32).ir_value(loc=loc, ip=ip), loc=loc, ip=ip
    )
    high_32_bits = llvm.trunc(
        T.i32(), high_32_bits, llvm.IntegerOverflowFlags.none, loc=loc, ip=ip
    )

    low_32_bits_shfl = nvvm.shfl_sync(
        T.i32(),
        Int32(mask).ir_value(loc=loc, ip=ip),
        low_32_bits,
        Int32(offset).ir_value(loc=loc, ip=ip),
        Int32(mask_and_clamp).ir_value(loc=loc, ip=ip),
        kind,
        loc=loc,
        ip=ip,
    )
    high_32_bits_shfl = nvvm.shfl_sync(
        T.i32(),
        Int32(mask).ir_value(loc=loc, ip=ip),
        high_32_bits,
        Int32(offset).ir_value(loc=loc, ip=ip),
        Int32(mask_and_clamp).ir_value(loc=loc, ip=ip),
        kind,
        loc=loc,
        ip=ip,
    )

    # combine low and high 32 bits
    low_64_bit = llvm.zext(T.i64(), low_32_bits_shfl, loc=loc, ip=ip)
    high_64_bit = llvm.zext(T.i64(), high_32_bits_shfl, loc=loc, ip=ip)
    shlf_res = llvm.shl(
        high_64_bit,
        Int64(32).ir_value(loc=loc, ip=ip),
        llvm.IntegerOverflowFlags.none,
        loc=loc,
        ip=ip,
    )
    shlf_res = llvm.or_(shlf_res, low_64_bit, loc=loc, ip=ip)
    shlf_res = llvm.bitcast(orig_type.mlir_type, shlf_res, loc=loc, ip=ip)
    return orig_type(shlf_res)
297
+
298
# Convenience wrappers fixing the shuffle kind (see nvvm.ShflKind).
shuffle_sync = partial(shuffle_sync_op, kind=nvvm.ShflKind.idx)
shuffle_sync_up = partial(shuffle_sync_op, kind=nvvm.ShflKind.up)
shuffle_sync_down = partial(shuffle_sync_op, kind=nvvm.ShflKind.down)
shuffle_sync_bfly = partial(shuffle_sync_op, kind=nvvm.ShflKind.bfly)
302
+
303
+
304
@dsl_user_op
def barrier(*, barrier_id=None, number_of_threads=None, loc=None, ip=None) -> None:
    """
    Creates a barrier, optionally named.

    :param barrier_id: An optional barrier identifier (named barrier)
    :param number_of_threads: An optional number of participating threads
    """

    def _as_i32_value(v):
        # Optional Python/Numeric operand -> IR value (None stays None).
        return None if v is None else Int32(v).ir_value(loc=loc, ip=ip)

    nvvm.barrier(
        barrier_id=_as_i32_value(barrier_id),
        number_of_threads=_as_i32_value(number_of_threads),
        loc=loc,
        ip=ip,
    )
318
+
319
+
320
@dsl_user_op
def barrier_arrive(
    *, barrier_id=None, number_of_threads=None, loc=None, ip=None
) -> None:
    """
    Arrives on a (optionally named) CTA barrier without waiting.

    :param barrier_id: An optional barrier identifier (named barrier)
    :param number_of_threads: The number of threads participating in the barrier; required
    :raises ValueError: If ``number_of_threads`` is not provided
    """
    # Validate up front so the error is raised before any IR is emitted.
    if number_of_threads is None:
        raise ValueError(
            "barrier_arrive needs pass number_of_threads to arrive the barrier",
        )

    if barrier_id is not None:
        barrier_id = Int32(barrier_id).ir_value(loc=loc, ip=ip)
    number_of_threads = Int32(number_of_threads).ir_value(loc=loc, ip=ip)

    nvvm.barrier_arrive(
        barrier_id=barrier_id, number_of_threads=number_of_threads, loc=loc, ip=ip
    )
336
+
337
+
338
@dsl_user_op
def sync_threads(*, loc=None, ip=None) -> None:
    """
    Synchronizes all threads within a CTA.
    """
    # nvvm.barrier with no id/thread-count arguments: whole-CTA barrier.
    nvvm.barrier(loc=loc, ip=ip)
344
+
345
+
346
@dsl_user_op
def sync_warp(mask: Int = FULL_MASK, *, loc=None, ip=None) -> None:
    """
    Performs a warp-wide sync with an optional mask.

    :param mask: Bitmask of participating lanes; defaults to the full warp.
    :type mask: Int
    """
    nvvm.bar_warp_sync(Int32(mask).ir_value(loc=loc, ip=ip), loc=loc, ip=ip)
352
+
353
+
354
@dsl_user_op
def fence_acq_rel_cta(*, loc=None, ip=None) -> None:
    """
    Memory fence with acquire-release semantics at CTA scope.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`__.
    """
    nvvm.fence_acq_rel_cta(loc=loc, ip=ip)
362
+
363
+
364
@dsl_user_op
def fence_acq_rel_cluster(*, loc=None, ip=None) -> None:
    """
    Memory fence with acquire-release semantics at cluster scope.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`__.
    """
    nvvm.fence_acq_rel_cluster(loc=loc, ip=ip)
372
+
373
+
374
@dsl_user_op
def fence_acq_rel_gpu(*, loc=None, ip=None) -> None:
    """
    Memory fence with acquire-release semantics at GPU (device) scope.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`__.
    """
    nvvm.fence_acq_rel_gpu(loc=loc, ip=ip)
382
+
383
+
384
@dsl_user_op
def fence_acq_rel_sys(*, loc=None, ip=None) -> None:
    """
    Memory fence with acquire-release semantics at system scope.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`__.
    """
    nvvm.fence_acq_rel_sys(loc=loc, ip=ip)
392
+
393
+
394
@dsl_user_op
def cp_async_commit_group(*, loc=None, ip=None) -> None:
    """
    Commits all prior initiated but uncommitted cp.async instructions into one group.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-commit-group>`__.
    """
    nvvm.cp_async_commit_group(loc=loc, ip=ip)
402
+
403
+
404
@dsl_user_op
def cp_async_wait_group(n, *, loc=None, ip=None) -> None:
    """
    Waits till only a specified numbers of cp.async groups are pending.

    :param n: The maximum number of most recently committed groups allowed to
              remain pending.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all>`__.
    """
    nvvm.cp_async_wait_group(n, loc=loc, ip=ip)
412
+
413
+
414
@dsl_user_op
def cp_async_bulk_commit_group(*, loc=None, ip=None) -> None:
    """
    Commits all prior initiated but uncommitted cp.async.bulk instructions into one group.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-commit-group>`__.
    """
    nvvm.cp_async_bulk_commit_group(loc=loc, ip=ip)
422
+
423
+
424
@dsl_user_op
def cp_async_bulk_wait_group(group, *, read=None, loc=None, ip=None) -> None:
    """
    Waits till only a specified numbers of cp.async.bulk groups are pending.

    :param group: The maximum number of most recently committed bulk groups
                  allowed to remain pending.
    :param read: Optional flag forwarded to the nvvm op (``.read`` variant).

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-wait-group>`__.
    """
    nvvm.cp_async_bulk_wait_group(group, read=read, loc=loc, ip=ip)
432
+
433
+
434
@dsl_user_op
def cluster_wait(*, loc=None, ip=None) -> None:
    """
    A cluster-wide wait operation (waits for the corresponding cluster arrive).
    """
    nvvm.cluster_wait(loc=loc, ip=ip)
440
+
441
+
442
@dsl_user_op
def cluster_arrive(*, aligned=None, loc=None, ip=None) -> None:
    """
    A cluster-wide arrive operation.

    :param aligned: Optional flag forwarded to the nvvm op (``.aligned`` variant).
    """
    nvvm.cluster_arrive(aligned=aligned, loc=loc, ip=ip)
448
+
449
+
450
@dsl_user_op
def cluster_arrive_relaxed(*, aligned=None, loc=None, ip=None) -> None:
    """
    A cluster-wide arrive operation with relaxed semantics.

    :param aligned: Optional flag forwarded to the nvvm op (``.aligned`` variant).
    """
    nvvm.cluster_arrive_relaxed(aligned=aligned, loc=loc, ip=ip)
456
+
457
+
458
@dsl_user_op
def fence_proxy(
    kind: ProxyKind,
    *,
    space: Optional[SharedSpace] = None,
    use_intrinsic=None,
    loc=None,
    ip=None,
) -> None:
    """
    Emits a proxy memory fence of the given kind.

    :param kind: The proxy kind to fence
    :type kind: ProxyKind
    :param space: An optional shared-memory space the fence applies to
    :type space: Optional[SharedSpace]
    :param use_intrinsic: Optional flag forwarded to the nvvm op
    """
    nvvm.fence_proxy(
        kind=kind, space=space, use_intrinsic=use_intrinsic, loc=loc, ip=ip
    )
470
+
471
+
472
@dsl_user_op
def vote_ballot_sync(
    pred: Boolean, mask: Int = FULL_MASK, *, loc=None, ip=None
) -> Int32:
    """
    Performs a ballot operation across the warp.

    :param pred: The per-thread predicate contributed to the ballot
    :param mask: Bitmask of participating lanes; defaults to the full warp
    :return: The 32-bit ballot result
    """
    mask_val = Int32(mask).ir_value(loc=loc, ip=ip)
    pred_val = Boolean(pred).ir_value(loc=loc, ip=ip)
    ballot = nvvm.vote_ballot_sync(T.i32(), mask_val, pred_val, loc=loc, ip=ip)
    return Int32(ballot)
488
+
489
+
490
@dsl_user_op
def popc(value: Numeric, *, loc=None, ip=None) -> Numeric:
    """
    Performs a population count operation (counts set bits).

    :param value: The value whose set bits are counted
    :return: The popcount, wrapped in the same numeric type as the input
    """
    if not isinstance(value, Numeric):
        value = as_numeric(value)
    # Propagate loc/ip into ir_value as well, matching every other wrapper in
    # this file so source locations stay consistent.
    return type(value)(
        llvm.intr_ctpop(value.ir_value(loc=loc, ip=ip), loc=loc, ip=ip)
    )
498
+
499
+
500
@dsl_user_op
def fence_view_async_tmem_op(
    kind: Tcgen05WaitKind,
    *,
    loc=None,
    ip=None,
) -> None:
    """
    Perform a fence operation on the async TMEM load or store.

    .. note::
        This function is only available on sm_100a and above.
        The fence is required to synchronize the TMEM load/store
        and let the pipeline release or commit the buffer.

    Take a mma2acc pipeline as an example of LOAD fence, the ACC tensor is from TMEM.
    ```
    # Start to copy ACC from TMEM to register
    cute.copy(tmem_load, tACC, rACC)
    fence_view_async_tmem_load()
    # After fence, we can ensure the TMEM buffer is consumed totally.
    # Release the buffer to let the MMA know it can overwrite the buffer.
    mma2accum_pipeline.consumer_release(curr_consumer_state)
    ```
    Take a TS GEMM kernel as an example of STORE fence, the A tensor is from TMEM.
    ```
    # Start to copy A from register to TMEM
    cute.copy(tmem_store, rA, tA)
    fence_view_async_tmem_store()
    # After fence, we can ensure the TMEM buffer is ready.
    # Commit the buffer to let the MMA know it can start to load A.
    tmem_mma_pipeline.producer_commit(curr_producer_state)
    ```


    :param kind: The kind of fence operation to perform including LOAD and STORE.
    :type kind: Tcgen05WaitKind
    """
    # Delegates to the nvvm tcgen05.wait op with the requested kind.
    nvvm.tcgen05_wait(kind, loc=loc, ip=ip)
539
+
540
+
541
+ fence_view_async_tmem_load = partial(
542
+ fence_view_async_tmem_op, kind=Tcgen05WaitKind.LOAD
543
+ )
544
+ fence_view_async_tmem_store = partial(
545
+ fence_view_async_tmem_op, kind=Tcgen05WaitKind.STORE
546
+ )
547
+
548
+
549
@dsl_user_op
def warpgroup_reg_realloc_op(
    reg_count: int,
    kind: SetMaxRegisterAction,
    *,
    loc=None,
    ip=None,
) -> None:
    """
    Adjusts the maximum per-thread register count (``setmaxregister``).

    :param reg_count: The target register count per thread
    :type reg_count: int
    :param kind: Whether to increase or decrease the allocation
    :type kind: SetMaxRegisterAction
    """
    nvvm.setmaxregister(reg_count, kind, loc=loc, ip=ip)
558
+
559
+
560
+ warpgroup_reg_alloc = partial(
561
+ warpgroup_reg_realloc_op, kind=SetMaxRegisterAction.increase
562
+ )
563
+ warpgroup_reg_dealloc = partial(
564
+ warpgroup_reg_realloc_op, kind=SetMaxRegisterAction.decrease
565
+ )
566
+
567
+
568
@dsl_user_op
def calc_packed_f32x2_op(
    src_a: Tuple[Float32, Float32],
    src_b: Tuple[Float32, Float32],
    src_c: Tuple[Float32, Float32] | None,
    calc_func: Callable,
    *,
    rnd=RoundingModeKind.RZ,
    ftz=True,
    loc=None,
    ip=None,
) -> Tuple[Float32, Float32]:
    """
    Applies a packed f32x2 nvvm op to pairs of f32 operands.

    :param src_a: First pair of f32 operands
    :param src_b: Second pair of f32 operands
    :param src_c: Optional third pair of f32 operands (e.g. for FMA)
    :param calc_func: The nvvm packed-f32x2 op to invoke
    :param rnd: The rounding mode
    :param ftz: Whether to flush denormals to zero
    :return: The two result lanes as a pair of Float32
    """
    vec_type = ir.VectorType.get([2], Float32.mlir_type, loc=loc)

    def _to_vec(pair):
        # Pack a (lo, hi) pair of scalars into a 2-element f32 vector.
        return vector.from_elements(
            vec_type, tuple(as_numeric(e).ir_value() for e in pair), loc=loc, ip=ip
        )

    operands = [_to_vec(src_a), _to_vec(src_b)]
    if src_c is not None:
        operands.append(_to_vec(src_c))
    vec_res = calc_func(vec_type, *operands, rnd=rnd, ftz=ftz, loc=loc, ip=ip)

    def _lane(i):
        # Extract a single result lane back to a scalar Float32.
        return Float32(
            vector.extract(
                vec_res, dynamic_position=[], static_position=[i], loc=loc, ip=ip
            )
        )

    return _lane(0), _lane(1)
610
+
611
+
612
# Convenience wrappers binding the packed-f32x2 nvvm op; mul/add take no src_c.
fma_packed_f32x2 = partial(calc_packed_f32x2_op, calc_func=nvvm.fma_packed_f32x2)
mul_packed_f32x2 = partial(
    calc_packed_f32x2_op, src_c=None, calc_func=nvvm.mul_packed_f32x2
)
add_packed_f32x2 = partial(
    calc_packed_f32x2_op, src_c=None, calc_func=nvvm.add_packed_f32x2
)
619
+
620
+
621
@dsl_user_op
def fmax(
    a: Union[float, Float32], b: Union[float, Float32], *, loc=None, ip=None
) -> Float32:
    """Return the maximum of ``a`` and ``b`` as a single-precision float."""
    lhs = Float32(a).ir_value(loc=loc, ip=ip)
    rhs = Float32(b).ir_value(loc=loc, ip=ip)
    return Float32(nvvm.fmax(T.f32(), lhs, rhs, loc=loc, ip=ip))
634
+
635
+
636
@dsl_user_op
def rcp_approx(a: Union[float, Float32], *, loc=None, ip=None):
    """Approximate single-precision reciprocal (``rcp.approx.ftz.f``)."""
    operand = Float32(a).ir_value(loc=loc, ip=ip)
    return Float32(nvvm.rcp_approx_ftz_f(T.f32(), operand, loc=loc, ip=ip))
643
+
644
+
645
@dsl_user_op
@deprecated(
    "cute.arch.exp2 is deprecated, use cute.math.exp2 with `fastmath=True` instead"
)
def exp2(a: Union[float, Float32], *, loc=None, ip=None) -> Float32:
    """Compute 2**a in f32 via the ``ex2.approx.ftz.f32`` PTX instruction."""
    return Float32(
        llvm.inline_asm(
            T.f32(),
            [Float32(a).ir_value(loc=loc, ip=ip)],
            "ex2.approx.ftz.f32 $0, $1;",
            "=f,f",
            has_side_effects=True,
            is_align_stack=False,
            asm_dialect=llvm.AsmDialect.AD_ATT,
        )
    )
661
+
662
+
663
@dsl_user_op
@deprecated(
    "cute.arch.exp is deprecated, use cute.math.exp with `fastmath=True` instead"
)
def exp(a: Union[float, Float32], *, loc=None, ip=None) -> Float32:
    """Compute e**a by rewriting it as 2**(a * log2(e))."""
    # log2(e): exp(a) == exp2(a * log2(e))
    LOG2_E = 1.4426950408889634
    return exp2(a * LOG2_E, loc=loc, ip=ip)
670
+
671
+
672
@dsl_user_op
@deprecated(
    "cute.arch.exp_packed_f32x2 is deprecated, use cute.arch.mul_packed_f32x2 and cute.math.exp2 with `fastmath=True` instead"
)
def exp_packed_f32x2(
    a: Tuple[Float32, Float32], *, loc=None, ip=None
) -> Tuple[Float32, Float32]:
    """Elementwise e**x of a pair of f32 values."""
    # Scale both lanes by log2(e) with one packed multiply, then exp2 per lane.
    LOG2_E = Float32(1.4426950408889634)
    b = mul_packed_f32x2(a, (LOG2_E, LOG2_E), loc=loc, ip=ip)
    return exp2(b[0], loc=loc, ip=ip), exp2(b[1], loc=loc, ip=ip)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/smem.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from typing import Optional, Type
13
+
14
+ from cutlass.cutlass_dsl import T, dsl_user_op
15
+
16
+ import cutlass._mlir.dialects.cute as _cute_ir
17
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
18
+ from cutlass._mlir import ir
19
+
20
+ from ..typing import Pointer, Numeric, NumericMeta
21
+
22
+
23
@dsl_user_op
def alloc_smem(
    element_type: Type[Numeric],
    size_in_elems: int,
    alignment: Optional[int] = None,
    *,
    loc=None,
    ip=None,
) -> Pointer:
    """
    Statically allocates SMEM.

    :param element_type: The pointee type of the pointer.
    :type element_type: Type[Numeric]
    :param size_in_elems: The size of the allocation in terms of number of elements of the
                          pointee type
    :type size_in_elems: int
    :param alignment: An optional pointer alignment for the allocation
    :type alignment: int
    :return: A pointer to the start of the allocation
    :rtype: Pointer
    :raises TypeError: If ``element_type`` is not a Numeric type
    """
    if not isinstance(element_type, NumericMeta):
        raise TypeError(
            f"element_type must be a type of Numeric, but got {element_type}"
        )

    if alignment is None:
        # Default alignment based on the element type's width; clamp to at
        # least 1 byte so sub-byte element types do not yield alignment 0.
        alignment = max(1, element_type.width // 8)
    ptr_ty = _cute_ir.PtrType.get(
        element_type.mlir_type, _cute_ir.AddressSpace.smem, alignment
    )
    return _cute_nvgpu_ir.arch_alloc_smem(
        ptr=ptr_ty,
        input=ir.IntegerAttr.get(T.i32(), size_in_elems),
        loc=loc,
        ip=ip,
    )
62
+
63
+
64
@dsl_user_op
def get_dyn_smem(
    element_type: Type[Numeric],
    alignment: Optional[int] = None,
    *,
    loc=None,
    ip=None,
) -> Pointer:
    """
    Retrieves a pointer to a dynamic SMEM allocation.

    :param element_type: The pointee type of the pointer.
    :type element_type: Type[Numeric]
    :param alignment: An optional pointer alignment, the result pointer is offset appropriately
    :type alignment: int
    :return: A pointer to the start of the dynamic SMEM allocation with a correct
             alignement
    :rtype: Pointer
    :raises TypeError: If ``element_type`` is not a Numeric type
    """
    if not isinstance(element_type, NumericMeta):
        raise TypeError(
            f"element_type must be a type of Numeric, but got {element_type}"
        )

    if alignment is None:
        # Default alignment based on the element type's width; clamp to at
        # least 1 byte so sub-byte element types do not yield alignment 0.
        alignment = max(1, element_type.width // 8)
    ptr_ty = _cute_ir.PtrType.get(
        element_type.mlir_type,
        _cute_ir.AddressSpace.smem,
        alignment,
    )
    return _cute_nvgpu_ir.arch_get_dyn_smem(ptr=ptr_ty, loc=loc, ip=ip)
97
+
98
+
99
@dsl_user_op
def get_dyn_smem_size(*, loc=None, ip=None) -> int:
    """
    Gets the size in bytes of the dynamic shared memory that was specified at kernel launch time.
    This can be used for bounds checking during shared memory allocation.

    :return: The size of dynamic shared memory in bytes
    :rtype: int
    """
    return _cute_nvgpu_ir.arch_get_dyn_smem_size(loc=loc, ip=ip)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/arch/tmem.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from typing import Type
13
+
14
+ from cutlass.cutlass_dsl import dsl_user_op
15
+
16
+ import cutlass._mlir.dialects.cute as _cute_ir
17
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
18
+
19
+ from ..typing import Pointer, Int, Int32, Numeric, NumericMeta
20
+
21
+
22
# Maximum number of TMEM columns that may be allocated on SM100.
SM100_TMEM_CAPACITY_COLUMNS = 512
# Smallest legal TMEM allocation, in columns (allocations must be a power of two).
SM100_TMEM_MIN_ALLOC_COLUMNS = 32
24
+
25
+
26
@dsl_user_op
def retrieve_tmem_ptr(
    element_type: Type[Numeric],
    alignment: int,
    ptr_to_buffer_holding_addr: Pointer,
    *,
    loc=None,
    ip=None,
) -> Pointer:
    """
    Retrieves a pointer to TMEM with the provided element type and alignment.

    :param element_type: The pointee type of the pointer.
    :type element_type: Type[Numeric]
    :param alignment: The alignment of the result pointer
    :type alignment: int
    :param ptr_to_buffer_holding_addr: A pointer to a SMEM buffer holding the TMEM address of
                                       the start of the allocation
    :type ptr_to_buffer_holding_addr: Pointer
    :return: A pointer to TMEM
    :rtype: Pointer
    :raises TypeError: If ``element_type`` is not a Numeric type
    """
    if not isinstance(element_type, NumericMeta):
        raise TypeError(
            f"element_type must be a type of Numeric, but got {element_type}"
        )

    tmem_ptr_type = _cute_ir.PtrType.get(
        element_type.mlir_type, _cute_ir.AddressSpace.tmem, alignment
    )
    return _cute_nvgpu_ir.arch_sm100_retrieve_tmem_ptr(
        tmem_ptr_type, ptr_to_buffer_holding_addr.value, loc=loc, ip=ip
    )
59
+
60
+
61
@dsl_user_op
def alloc_tmem(
    num_columns: Int,
    smem_ptr_to_write_address: Pointer,
    is_two_cta=None,
    *,
    loc=None,
    ip=None,
) -> None:
    """
    Allocates TMEM.

    :param num_columns: The number of TMEM columns to allocate
    :type num_columns: Int
    :param smem_ptr_to_write_address: A pointer to a SMEM buffer where the TMEM address is
                                      written to
    :type smem_ptr_to_write_address: Pointer
    :param is_two_cta: Optional boolean parameter for 2-CTA MMAs
    :raises ValueError: If a statically-known ``num_columns`` is out of range or not a
                        power of two
    """
    if isinstance(num_columns, int):
        # Only statically-known column counts can be validated here. Bounds are
        # derived from the module constants so the message stays accurate.
        out_of_range = (
            num_columns < SM100_TMEM_MIN_ALLOC_COLUMNS
            or num_columns > SM100_TMEM_CAPACITY_COLUMNS
        )
        not_pow2 = num_columns & (num_columns - 1) != 0
        if out_of_range or not_pow2:
            raise ValueError(
                f"num_columns must be between {SM100_TMEM_MIN_ALLOC_COLUMNS} and "
                f"{SM100_TMEM_CAPACITY_COLUMNS}, and must be pow of 2, but got {num_columns}"
            )
    _cute_nvgpu_ir.arch_sm100_alloc_tmem(
        Int32(num_columns).ir_value(loc=loc, ip=ip),
        smem_ptr_to_write_address.value,
        is_two_cta=is_two_cta,
        loc=loc,
        ip=ip,
    )
96
+
97
+
98
@dsl_user_op
def relinquish_tmem_alloc_permit(is_two_cta=None, *, loc=None, ip=None) -> None:
    """
    Relinquishes the right to allocate TMEM so that other CTAs potentially in a different grid can
    allocate.

    :param is_two_cta: Optional boolean parameter for 2-CTA MMAs
    """
    _cute_nvgpu_ir.arch_sm100_relinquish_tmem_alloc_permit(
        is_two_cta=is_two_cta, loc=loc, ip=ip
    )
107
+
108
+
109
@dsl_user_op
def dealloc_tmem(
    tmem_ptr: Pointer,
    num_columns: Int,
    is_two_cta=None,
    *,
    loc=None,
    ip=None,
) -> None:
    """
    Deallocates TMEM using the provided pointer and number of columns.

    :param tmem_ptr: A pointer to the TMEM allocation to de-allocate
    :type tmem_ptr: Pointer
    :param num_columns: The number of columns in the TMEM allocation
    :type num_columns: Int
    :param is_two_cta: Optional boolean parameter for 2-CTA MMAs
    :raises ValueError: If a statically-known ``num_columns`` is out of range or not a
                        power of two
    """
    if isinstance(num_columns, int):
        # Only statically-known column counts can be validated here. Bounds are
        # derived from the module constants so the message stays accurate.
        out_of_range = (
            num_columns < SM100_TMEM_MIN_ALLOC_COLUMNS
            or num_columns > SM100_TMEM_CAPACITY_COLUMNS
        )
        not_pow2 = num_columns & (num_columns - 1) != 0
        if out_of_range or not_pow2:
            raise ValueError(
                f"num_columns must be between {SM100_TMEM_MIN_ALLOC_COLUMNS} and "
                f"{SM100_TMEM_CAPACITY_COLUMNS}, and must be pow of 2, but got {num_columns}"
            )
    _cute_nvgpu_ir.arch_sm100_dealloc_tmem(
        tmem_ptr.value,
        Int32(num_columns).ir_value(loc=loc, ip=ip),
        is_two_cta=is_two_cta,
        loc=loc,
        ip=ip,
    )
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/core.py ADDED
The diff for this file is too large to render. See raw diff
 
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/math.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from .core import TensorSSA
13
+ from .typing import Numeric
14
+ from cutlass._mlir.dialects import math, arith
15
+
16
+ from typing import Callable, Union
17
+
18
+
19
def _math_op(func: Callable, fastmath: bool, *args, **kwargs):
    """Apply an MLIR math-dialect builder to TensorSSA or Float-Numeric operands.

    All positional operands must share one type: either every operand is a
    TensorSSA, or every operand is a floating-point Numeric scalar.

    :param func: The math-dialect builder to invoke
    :param fastmath: Whether to emit the op with fast-math flags
    :param args: The input tensors or scalars
    :param kwargs: Accepted for signature compatibility; currently unused
    """
    lead_type = type(args[0])
    for operand in args:
        is_tensor = isinstance(operand, TensorSSA)
        is_float_scalar = isinstance(operand, Numeric) and type(operand).is_float
        if not (is_tensor or is_float_scalar):
            raise TypeError(
                f"Expected a TensorSSA or Numeric(Float), but got {type(operand)}"
            )
        if not isinstance(operand, lead_type):
            raise TypeError(
                f"Expected all inputs to be of type {lead_type}, but got {type(operand)}"
            )

    flags = arith.FastMathFlags.fast if fastmath else arith.FastMathFlags.none
    if isinstance(args[0], TensorSSA):
        # Tensor path: wrap the raw IR result back into a TensorSSA with the
        # shape/dtype of the first operand.
        raw = func(*args, fastmath=flags)
        return TensorSSA(raw, args[0].shape, args[0].dtype)
    # Scalar path: lower each Numeric to its IR value first.
    ir_operands = [operand.ir_value() for operand in args]
    return func(*ir_operands, fastmath=flags)
47
+
48
+
49
def acos(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise arc cosine.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The arc cosine of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.acos, fastmath, a)
70
+
71
+
72
def asin(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise arc sine.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The arc sine of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.asin, fastmath, a)
93
+
94
+
95
def atan(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Compute element-wise arc tangent of the input tensor.

    .. note::
        Not currently supported: calling this function always raises
        ``NotImplementedError``.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :raises NotImplementedError: always
    """
    # The unconditional raise made the original trailing
    # `return _math_op(math.atan, fastmath, a)` unreachable; the dead code
    # has been removed while preserving behavior.
    raise NotImplementedError("atan is not implemented")
117
+
118
+
119
def atan2(
    a: Union[TensorSSA, Numeric], b: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise two-argument arc tangent.

    ``atan2(a, b)`` is the angle in radians between the positive x-axis and the
    point with coordinates ``(b, a)``.

    :param a: First input (y-coordinates)
    :type a: Union[TensorSSA, Numeric]
    :param b: Second input (x-coordinates)
    :type b: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The element-wise arc tangent of ``a / b``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.atan2, fastmath, a, b)
145
+
146
+
147
def cos(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise cosine.

    :param a: Input tensor or floating-point scalar, in radians
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The cosine of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.cos, fastmath, a)
168
+
169
+
170
def erf(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise Gauss error function.

    Defined as ``erf(x) = 2/sqrt(pi) * integral from 0 to x of exp(-t^2) dt``.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The error function of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.erf, fastmath, a)
194
+
195
+
196
def exp(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise natural exponential.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: ``e`` raised to every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.exp, fastmath, a)
217
+
218
+
219
def exp2(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise base-2 exponential.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: ``2`` raised to every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.exp2, fastmath, a)
240
+
241
+
242
def log(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise natural logarithm.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The natural logarithm of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.log, fastmath, a)
263
+
264
+
265
def log2(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise base-2 logarithm.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The base-2 logarithm of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.log2, fastmath, a)
286
+
287
+
288
def log10(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise base-10 logarithm.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The base-10 logarithm of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.log10, fastmath, a)
309
+
310
+
311
def rsqrt(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise reciprocal square root (``1/sqrt(x)``).

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The reciprocal square root of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.rsqrt, fastmath, a)
334
+
335
+
336
def sin(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise sine.

    :param a: Input tensor or floating-point scalar, in radians
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The sine of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.sin, fastmath, a)
357
+
358
+
359
def sqrt(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise square root.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The square root of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.sqrt, fastmath, a)
380
+
381
+
382
def tan(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise tangent.

    :param a: Input tensor or floating-point scalar, in radians
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The tangent of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.tan, fastmath, a)
403
+
404
+
405
def tanh(
    a: Union[TensorSSA, Numeric], fastmath: bool = False
) -> Union[TensorSSA, Numeric]:
    """Element-wise hyperbolic tangent.

    :param a: Input tensor or floating-point scalar
    :type a: Union[TensorSSA, Numeric]
    :param fastmath: Emit the op with fast-math flags, defaults to False
    :type fastmath: bool, optional
    :return: The hyperbolic tangent of every element of ``a``
    :rtype: Union[TensorSSA, Numeric]
    """
    return _math_op(math.tanh, fastmath, a)
426
+
427
+
428
+ __all__ = [
429
+ "acos",
430
+ "asin",
431
+ "atan",
432
+ "atan2",
433
+ "cos",
434
+ "erf",
435
+ "exp",
436
+ "exp2",
437
+ "log",
438
+ "log10",
439
+ "log2",
440
+ "rsqrt",
441
+ "sin",
442
+ "sqrt",
443
+ "tan",
444
+ "tanh",
445
+ ]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from . import warp
13
+ from . import cpasync
14
+ from . import warpgroup
15
+ from . import tcgen05
16
+
17
+ from .common import *
18
+ from .helpers import *
19
+
20
+
21
+ # __all__ is required here for documentation generation
22
+ __all__ = [
23
+ "OpError",
24
+ "MmaUniversalOp",
25
+ "CopyUniversalOp",
26
+ ]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/common.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+ import enum
12
+ from dataclasses import dataclass
13
+ from typing import Type, Optional
14
+
15
+ from cutlass.cutlass_dsl import DSLBaseError
16
+
17
+ import cutlass._mlir.dialects.cute as _cute_ir
18
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
19
+ from cutlass._mlir import ir
20
+
21
+ from .. import core
22
+ from ..typing import Float16, Float32, Float64, Numeric
23
+
24
+
25
class OpError(DSLBaseError):
    """
    An exception class for Op construction errors.
    """

    def __init__(
        self, op: core.Op, message: str, suggestion: Optional[str] = None
    ) -> None:
        """
        :param op: The Op whose construction failed; its class name becomes the error code
        :param message: Human-readable description of the failure
        :param suggestion: Optional remediation hint; a generic one is used when omitted
        """
        if suggestion is None:
            # Default suggestion
            suggestion = "Check your Op construction code"
        super().__init__(
            message,
            error_code=f"{op.__class__.__name__} error",
            suggestion=suggestion,
        )
41
+
42
+
43
+ ####################################################################################################
44
+ #
45
+ # MMA Ops and Traits
46
+ #
47
+ ####################################################################################################
48
+
49
+
50
@dataclass(frozen=True)
class MmaUniversalOp(core.MmaOp):
    """
    The universal MMA Operation.

    This Operation currently expects the A/B operands as well as the accumulator to share the same
    data types.

    :param abacc_dtype: The data type for the A/B operands and the accumulator
    :type abacc_dtype: Type[Numeric]
    """

    abacc_dtype: Type[Numeric]

    def __post_init__(self) -> None:
        # Validate the dtype eagerly so a misconfigured Op fails at construction time.
        if self.abacc_dtype not in [Float16, Float32, Float64]:
            raise OpError(
                self,
                f"expects the 'abacc_dtype' Op parameter to be one of Float16, Float32, or Float64",
            )

    def __str__(self) -> str:
        return (
            "universal MMA Operation using FMA"
            f"\n A/B/Accumulator data type = {self.abacc_dtype}"
        )

    def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaUniversalTrait":
        # Fixed 1x1x1 MMA shape: one FMA per Atom execution; A, B, and the
        # accumulator all use abacc_dtype.
        shape_mnk_attr = ir.Attribute.parse(f'#cute.shape<"(1,1,1)">')
        atom_ty = _cute_nvgpu_ir.UniversalFmaAtomType.get(
            shape_mnk_attr,
            self.abacc_dtype.mlir_type,
            self.abacc_dtype.mlir_type,
            self.abacc_dtype.mlir_type,
        )
        return MmaUniversalTrait(_cute_ir.atom(atom_ty, loc=loc, ip=ip))

    def _verify_fragment_A(self, input, *, loc=None, ip=None):
        # No operand-specific constraints for the universal FMA Atom.
        pass

    def _verify_fragment_B(self, input, *, loc=None, ip=None):
        # No operand-specific constraints for the universal FMA Atom.
        pass


class MmaUniversalTrait(core.Trait):
    """Trait wrapping the IR Atom created by :class:`MmaUniversalOp`."""

    pass
95
+
96
+
97
+ ####################################################################################################
98
+ #
99
+ # Copy Ops and Traits
100
+ #
101
+ ####################################################################################################
102
+
103
+
104
class MemoryOrder(enum.Enum):
    """
    Memory-ordering semantics for copy operations.

    Each member wraps the corresponding IR-level ``MemOrderKind`` value.
    """

    WEAK = _cute_ir.MemOrderKind.WEAK
    RELAXED = _cute_ir.MemOrderKind.RELAXED
    ACQUIRE = _cute_ir.MemOrderKind.ACQUIRE
    RELEASE = _cute_ir.MemOrderKind.RELEASE
    ACQ_REL = _cute_ir.MemOrderKind.ACQ_REL
    SC = _cute_ir.MemOrderKind.SC
    MMIO = _cute_ir.MemOrderKind.MMIO
    CONSTANT = _cute_ir.MemOrderKind.CONSTANT
    VOLATILE = _cute_ir.MemOrderKind.VOLATILE

    def __str__(self) -> str:
        return f"{self.__class__.__name__}.{self.name}"

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}.{self.name}>"

    def _to_ir(self) -> _cute_ir.MemOrderKind:
        # The member value already is the IR enum.
        return self.value
123
+
124
+
125
class MemoryScope(enum.Enum):
    """
    Memory-scope levels for copy operations.

    Each member wraps the corresponding IR-level ``MemScopeKind`` value.
    """

    CTA = _cute_ir.MemScopeKind.CTA
    CLUSTER = _cute_ir.MemScopeKind.CLUSTER
    GPU = _cute_ir.MemScopeKind.GPU
    SYS = _cute_ir.MemScopeKind.SYS

    def __str__(self) -> str:
        return f"{self.__class__.__name__}.{self.name}"

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}.{self.name}>"

    def _to_ir(self) -> _cute_ir.MemScopeKind:
        # The member value already is the IR enum.
        return self.value
139
+
140
@dataclass(frozen=True)
class CopyUniversalOp(core.CopyOp):
    """
    The universal Copy Operation.

    When creating a Copy Atom out of this operation, the expected usage pattern is

    .. code-block:: python

        op = cute.nvgpu.CopyUniversalOp()
        atom = cute.make_copy_atom(op, tensor_dtype, num_bits_per_copy=64)

    - ``tensor_dtype`` is the data type used to build the reference TV Layout (either the source \
      or the destination TV Layout) in unit of tensor elements and is used for partitioning by \
      ``TiledCopy`` for example
    - ``num_bits_per_copy`` is a kw argument specifying the number of bits to copy per Atom \
      execution. This can be larger than the width of the above data type. When not provided, \
      the compiler will do a best effort at auto-vectorizing.
    """

    def __str__(self) -> str:
        return "universal Copy Operation"

    def _make_trait(
        self,
        copy_internal_type: Type[Numeric],
        *,
        loc=None,
        ip=None,
        **kwargs,
    ) -> "CopyUniversalTrait":
        """Build the IR trait for this Op.

        Recognized kw arguments: ``num_bits_per_copy`` (non-negative int; 0
        leaves vectorization to the compiler), ``memory_order``, and
        ``memory_scope``.

        :raises ValueError: If ``num_bits_per_copy`` is not a non-negative int
        """
        num_bits_per_copy = kwargs.get("num_bits_per_copy", 0)
        memory_order = kwargs.get("memory_order", MemoryOrder.WEAK)
        memory_scope = kwargs.get("memory_scope", MemoryScope.CTA)
        if not isinstance(num_bits_per_copy, int) or (num_bits_per_copy < 0):
            raise ValueError(
                "expects a 'num_bits_per_copy' kw argument of type int that is non-negative "
                f"when creating a copy Atom for {self.__class__.__name__}"
            )
        ty = _cute_nvgpu_ir.CopyAtomSIMTSyncCopyType.get(
            copy_internal_type.mlir_type,
            num_bits_per_copy,
            memory_order._to_ir(),
            memory_scope._to_ir(),
        )
        return CopyUniversalTrait(_cute_ir.atom(ty, loc=loc, ip=ip))


class CopyUniversalTrait(core.Trait):
    """Trait wrapping the IR Atom created by :class:`CopyUniversalOp`."""

    pass
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from .copy import *
13
+ from .helpers import *
14
+
15
+
16
+ # __all__ is required here for documentation generation
17
+ __all__ = [
18
+ #
19
+ # copy.py
20
+ #
21
+ "LoadCacheMode",
22
+ "CopyG2SOp",
23
+ "CopyBulkTensorTileG2SOp",
24
+ "CopyBulkTensorTileG2SMulticastOp",
25
+ "CopyBulkTensorTileS2GOp",
26
+ "CopyReduceBulkTensorTileS2GOp",
27
+ #
28
+ # helpers.py
29
+ #
30
+ "make_tiled_tma_atom",
31
+ "tma_partition",
32
+ "create_tma_multicast_mask",
33
+ "prefetch_descriptor",
34
+ "copy_tensormap",
35
+ "update_tma_descriptor",
36
+ "fence_tma_desc_acquire",
37
+ "cp_fence_tma_desc_release",
38
+ "fence_tma_desc_release",
39
+ ]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import enum
13
+ from dataclasses import dataclass
14
+ from typing import Optional, Type
15
+
16
+ from cutlass.cutlass_dsl import CuTeDSL, t
17
+
18
+ import cutlass._mlir.dialects.cute as _cute_ir
19
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
20
+ from cutlass._mlir import ir
21
+
22
+ from ...core import CopyOp, Trait, ReductionOp
23
+ from ...typing import Int16, Pointer, Integer, Numeric
24
+ from ..common import OpError
25
+ from ..tcgen05.mma import CtaGroup
26
+
27
+
28
+ ####################################################################################################
29
+ #
30
+ # Asynchronous copies
31
+ #
32
+ ####################################################################################################
33
+
34
+
35
class LoadCacheMode(enum.Enum):
    """
    An enumeration for the possible cache modes of a non-bulk ``cp.async`` instruction.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#cache-operators>`__.

    Each member wraps the corresponding IR-level ``LoadCacheMode`` value.
    """

    ALWAYS = _cute_nvgpu_ir.LoadCacheMode.always
    GLOBAL = _cute_nvgpu_ir.LoadCacheMode.global_
    STREAMING = _cute_nvgpu_ir.LoadCacheMode.streaming
    LAST_USE = _cute_nvgpu_ir.LoadCacheMode.last_use
    NONE = _cute_nvgpu_ir.LoadCacheMode.none

    def __str__(self) -> str:
        return f"{self.__class__.__name__}.{self.name}"

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}.{self.name}>"

    def _to_ir(self) -> _cute_nvgpu_ir.LoadCacheMode:
        # The member value already is the IR enum.
        return self.value
56
+
57
+
58
@dataclass(frozen=True)
class CopyG2SOp(CopyOp):
    """
    Non-bulk asynchronous GMEM to SMEM Copy Operation.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-non-bulk-copy>`__.

    :param cache_mode: The cache mode to use for the load, defaults to ``LoadCacheMode.ALWAYS``
    :type cache_mode: LoadCacheMode
    """

    cache_mode: LoadCacheMode = LoadCacheMode.ALWAYS

    def __str__(self) -> str:
        res = "cp.async GMEM -> SMEM copy Operation"
        if self.cache_mode != LoadCacheMode.ALWAYS:
            res += f"\n with cache mode = {self.cache_mode}"
        return res

    def _make_trait(
        self,
        copy_internal_type: Type[t.Numeric],
        *,
        loc=None,
        ip=None,
        **kwargs,
    ) -> "CopyG2STrait":
        """Build the IR trait for this Op.

        :param copy_internal_type: Element type used to interpret the copy
        :raises OpError: If ``cache_mode`` is not a ``LoadCacheMode`` instance
        :raises ValueError: If the ``num_bits_per_copy`` kw argument is missing or not a positive int
        """
        num_bits_per_copy = kwargs.get("num_bits_per_copy", None)
        # Verify that the user provided enum values. The original code
        # performed this exact isinstance check twice; once is sufficient.
        if not isinstance(self.cache_mode, LoadCacheMode):
            raise OpError(
                self,
                "expects the 'cache_mode' Op parameter to be a LoadCacheMode instance",
            )
        if not isinstance(num_bits_per_copy, int) or (num_bits_per_copy <= 0):
            raise ValueError(
                "expects a 'num_bits_per_copy' kw argument of type int that is positive "
                f"when creating a copy Atom for {self.__class__.__name__}"
            )
        ty = _cute_nvgpu_ir.CopyAtomSIMTAsyncCopyType.get(
            copy_internal_type.mlir_type, self.cache_mode._to_ir(), num_bits_per_copy
        )
        return CopyG2STrait(_cute_ir.atom(ty, loc=loc, ip=ip))


class CopyG2STrait(Trait):
    """Trait wrapping the IR Atom created by :class:`CopyG2SOp`."""

    pass
108
+
109
+
110
+ ####################################################################################################
111
+ #
112
+ # Bulk tensor copies a.k.a TMA copies
113
+ #
114
+ ####################################################################################################
115
+
116
+ TMA_MBAR_PTR_FIELD_NAME = "tma_bar"
117
+ TMA_MASK_FIELD_NAME = "mcast_mask"
118
+ TMA_DESC_PTR_FIELD_NAME = "tma_descriptor_ptr"
119
+
120
+ #
121
+ # TMA GMEM -> SMEM copies
122
+ #
123
+
124
+
125
@dataclass(frozen=True)
class CopyBulkTensorTileG2SOp(CopyOp):
    """
    Bulk tensor asynchronous GMEM to SMEM Copy Operation using the TMA unit.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
    This Operation uses TMA in the ``.tile`` mode.

    :param cta_group: Number of CTAs participating, defaults to ``CtaGroup.ONE``
    :type cta_group: CtaGroup
    """

    cta_group: CtaGroup = CtaGroup.ONE

    # Architectures on which this TMA load is available.
    admissible_archs = [
        "sm_90",
        "sm_90a",
        "sm_100a",
        "sm_100f",
    ]

    def __post_init__(self) -> None:
        if not isinstance(self.cta_group, CtaGroup):
            raise OpError(
                self, "expects the 'cta_group' parameter to be a CtaGroup instance"
            )
        # Arch verification
        arch = CuTeDSL._get_dsl().envar.arch
        if arch not in self.admissible_archs:
            raise OpError(
                self,
                f"expects arch to be one of {self.admissible_archs}, but got {arch}",
                suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
            )
        if (self.cta_group == CtaGroup.TWO) and arch[:5] == "sm_90":
            # Fixed the original duplicated wording ("is not and is not compatible").
            raise OpError(
                self,
                f"CTA group of 2 is tcgen05-specific and is not compatible with {arch}",
                suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
            )

    def __str__(self) -> str:
        res = "cp.async GMEM -> SMEM bulk tensor copy Operation"
        if self.cta_group == CtaGroup.TWO:
            res += f"\n CTA group = 2"
        return res

    def _make_trait(
        self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
    ) -> "CopyBulkTensorTileG2SNonExecTrait":
        # TMA atoms require a descriptor built from the GMEM tensor, so the
        # generic make_copy_atom path does not apply.
        raise NotImplementedError(
            "Use cpasync.make_tiled_tma_atom to obtain a copy Atom for TMA"
        )

    def _to_ir(self) -> _cute_nvgpu_ir.TiledTmaLoadEnum:
        if self.cta_group == CtaGroup.ONE:
            return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_90
        elif self.cta_group == CtaGroup.TWO:
            return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_100_2sm
        else:
            assert False, "unrecognized self.cta_group"
+
184
+
185
class CopyBulkTensorTileG2SNonExecTrait(Trait):
    # We allow kw args to be dropped so that the user can write common code for non-multicast
    # and multicast loads.
    def unpack(
        self,
        *,
        loc=None,
        ip=None,
        tma_bar_ptr: Optional[Pointer] = None,
        tma_desc_ptr: Optional[Pointer] = None,
        **kwargs,
    ):
        """
        Custom implementation of unpack for non-executable TMAs.

        The non-multicast TMA load requires a `tma_bar_ptr` keyword argument to be provided when
        using `cute.copy`. Any other kw arguments will be ignored instead of triggering an error.

        :param tma_bar_ptr: pointer to the mbarrier tracking completion of the TMA load
        :param tma_desc_ptr: optional pointer to a TMA descriptor overriding the atom's own
        :return: the executable atom IR value with the mbarrier (and optional descriptor) set
        """
        if not isinstance(tma_bar_ptr, Pointer):
            raise ValueError(
                "expects a pointer to an mbarrier to be provided via the tma_bar_ptr kw argument"
            )
        # Convert the non-executable atom into an executable one, then fill in its fields.
        exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip)
        attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_MBAR_PTR_FIELD_NAME}>"
        attr = ir.Attribute.parse(attr_str)
        exec_value = _cute_nvgpu_ir.atom_set_value(
            exec_value, attr, tma_bar_ptr.value, loc=loc, ip=ip
        )
        # The descriptor pointer is optional; when absent, the atom's embedded descriptor is used.
        if isinstance(tma_desc_ptr, Pointer):
            attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_DESC_PTR_FIELD_NAME}>"
            attr = ir.Attribute.parse(attr_str)
            exec_value = _cute_nvgpu_ir.atom_set_value(
                exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip
            )
        return exec_value
220
+
221
+
222
+ #
223
+ # TMA GMEM -> SMEM multicast copies
224
+ #
225
+
226
+
227
@dataclass(frozen=True)
class CopyBulkTensorTileG2SMulticastOp(CopyOp):
    """
    Bulk tensor asynchronous multicast GMEM to SMEM Copy Operation using the TMA unit.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
    This Operation uses TMA in the ``.tile`` mode.
    """

    # Number of CTAs cooperating in the load; TWO selects the tcgen05 2-SM variant.
    cta_group: CtaGroup = CtaGroup.ONE

    # Architectures for which this Operation can be constructed.
    admissible_archs = [
        "sm_90",
        "sm_90a",
        "sm_100a",
        "sm_100f",
    ]

    def __post_init__(self) -> None:
        """Validate the Op's parameters and the target architecture.

        :raises OpError: for an invalid ``cta_group``, an inadmissible arch, or
            CTA group 2 on an sm_90* target.
        """
        if not isinstance(self.cta_group, CtaGroup):
            raise OpError(
                self, "expects the 'cta_group' parameter to be a CtaGroup instance"
            )
        # Arch verification
        arch = CuTeDSL._get_dsl().envar.arch
        if arch not in self.admissible_archs:
            raise OpError(
                self,
                f"expects arch to be one of {self.admissible_archs}, but got {arch}",
                suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
            )
        # CTA group 2 is a tcgen05 (sm_100*) feature; reject it on Hopper.
        # Fixed: the error message previously repeated "is not and is not".
        if (self.cta_group == CtaGroup.TWO) and arch[:5] == "sm_90":
            raise OpError(
                self,
                f"CTA group of 2 is tcgen05-specific and is not compatible with {arch}",
                suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
            )

    def __str__(self) -> str:
        """Return a human-readable description of this copy Operation."""
        res = "cp.async GMEM -> SMEM bulk tensor multicast copy Operation"
        if self.cta_group == CtaGroup.TWO:
            res += f"\n CTA group = 2"
        return res

    def _make_trait(
        self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
    ) -> "CopyBulkTensorTileG2SMulticastNonExecTrait":
        """Unsupported: TMA Atoms must be created via ``cpasync.make_tiled_tma_atom``."""
        raise NotImplementedError(
            "Use cpasync.make_tiled_tma_atom to obtain a copy Atom for TMA"
        )

    def _to_ir(self) -> _cute_nvgpu_ir.TiledTmaLoadEnum:
        """Lower this Op's CTA group setting to the corresponding MLIR enum value."""
        if self.cta_group == CtaGroup.ONE:
            return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_90_multicast
        elif self.cta_group == CtaGroup.TWO:
            return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_100_2sm_multicast
        else:
            assert False, "unrecognized self.cta_group"
285
+
286
+
287
class CopyBulkTensorTileG2SMulticastNonExecTrait(Trait):
    def unpack(
        self,
        *,
        loc=None,
        ip=None,
        tma_bar_ptr: Optional[Pointer] = None,
        mcast_mask=None,
        tma_desc_ptr=None,
    ):
        """
        Custom implementation of unpack for non-executable TMAs.

        The multicast TMA load requires a `tma_bar_ptr` and a `mcast_mask` keyword arguments to be
        provided when using `cute.copy`.

        :param tma_bar_ptr: pointer to the mbarrier tracking completion of the TMA load
        :param mcast_mask: 16-bit CTA mask selecting the multicast destinations
        :param tma_desc_ptr: optional pointer to a TMA descriptor overriding the atom's own
        :return: the executable atom IR value with mbarrier, mask (and optional descriptor) set
        """
        if not isinstance(tma_bar_ptr, Pointer):
            raise ValueError(
                "expects a pointer to an mbarrier to be provided via the tma_bar_ptr kw argument"
            )
        if not isinstance(mcast_mask, Integer):
            raise ValueError(
                "expects a multicast mask to be provided via the mcast_mask kw argument"
            )
        # Convert the non-executable atom into an executable one, then fill in its fields.
        # NOTE(review): literal field names ("tma_bar", "mcast_mask") are used here while the
        # non-multicast trait uses the TMA_*_FIELD_NAME constants — presumably equal; confirm.
        exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip)
        attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<tma_bar>"
        attr = ir.Attribute.parse(attr_str)
        exec_value = _cute_nvgpu_ir.atom_set_value(
            exec_value, attr, tma_bar_ptr.value, loc=loc, ip=ip
        )
        attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<mcast_mask>"
        attr = ir.Attribute.parse(attr_str)
        # The mask is always materialized as an Int16 IR value.
        exec_value = _cute_nvgpu_ir.atom_set_value(
            exec_value, attr, Int16(mcast_mask).ir_value(loc=loc, ip=ip), loc=loc, ip=ip
        )
        if isinstance(tma_desc_ptr, Pointer):
            attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_DESC_PTR_FIELD_NAME}>"
            attr = ir.Attribute.parse(attr_str)
            exec_value = _cute_nvgpu_ir.atom_set_value(
                exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip
            )
        return exec_value
329
+
330
+
331
+ #
332
+ # TMA SMEM -> GMEM copies
333
+ #
334
+
335
+
336
@dataclass(frozen=True)
class CopyBulkTensorTileS2GOp(CopyOp):
    """
    Bulk tensor asynchronous SMEM to GMEM Copy Operation using the TMA unit.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`__.
    This Operation uses TMA in the ``.tile`` mode.
    """

    # Architectures for which this Operation can be constructed.
    admissible_archs = [
        "sm_90",
        "sm_90a",
        "sm_100a",
        "sm_100f",
    ]

    def __post_init__(self):
        # Reject construction when the active architecture lacks TMA support.
        arch = CuTeDSL._get_dsl().envar.arch
        if arch in self.admissible_archs:
            return
        raise OpError(
            self,
            f"expects arch to be one of {self.admissible_archs}, but got {arch}",
            suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
        )

    def __str__(self) -> str:
        """Return a human-readable description of this copy Operation."""
        return "cp.async SMEM -> GMEM bulk tensor copy Operation"

    def _make_trait(
        self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
    ) -> "CopyBulkTensorTileS2GTrait":
        # TMA Atoms are always built through the dedicated helper, never directly.
        raise NotImplementedError(
            "Use cpasync.make_tiled_tma_atom to obtain a copy Atom for TMA"
        )
371
+
372
+
373
class CopyBulkTensorTileS2GTrait(Trait):
    def unpack(self, *, loc=None, ip=None, tma_desc_ptr: Optional[Pointer] = None):
        """
        Custom implementation of unpack for non-executable TMAs.

        :param tma_desc_ptr: optional pointer to a TMA descriptor overriding the atom's own
        :return: the executable atom IR value
        """
        # Convert the non-executable atom into an executable one.
        exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip)
        # The descriptor pointer is optional; when absent, the atom's embedded descriptor is used.
        if isinstance(tma_desc_ptr, Pointer):
            attr_str = (
                f"#cute_nvgpu.atom_copy_field_tmastore<{TMA_DESC_PTR_FIELD_NAME}>"
            )
            attr = ir.Attribute.parse(attr_str)
            exec_value = _cute_nvgpu_ir.atom_set_value(
                exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip
            )
        return exec_value
388
+
389
@dataclass(frozen=True)
class CopyReduceBulkTensorTileS2GOp(CopyOp):
    """
    Bulk tensor asynchronous SMEM to GMEM Reduction Operation using the TMA unit.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk>`__.
    This Operation uses TMA in the ``.tile`` mode.
    """

    # Reduction applied when committing SMEM data to GMEM.
    reduction_kind: ReductionOp = ReductionOp.ADD

    # Architectures for which this Operation can be constructed.
    admissible_archs = [
        "sm_90",
        "sm_90a",
        "sm_100a",
        "sm_100f",
    ]

    # Fixed: this hook was misspelled "__post__init__" (and called the mangled
    # "CuTeDSL.__get_dsl"), so the dataclass never ran it and the arch check was
    # silently skipped. Now matches the sibling Op classes.
    def __post_init__(self) -> None:
        """Validate the target architecture.

        :raises OpError: if the active architecture is not admissible
        """
        # Arch verification
        arch = CuTeDSL._get_dsl().envar.arch
        if arch not in self.admissible_archs:
            raise OpError(
                self,
                f"expects arch to be one of {self.admissible_archs}, but got {arch}",
                suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
            )

    def __str__(self) -> str:
        """Return a human-readable description of this reduction Operation."""
        return "cp.async SMEM -> GMEM bulk tensor reduction Operation"

    def _make_trait(
        self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
    ) -> "CopyReduceBulkTensorTileS2GTrait":
        """Unsupported: TMA Atoms must be created via ``cpasync.make_tiled_tma_atom``."""
        raise NotImplementedError(
            "Use cpasync.make_tiled_tma_atom to obtain a copy Atom for TMA"
        )

    def _to_ir(self) -> _cute_nvgpu_ir.ReductionKind:
        """Lower the reduction kind to the corresponding MLIR enum value."""
        lowering = {
            ReductionOp.ADD: _cute_nvgpu_ir.ReductionKind.ADD,
            ReductionOp.MIN: _cute_nvgpu_ir.ReductionKind.MIN,
            ReductionOp.MAX: _cute_nvgpu_ir.ReductionKind.MAX,
            ReductionOp.INC: _cute_nvgpu_ir.ReductionKind.INC,
            ReductionOp.DEC: _cute_nvgpu_ir.ReductionKind.DEC,
            ReductionOp.AND: _cute_nvgpu_ir.ReductionKind.AND,
            ReductionOp.OR: _cute_nvgpu_ir.ReductionKind.OR,
            ReductionOp.XOR: _cute_nvgpu_ir.ReductionKind.XOR,
        }
        assert self.reduction_kind in lowering, "unrecognized self.reduction_kind"
        return lowering[self.reduction_kind]
446
+
447
+
448
class CopyReduceBulkTensorTileS2GTrait(Trait):
    def unpack(self, *, loc=None, ip=None, tma_desc_ptr: Optional[Pointer] = None):
        """
        Custom implementation of unpack for non-executable TMAs.

        :param tma_desc_ptr: optional pointer to a TMA descriptor overriding the atom's own
        :return: the executable atom IR value
        """
        # Convert the non-executable atom into an executable one.
        exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip)
        # The descriptor pointer is optional; when absent, the atom's embedded descriptor is used.
        if isinstance(tma_desc_ptr, Pointer):
            attr_str = (
                f"#cute_nvgpu.atom_copy_field_tmareduce<{TMA_DESC_PTR_FIELD_NAME}>"
            )
            attr = ir.Attribute.parse(attr_str)
            exec_value = _cute_nvgpu_ir.atom_set_value(
                exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip
            )
        return exec_value
463
+
464
# Public Op types exported by this module; the Trait classes remain internal.
__all__ = [
    "LoadCacheMode",
    "CopyG2SOp",
    "CopyBulkTensorTileG2SOp",
    "CopyBulkTensorTileG2SMulticastOp",
    "CopyBulkTensorTileS2GOp",
    "CopyReduceBulkTensorTileS2GOp",
]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from typing import Optional, Tuple, Type, Union
13
+
14
+ from cutlass.cutlass_dsl import dsl_user_op
15
+
16
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
17
+ from cutlass._mlir.dialects import llvm
18
+
19
+ from ...typing import Coord, Layout, Tensor, Tiler, Pointer, Int16, Numeric, NumericMeta
20
+ from ... import core
21
+ from .copy import (
22
+ CopyBulkTensorTileG2SOp,
23
+ CopyBulkTensorTileG2SMulticastOp,
24
+ CopyBulkTensorTileS2GOp,
25
+ CopyReduceBulkTensorTileS2GOp,
26
+ CopyBulkTensorTileG2SNonExecTrait,
27
+ CopyBulkTensorTileG2SMulticastNonExecTrait,
28
+ CopyBulkTensorTileS2GTrait,
29
+ CopyReduceBulkTensorTileS2GTrait,
30
+ )
31
+
32
+
33
@dsl_user_op
def make_tiled_tma_atom(
    op: Union[
        CopyBulkTensorTileG2SOp,
        CopyBulkTensorTileG2SMulticastOp,
        CopyBulkTensorTileS2GOp,
        CopyReduceBulkTensorTileS2GOp,
    ],
    gmem_tensor: Tensor,
    smem_layout: Union[Layout, core.ComposedLayout],
    cta_tiler: Tiler,
    num_multicast: int = 1,
    *,
    internal_type: Optional[Type[Numeric]] = None,
    loc=None,
    ip=None,
) -> Tuple[core.CopyAtom, Tensor]:
    """
    Makes a TMA Copy Atom in the ``.tile`` mode to copy tiles of a GMEM tensor to/from SMEM
    buffer with the given Layout.

    Given

    - a GMEM tensor
    - a SMEM layout
    - a CTA-level Tiler

    this function figures out the bulk tensor asynchronous copy instruction to use with the maximum
    "TMA vector length" to copy tiles of the GMEM tensor to/from an SMEM buffer with the provided
    layout and consistent with the provided Tiler.

    This function returns two results:

    1. the Copy Atom
    2. the so-called TMA tensor used to map logical coordinates of the GMEM tensor to coordinates \
       that the TMA unit can consume. TMA tensors have so-called basis stride elements so that the \
       associated layout can output coordinates. Otherwise, TMA tensors can be partitioned \
       similarly to any other CuTe tensors using the algebra.

    :param op: The Copy Operation to construct an Atom for
    :type op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp, CopyBulkTensorTileS2GOp, CopyReduceBulkTensorTileS2GOp]
    :param gmem_tensor: The GMEM tensor involved in the Copy
    :type gmem_tensor: Tensor
    :param smem_layout: The SMEM layout to construct the Copy Atom for
    :type smem_layout: Union[Layout, core.ComposedLayout]
    :param cta_tiler: The CTA Tiler to use
    :type cta_tiler: Tiler
    :param num_multicast: The multicast factor
    :type num_multicast: int
    :param internal_type: An optional parameter for the internal data type to use when the actual data type is not supported by the TMA unit
    :type internal_type: Type[Numeric]
    :return: A Copy Atom for this Operation and the associated TMA tensor
    :rtype: Tuple[core.CopyAtom, Tensor]
    :raises TypeError: if ``internal_type`` is provided but is not a Numeric type
    :raises ValueError: if ``num_multicast`` is inconsistent with ``op``, or ``op`` is not a TMA Op
    """

    if internal_type is not None:
        if not isinstance(internal_type, NumericMeta):
            raise TypeError(f"internal_type must be a Numeric, but got {internal_type}")
        internal_type = internal_type.mlir_type

    # Restrict the identity layout of the GMEM tensor to the CTA Tiler.
    cta_v_map = core.composition(
        core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip),
        cta_tiler,
        loc=loc,
        ip=ip,
    )

    if isinstance(op, (CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp)):
        # Both G2S variants lower through the same IR builder; only the multicast
        # validation and the resulting Trait class differ.
        if isinstance(op, CopyBulkTensorTileG2SOp):
            if num_multicast != 1:
                raise ValueError(
                    f"expects num_multicast to be 1 for non multicast G2S copies, "
                    f"but got {num_multicast}"
                )
            trait_cls = CopyBulkTensorTileG2SNonExecTrait
        else:
            if num_multicast < 1:
                raise ValueError(
                    f"expects num_multicast to be >= 1 for multicast G2S copies, "
                    f"but got {num_multicast}"
                )
            trait_cls = CopyBulkTensorTileG2SMulticastNonExecTrait
        # res[0] = non-executable atom IR value, res[1] = associated TMA tensor
        res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load(
            gmem_tensor.value,
            smem_layout,
            cta_v_map,
            op._to_ir(),
            num_multicast=num_multicast,
            internal_type=internal_type,
            loc=loc,
            ip=ip,
        )
        return core.CopyAtom(op, trait_cls(res[0])), res[1]
    elif isinstance(op, CopyBulkTensorTileS2GOp):
        res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_store(
            gmem_tensor.value,
            smem_layout,
            cta_v_map,
            internal_type=internal_type,
            loc=loc,
            ip=ip,
        )
        return core.CopyAtom(op, CopyBulkTensorTileS2GTrait(res[0])), res[1]
    elif isinstance(op, CopyReduceBulkTensorTileS2GOp):
        res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_reduce(
            gmem_tensor.value,
            smem_layout,
            cta_v_map,
            op._to_ir(),
            internal_type=internal_type,
            loc=loc,
            ip=ip,
        )
        return core.CopyAtom(op, CopyReduceBulkTensorTileS2GTrait(res[0])), res[1]
    else:
        raise ValueError(f"expects a bulk tensor (TMA) Copy Op, but got {op}")
160
+
161
+
162
@dsl_user_op
def tma_partition(
    atom: core.CopyAtom,
    cta_coord: Coord,
    cta_layout: Layout,
    smem_tensor: Tensor,
    gmem_tensor: Tensor,
    *,
    loc=None,
    ip=None,
) -> Tuple[Tensor, Tensor]:
    """
    Tiles the GMEM and SMEM tensors for the provided TMA Copy Atom.

    :return: the partitioned (SMEM, GMEM) tensor pair
    """
    # The coordinate must be packed into a single IR value before handing it to the dialect.
    packed_coord = core._pack_coord(cta_coord, loc=loc, ip=ip)
    src, dst = _cute_nvgpu_ir.atom_tma_partition(
        atom._trait.value,
        cta_coord=packed_coord,
        cta_layout=cta_layout,
        smem_tensor=smem_tensor.value,
        gmem_tensor=gmem_tensor.value,
        loc=loc,
        ip=ip,
    )
    return src, dst
187
+
188
+
189
@dsl_user_op
def create_tma_multicast_mask(
    cta_layout_vmnk: Layout,
    cta_coord_vmnk: Coord,
    mcast_mode: int,
    *,
    loc=None,
    ip=None,
) -> Int16:
    """
    Computes a multicast mask for a TMA load Copy.

    :param cta_layout_vmnk: The VMNK layout of the cluster
    :type cta_layout_vmnk: Layout
    :param cta_coord_vmnk: The VMNK coordinate of the current CTA
    :type cta_coord_vmnk: Coord
    :param mcast_mode: The tensor mode in which to multicast
    :type mcast_mode: int
    :return: The resulting mask
    :rtype: Int16
    :raises ValueError: if the layout or coordinate is not rank 4
    """
    # Both the cluster layout and the CTA coordinate must be given in VMNK form.
    layout_rank = core.rank(cta_layout_vmnk)
    if layout_rank != 4:
        raise ValueError(
            f"cta_layout_vmnk must be rank 4, but got {core.pretty_str(cta_layout_vmnk)}"
        )
    coord_rank = core.rank(cta_coord_vmnk)
    if coord_rank != 4:
        raise ValueError(
            f"cta_coord_vmnk must be rank 4, but got {core.pretty_str(cta_coord_vmnk)}"
        )
    mask = core.make_layout_image_mask(
        cta_layout_vmnk, cta_coord_vmnk, mcast_mode, loc=loc, ip=ip
    )
    return mask
221
+
222
+
223
@dsl_user_op
def prefetch_descriptor(tma_atom: core.CopyAtom, *, loc=None, ip=None) -> None:
    """
    Prefetches the TMA descriptor associated with the TMA Atom.

    :param tma_atom: The TMA Copy Atom whose descriptor should be prefetched
    :type tma_atom: core.CopyAtom
    """
    _cute_nvgpu_ir.prefetch_tma_desc(tma_atom._trait.value, loc=loc, ip=ip)
229
+
230
+
231
@dsl_user_op
def copy_tensormap(
    tma_atom: core.CopyAtom, tensormap_ptr: Pointer, *, loc=None, ip=None
) -> None:
    """
    Copies the tensormap held by a TMA Copy Atom to the memory location pointed to by the provided
    pointer.

    :param tma_atom: The TMA Copy Atom
    :type tma_atom: CopyAtom
    :param tensormap_ptr: The pointer to the memory location to copy the tensormap to
    :type tensormap_ptr: Pointer
    """
    _cute_nvgpu_ir.copy_tma_desc(
        tma_atom._trait.value, tensormap_ptr.value, loc=loc, ip=ip
    )
247
+
248
+
249
@dsl_user_op
def update_tma_descriptor(
    tma_atom: core.CopyAtom,
    gmem_tensor: Tensor,
    tma_desc_ptr: Pointer,
    *,
    loc=None,
    ip=None,
) -> None:
    """
    Updates the TMA descriptor in the memory location pointed to by the provided pointer using
    information from a TMA Copy Atom and the provided GMEM tensor.

    Specifically, the following fields of the TMA descriptor will be updated:

    1. the GMEM tensor base address
    2. the GMEM tensor shape
    3. the GMEM tensor stride

    Other fields of the TMA descriptor are left unchanged.

    :param tma_atom: The TMA Copy Atom
    :type tma_atom: CopyAtom
    :param gmem_tensor: The GMEM tensor
    :type gmem_tensor: Tensor
    :param tma_desc_ptr: The pointer to the memory location of the descriptor to update
    :type tma_desc_ptr: Pointer
    """
    _cute_nvgpu_ir.update_tma_desc(
        tma_atom._trait.value, gmem_tensor.value, tma_desc_ptr.value, loc=loc, ip=ip
    )
280
+
281
+
282
@dsl_user_op
def fence_tma_desc_acquire(
    tma_desc_ptr: Pointer,
    *,
    loc=None,
    ip=None,
) -> None:
    """
    Emits an acquire fence for the tensormap proxy on the descriptor's address.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`__.

    :param tma_desc_ptr: Pointer to the TMA descriptor being acquired
    :type tma_desc_ptr: Pointer
    """
    # The pointer is materialized as a 64-bit integer operand for the inline PTX
    # ("l" constraint — 64-bit register).
    tma_desc_ptr_i64 = tma_desc_ptr.toint(loc=loc, ip=ip).ir_value()
    llvm.inline_asm(
        None,
        [tma_desc_ptr_i64],
        "fence.proxy.tensormap::generic.acquire.gpu [$0], 128;",
        "l",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
302
+
303
+
304
@dsl_user_op
def cp_fence_tma_desc_release(
    tma_desc_global_ptr: Pointer,
    tma_desc_shared_ptr: Pointer,
    *,
    loc=None,
    ip=None,
) -> None:
    """
    Copies a tensormap from shared to global memory and fences the tensormap proxy with
    release semantics.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy>`__.

    :param tma_desc_global_ptr: Destination pointer to the descriptor in global memory
    :type tma_desc_global_ptr: Pointer
    :param tma_desc_shared_ptr: Source pointer to the descriptor in shared memory
    :type tma_desc_shared_ptr: Pointer
    """
    # Operands: 64-bit global address ("l") and 32-bit shared address ("r").
    tma_desc_global_ptr_i64 = tma_desc_global_ptr.toint(loc=loc, ip=ip).ir_value()
    tma_desc_shared_ptr_i32 = tma_desc_shared_ptr.toint(loc=loc, ip=ip).ir_value()
    llvm.inline_asm(
        None,
        [tma_desc_global_ptr_i64, tma_desc_shared_ptr_i32],
        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [$0], [$1], 128;",
        "l,r",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
326
+
327
+
328
@dsl_user_op
def fence_tma_desc_release(*, loc=None, ip=None) -> None:
    """
    Emits a release fence for the tensormap proxy (no address operand).

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`__.
    """
    llvm.inline_asm(
        None,
        [],
        "fence.proxy.tensormap::generic.release.gpu;",
        "",
        has_side_effects=True,
        is_align_stack=False,
        asm_dialect=llvm.AsmDialect.AD_ATT,
    )
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/helpers.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from typing import Optional, Tuple, Type, Union
13
+
14
+ from cutlass.cutlass_dsl import dsl_user_op
15
+
16
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
17
+
18
+ from .. import core
19
+ from ..typing import Shape, Layout, Tensor, Numeric, NumericMeta
20
+ from ...impl_utils import check_type_in
21
+ from .cpasync.copy import (
22
+ CopyBulkTensorTileG2SOp,
23
+ CopyBulkTensorTileG2SNonExecTrait,
24
+ CopyBulkTensorTileG2SMulticastOp,
25
+ CopyBulkTensorTileG2SMulticastNonExecTrait,
26
+ )
27
+
28
+
29
+ ####################################################################################################
30
+ #
31
+ # TMA creation helpers for tcgen05 MMAs
32
+ #
33
+ ####################################################################################################
34
+
35
+
36
@dsl_user_op
def make_tiled_tma_atom_A(
    op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp],
    gmem_tensor: Tensor,
    smem_layout: Union[Layout, core.ComposedLayout],
    mma_tiler_mnk: Shape,
    tiled_mma: core.TiledMma,
    cluster_shape_vmnk: Shape,
    *,
    internal_type: Optional[Type[Numeric]] = None,
    loc=None,
    ip=None,
) -> Tuple[core.CopyAtom, Tensor]:
    """
    Makes a TMA Copy Atom in ``.tile`` mode (``cp.async.bulk.tensor``) for loading the A
    operand of a tcgen05 MMA.

    The MK modes of the MMA Tiler combined with the TiledMMA's A-operand partitioning
    determine the GMEM tile each CTA loads. For multicast Ops, the N extent of
    ``cluster_shape_vmnk`` gives the multicast factor, since CTAs along N share the same
    tile of A.

    :param op: The Copy Operation to construct an Atom for
    :type op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp]
    :param gmem_tensor: The GMEM tensor to be loaded by this copy atom
    :type gmem_tensor: Tensor
    :param smem_layout: Shared memory layout to load the tensor into
    :type smem_layout: Union[Layout, core.ComposedLayout]
    :param mma_tiler_mnk: The MMA Tiler shape (TILE_M, TILE_N, TILE_K) in MNK dimensions
    :type mma_tiler_mnk: Shape
    :param tiled_mma: The TiledMMA that will consume the load as operands
    :type tiled_mma: core.TiledMma
    :param cluster_shape_vmnk: The Cluster-level shape in VMNK dimensions
    :type cluster_shape_vmnk: Shape
    :param internal_type: Optional internal data type used when the element type does not
        match the copy type
    :type internal_type: Type[Numeric]
    :return: A Copy Atom for this Operation and the associated TMA coordinate tensor
    :rtype: Tuple[core.CopyAtom, Tensor]
    """

    if internal_type is not None:
        if not isinstance(internal_type, NumericMeta):
            raise TypeError(f"internal_type must be a Numeric, but got {internal_type}")
        internal_type = internal_type.mlir_type
    check_type_in(
        op,
        [CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp],
        "op",
        "make_tiled_tma_atom_A",
    )

    # Restrict the GMEM identity layout to the MK modes of the Tiler, then project it
    # through the TiledMMA's A-operand thread/value partitioning.
    full_ident = core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip)
    tiler_mk = (mma_tiler_mnk[0], *mma_tiler_mnk[2:])
    gmem_tile = core.composition(full_ident, tiler_mk, loc=loc, ip=ip)
    v_map = tiled_mma._thrfrg_A(gmem_tile)
    v_map = core.get(v_map, mode=[1])
    v_map = core.dice(v_map, (1, (1,) * core.rank(gmem_tile)))

    if isinstance(op, CopyBulkTensorTileG2SOp):
        num_multicast = 1
        trait_cls = CopyBulkTensorTileG2SNonExecTrait
    else:
        assert isinstance(op, CopyBulkTensorTileG2SMulticastOp)
        # multicast across the N-mode since those CTAs share the same tile of A
        num_multicast = core.size(cluster_shape_vmnk, mode=[2])
        trait_cls = CopyBulkTensorTileG2SMulticastNonExecTrait

    # res[0] = the IR Value for the non-executable atom instance
    # res[1] = the IR Value for the associated TMA tensor
    res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load(
        gmem_tensor.value,
        smem_layout,
        v_map,
        op._to_ir(),
        num_multicast=num_multicast,
        internal_type=internal_type,
        loc=loc,
        ip=ip,
    )
    return core.CopyAtom(op, trait_cls(res[0])), res[1]
139
+
140
+
141
@dsl_user_op
def make_tiled_tma_atom_B(
    op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp],
    gmem_tensor: Tensor,
    smem_layout: Union[Layout, core.ComposedLayout],
    mma_tiler_mnk: Shape,
    tiled_mma: core.TiledMma,
    cluster_shape_vmnk: Shape,
    *,
    internal_type: Optional[Type[Numeric]] = None,
    loc=None,
    ip=None,
) -> Tuple[core.CopyAtom, Tensor]:
    """
    Makes a TMA Copy Atom in ``.tile`` mode (``cp.async.bulk.tensor``) for loading the B
    operand of a tcgen05 MMA.

    The NK modes of the MMA Tiler combined with the TiledMMA's B-operand partitioning
    determine the GMEM tile each CTA loads. For multicast Ops, the M extent of
    ``cluster_shape_vmnk`` gives the multicast factor, since CTAs along M share the same
    tile of B.

    :param op: The Copy Operation to construct an Atom for
    :type op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp]
    :param gmem_tensor: The GMEM tensor to be loaded by this copy atom
    :type gmem_tensor: Tensor
    :param smem_layout: Shared memory layout to load the tensor into
    :type smem_layout: Union[Layout, core.ComposedLayout]
    :param mma_tiler_mnk: The MMA Tiler shape (TILE_M, TILE_N, TILE_K) in MNK dimensions
    :type mma_tiler_mnk: Shape
    :param tiled_mma: The TiledMMA that will consume the load as operands
    :type tiled_mma: core.TiledMma
    :param cluster_shape_vmnk: The Cluster-level shape in VMNK dimensions
    :type cluster_shape_vmnk: Shape
    :param internal_type: Optional internal data type used when the element type does not
        match the copy type
    :type internal_type: Type[Numeric]
    :return: A Copy Atom for this Operation and the associated TMA coordinate tensor
    :rtype: Tuple[core.CopyAtom, Tensor]
    """

    if internal_type is not None:
        if not isinstance(internal_type, NumericMeta):
            raise TypeError(f"internal_type must be a Numeric, but got {internal_type}")
        internal_type = internal_type.mlir_type
    check_type_in(
        op,
        [CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp],
        "op",
        "make_tiled_tma_atom_B",
    )

    # Restrict the GMEM identity layout to the NK modes of the Tiler, then project it
    # through the TiledMMA's B-operand thread/value partitioning.
    full_ident = core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip)
    tiler_nk = (mma_tiler_mnk[1], *mma_tiler_mnk[2:])
    gmem_tile = core.composition(full_ident, tiler_nk, loc=loc, ip=ip)
    v_map = tiled_mma._thrfrg_B(gmem_tile)
    v_map = core.get(v_map, mode=[1])
    v_map = core.dice(v_map, (1, (1,) * core.rank(gmem_tile)))

    if isinstance(op, CopyBulkTensorTileG2SOp):
        num_multicast = 1
        trait_cls = CopyBulkTensorTileG2SNonExecTrait
    else:
        assert isinstance(op, CopyBulkTensorTileG2SMulticastOp)
        # multicast across the M-mode since those CTAs share the same tile of B
        num_multicast = core.size(cluster_shape_vmnk, mode=[1])
        trait_cls = CopyBulkTensorTileG2SMulticastNonExecTrait

    # res[0] = the IR Value for the non-executable atom instance
    # res[1] = the IR Value for the associated TMA tensor
    res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load(
        gmem_tensor.value,
        smem_layout,
        v_map,
        op._to_ir(),
        num_multicast=num_multicast,
        internal_type=internal_type,
        loc=loc,
        ip=ip,
    )
    return core.CopyAtom(op, trait_cls(res[0])), res[1]
244
+
245
+
246
# Public API of this module: the tcgen05 A/B-operand TMA atom builders.
__all__ = [
    "make_tiled_tma_atom_A",
    "make_tiled_tma_atom_B",
]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from .copy import *
13
+ from .mma import *
14
+ from .helpers import *
15
+
16
+ # __all__ is required here for documentation generation
17
+ __all__ = [
18
+ #
19
+ # copy.py
20
+ #
21
+ "Repetition",
22
+ "Pack",
23
+ "Unpack",
24
+ "Ld16x64bOp",
25
+ "Ld16x128bOp",
26
+ "Ld16x256bOp",
27
+ "Ld16x32bx2Op",
28
+ "Ld32x32bOp",
29
+ "St16x64bOp",
30
+ "St16x128bOp",
31
+ "St16x256bOp",
32
+ "St16x32bx2Op",
33
+ "St32x32bOp",
34
+ #
35
+ # mma.py
36
+ #
37
+ "OperandMajorMode",
38
+ "OperandSource",
39
+ "CtaGroup",
40
+ "Field",
41
+ "MmaTF32Op",
42
+ "MmaF16BF16Op",
43
+ "MmaI8Op",
44
+ "MmaFP8Op",
45
+ "MmaMXF8Op",
46
+ "MmaMXF4Op",
47
+ "MmaMXF4NVF4Op",
48
+ "SmemLayoutAtomKind",
49
+ #
50
+ # helpers.py
51
+ #
52
+ "make_smem_layout_atom",
53
+ "tile_to_mma_shape",
54
+ "commit",
55
+ "is_tmem_load",
56
+ "is_tmem_store",
57
+ "get_tmem_copy_properties",
58
+ "find_tmem_tensor_col_offset",
59
+ "make_tmem_copy",
60
+ "make_s2t_copy",
61
+ "get_s2t_smem_desc_tensor",
62
+ ]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py ADDED
@@ -0,0 +1,663 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import enum
13
+ from dataclasses import dataclass
14
+ from typing import Type
15
+
16
+ from cutlass.cutlass_dsl import CuTeDSL
17
+
18
+ import cutlass._mlir.dialects.cute as _cute_ir
19
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
20
+ from cutlass._mlir import ir
21
+
22
+ from ..common import OpError
23
+ from ...core import CopyOp, Trait
24
+ from ...typing import Numeric
25
+
26
+ from .mma import CtaGroup
27
+
28
+
29
+ class Repetition(enum.Enum):
30
+ """
31
+ An enumeration for the number of repetitions of a given TMEM copy within the instruction.
32
+ """
33
+
34
+ x1 = 1
35
+ x2 = 2
36
+ x4 = 4
37
+ x8 = 8
38
+ x16 = 16
39
+ x32 = 32
40
+ x64 = 64
41
+ x128 = 128
42
+
43
+ def __str__(self) -> str:
44
+ return f"{self.__class__.__name__}.{self.name}"
45
+
46
+ def __repr__(self) -> str:
47
+ return f"<{self.__class__.__name__}.{self.name}>"
48
+
49
+ @classmethod
50
+ def _missing_(cls, value):
51
+ if isinstance(value, int):
52
+ if value == 1:
53
+ return Repetition.x1
54
+ elif value == 2:
55
+ return Repetition.x2
56
+ elif value == 8:
57
+ return Repetition.x8
58
+ elif value == 16:
59
+ return Repetition.x16
60
+ elif value == 32:
61
+ return Repetition.x32
62
+ elif value == 64:
63
+ return Repetition.x64
64
+ elif value == 128:
65
+ return Repetition.x128
66
+
67
+
68
+ class Pack(enum.Enum):
69
+ """
70
+ An enumeration for the possible packing patterns for TMEM to RMEM copies.
71
+ """
72
+
73
+ NONE = enum.auto()
74
+ PACK_16b_IN_32b = enum.auto()
75
+
76
+ def __str__(self) -> str:
77
+ return f"{self.__class__.__name__}.{self.name}"
78
+
79
+ def __repr__(self) -> str:
80
+ return f"<{self.__class__.__name__}.{self.name}>"
81
+
82
+
83
+ class Unpack(enum.Enum):
84
+ """
85
+ An enumeration for the possible unpacking patterns for RMEM to TMEM copies.
86
+ """
87
+
88
+ NONE = enum.auto()
89
+ UNPACK_32b_IN_16b = enum.auto()
90
+
91
+ def __str__(self) -> str:
92
+ return f"{self.__class__.__name__}.{self.name}"
93
+
94
+ def __repr__(self) -> str:
95
+ return f"<{self.__class__.__name__}.{self.name}>"
96
+
97
+
98
+ @dataclass(frozen=True)
99
+ class _LdBase(CopyOp):
100
+ repeat: Repetition = Repetition.x1
101
+ pack: Pack = Pack.NONE
102
+
103
+ admissible_archs = [
104
+ "sm_100a",
105
+ "sm_100f",
106
+ ]
107
+
108
+ def __post_init__(self) -> None:
109
+ # Arch verification
110
+ arch = CuTeDSL._get_dsl().envar.arch
111
+ if arch not in self.admissible_archs:
112
+ raise OpError(
113
+ self,
114
+ f"expects arch to be one of {self.admissible_archs}, but got {arch}",
115
+ suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
116
+ )
117
+
118
+ if not isinstance(self.repeat, Repetition):
119
+ raise OpError(
120
+ self,
121
+ "expects the 'repeat' Op parameter to be a tcgen05.Repetition instance",
122
+ )
123
+ if not isinstance(self.pack, Pack):
124
+ raise OpError(
125
+ self,
126
+ "expects the 'pack' Op parameter to be a tcgen05.Pack instance",
127
+ )
128
+
129
+ def __str__(self) -> str:
130
+ res = (
131
+ f"tcgen05 {self.__class__.__name__[:-2]} Copy Operation"
132
+ + f"\n number of repetitions = {self.repeat.value}"
133
+ )
134
+ if self.pack == Pack.PACK_16b_IN_32b:
135
+ res += f"\n with 2x 16-bit to 32b packing"
136
+ return res
137
+
138
+
139
+ @dataclass(frozen=True)
140
+ class Ld16x64bOp(_LdBase):
141
+ """
142
+ 16x64b TMEM load Operation.
143
+
144
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
145
+ This Operation corresponds to the ``.16x64b`` qualifier.
146
+ """
147
+
148
+ def _make_trait(
149
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
150
+ ) -> "Ld16x64bTrait":
151
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get(
152
+ copy_internal_type.mlir_type,
153
+ 16,
154
+ 64,
155
+ self.repeat.value,
156
+ ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None,
157
+ )
158
+ return Ld16x64bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
159
+
160
+
161
+ class Ld16x64bTrait(Trait):
162
+ pass
163
+
164
+
165
+ @dataclass(frozen=True)
166
+ class Ld16x128bOp(_LdBase):
167
+ """
168
+ 16x128b TMEM load Operation.
169
+
170
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
171
+ This Operation corresponds to the ``.16x128b`` qualifier.
172
+ """
173
+
174
+ def __post_init__(self) -> None:
175
+ super().__post_init__()
176
+ if self.repeat == Repetition.x128:
177
+ raise OpError(
178
+ self,
179
+ "x128 repetition is not supported",
180
+ suggestion="choose one of x1, x2, x4, x8, x16, x32, x64",
181
+ )
182
+
183
+ def _make_trait(
184
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
185
+ ) -> "Ld16x128bTrait":
186
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get(
187
+ copy_internal_type.mlir_type,
188
+ 16,
189
+ 128,
190
+ self.repeat.value,
191
+ ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None,
192
+ )
193
+ return Ld16x128bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
194
+
195
+
196
+ class Ld16x128bTrait(Trait):
197
+ pass
198
+
199
+
200
+ @dataclass(frozen=True)
201
+ class Ld16x256bOp(_LdBase):
202
+ """
203
+ 16x256b TMEM load Operation.
204
+
205
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
206
+ This Operation corresponds to the ``.16x256b`` qualifier.
207
+ """
208
+
209
+ def __post_init__(self) -> None:
210
+ super().__post_init__()
211
+ if self.repeat in (Repetition.x128, Repetition.x64):
212
+ raise OpError(
213
+ self,
214
+ "x64 and x128 repetition is not supported",
215
+ suggestion="choose one of x1, x2, x4, x8, x16, x32",
216
+ )
217
+
218
+ def _make_trait(
219
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
220
+ ) -> "Ld16x256bTrait":
221
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get(
222
+ copy_internal_type.mlir_type,
223
+ 16,
224
+ 256,
225
+ self.repeat.value,
226
+ ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None,
227
+ )
228
+ return Ld16x256bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
229
+
230
+
231
+ class Ld16x256bTrait(Trait):
232
+ pass
233
+
234
+
235
+ @dataclass(frozen=True)
236
+ class Ld16x32bx2Op(_LdBase):
237
+ """
238
+ 16x32bx2 TMEM load Operation.
239
+
240
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
241
+ This Operation corresponds to the ``.16x32bx2`` qualifier.
242
+ """
243
+
244
+ def _make_trait(
245
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
246
+ ) -> "Ld16x32bx2Trait":
247
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get(
248
+ copy_internal_type.mlir_type,
249
+ 16,
250
+ 32,
251
+ self.repeat.value,
252
+ ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None,
253
+ )
254
+ return Ld16x32bx2Trait(_cute_ir.atom(ty, loc=loc, ip=ip))
255
+
256
+
257
+ class Ld16x32bx2Trait(Trait):
258
+ pass
259
+
260
+
261
+ @dataclass(frozen=True)
262
+ class Ld32x32bOp(_LdBase):
263
+ """
264
+ 32x32b TMEM load Operation.
265
+
266
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
267
+ This Operation corresponds to the ``.32x32`` qualifier.
268
+ """
269
+
270
+ def _make_trait(
271
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
272
+ ) -> "Ld32x32bTrait":
273
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get(
274
+ copy_internal_type.mlir_type,
275
+ 32,
276
+ 32,
277
+ self.repeat.value,
278
+ ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None,
279
+ )
280
+ return Ld32x32bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
281
+
282
+
283
+ class Ld32x32bTrait(Trait):
284
+ pass
285
+
286
+
287
+ @dataclass(frozen=True)
288
+ class _StBase(CopyOp):
289
+ repeat: Repetition
290
+ unpack: Unpack = Unpack.NONE
291
+
292
+ admissible_archs = [
293
+ "sm_100a",
294
+ "sm_100f",
295
+ ]
296
+
297
+ def __post_init__(self) -> None:
298
+ # Arch verification
299
+ arch = CuTeDSL._get_dsl().envar.arch
300
+ if arch not in self.admissible_archs:
301
+ raise OpError(
302
+ self,
303
+ f"expects arch to be one of {self.admissible_archs}, but got {arch}",
304
+ suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
305
+ )
306
+
307
+ if not isinstance(self.repeat, Repetition):
308
+ raise OpError(
309
+ self,
310
+ "expects the 'repeat' Op parameter to be a tcgen05.Repetition instance",
311
+ )
312
+ if not isinstance(self.unpack, Unpack):
313
+ raise OpError(
314
+ self,
315
+ "expects the 'pack' Op parameter to be a tcgen05.Unpack instance",
316
+ )
317
+
318
+ def __str__(self) -> str:
319
+ res = (
320
+ f"tcgen05 {self.__class__.__name__[:-2]} Copy Operation"
321
+ + f"\n number of repetitions = {self.repeat.value}"
322
+ )
323
+ if self.unpack == Unpack.UNPACK_32b_IN_16b:
324
+ res += f"\n with 32-bit to 2x 16b unpacking"
325
+ return res
326
+
327
+
328
+ @dataclass(frozen=True)
329
+ class St16x64bOp(_StBase):
330
+ """
331
+ 16x64b TMEM store Operation.
332
+
333
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.
334
+ This Operation corresponds to the ``.16x64`` qualifier.
335
+ """
336
+
337
+ def _make_trait(
338
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
339
+ ) -> "St16x64bTrait":
340
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get(
341
+ copy_internal_type.mlir_type,
342
+ 16,
343
+ 64,
344
+ self.repeat.value,
345
+ ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None,
346
+ )
347
+ return St16x64bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
348
+
349
+
350
+ class St16x64bTrait(Trait):
351
+ pass
352
+
353
+
354
+ @dataclass(frozen=True)
355
+ class St16x128bOp(_StBase):
356
+ """
357
+ 16x128b TMEM store Operation.
358
+
359
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.
360
+ This Operation corresponds to the ``.16x128`` qualifier.
361
+ """
362
+
363
+ def __post_init__(self) -> None:
364
+ super().__post_init__()
365
+ if self.repeat == Repetition.x128:
366
+ raise OpError(
367
+ self,
368
+ "x128 repetition is not supported",
369
+ suggestion="choose one of x1, x2, x4, x8, x16, x32, x64",
370
+ )
371
+
372
+ def _make_trait(
373
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
374
+ ) -> "St16x128bTrait":
375
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get(
376
+ copy_internal_type.mlir_type,
377
+ 16,
378
+ 128,
379
+ self.repeat.value,
380
+ ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None,
381
+ )
382
+ return St16x128bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
383
+
384
+
385
+ class St16x128bTrait(Trait):
386
+ pass
387
+
388
+
389
+ @dataclass(frozen=True)
390
+ class St16x256bOp(_StBase):
391
+ """
392
+ 16x256b TMEM store Operation.
393
+
394
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.
395
+ This Operation corresponds to the ``.16x256`` qualifier.
396
+ """
397
+
398
+ def __post_init__(self) -> None:
399
+ super().__post_init__()
400
+ if self.repeat in (Repetition.x128, Repetition.x64):
401
+ raise OpError(
402
+ self,
403
+ "x64 and x128 repetition is not supported",
404
+ suggestion="choose one of x1, x2, x4, x8, x16, x32",
405
+ )
406
+
407
+ def _make_trait(
408
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
409
+ ) -> "St16x256bTrait":
410
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get(
411
+ copy_internal_type.mlir_type,
412
+ 16,
413
+ 256,
414
+ self.repeat.value,
415
+ ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None,
416
+ )
417
+ return St16x256bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
418
+
419
+
420
+ class St16x256bTrait(Trait):
421
+ pass
422
+
423
+
424
+ @dataclass(frozen=True)
425
+ class St16x32bx2Op(_StBase):
426
+ """
427
+ 16x32x2b TMEM store Operation.
428
+
429
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.
430
+ This Operation corresponds to the ``.16x32x2`` qualifier.
431
+ """
432
+
433
+ def _make_trait(
434
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
435
+ ) -> "St16x32bx2Trait":
436
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get(
437
+ copy_internal_type.mlir_type,
438
+ 16,
439
+ 32,
440
+ self.repeat.value,
441
+ ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None,
442
+ )
443
+ return St16x32bx2Trait(_cute_ir.atom(ty, loc=loc, ip=ip))
444
+
445
+
446
+ class St16x32bx2Trait(Trait):
447
+ pass
448
+
449
+
450
+ @dataclass(frozen=True)
451
+ class St32x32bOp(_StBase):
452
+ """
453
+ 32x32b TMEM store Operation.
454
+
455
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.
456
+ This Operation corresponds to the ``.32x32`` qualifier.
457
+ """
458
+
459
+ def _make_trait(
460
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
461
+ ) -> "St32x32bTrait":
462
+ ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get(
463
+ copy_internal_type.mlir_type,
464
+ 32,
465
+ 32,
466
+ self.repeat.value,
467
+ ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None,
468
+ )
469
+ return St32x32bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
470
+
471
+
472
+ class St32x32bTrait(Trait):
473
+ pass
474
+
475
+
476
+ @dataclass(frozen=True)
477
+ class _S2TCopyBase(CopyOp):
478
+ cta_group: CtaGroup
479
+
480
+ admissible_archs = [
481
+ "sm_100a",
482
+ "sm_100f",
483
+ ]
484
+
485
+ def __post_init__(self) -> None:
486
+ # Arch verification
487
+ arch = CuTeDSL._get_dsl().envar.arch
488
+ if arch not in self.admissible_archs:
489
+ raise OpError(
490
+ self,
491
+ f"expects arch to be one of {self.admissible_archs}, but got {arch}",
492
+ suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
493
+ )
494
+ # Verify that the user provided enum values
495
+ if not isinstance(self.cta_group, CtaGroup):
496
+ raise OpError(
497
+ self,
498
+ "expects the 'cta_group' Op parameter to be a tcgen05.CtaGroup instance",
499
+ )
500
+
501
+ def __str__(self) -> str:
502
+ res = (
503
+ f"tcgen05 {self.__class__.__name__[:-2]} Copy Operation"
504
+ + f"\n CTA group = {self.cta_group}"
505
+ )
506
+
507
+ return res
508
+
509
+
510
+ @dataclass(frozen=True)
511
+ class Cp128x256bOp(_S2TCopyBase):
512
+ """
513
+ 128x256b SMEM to TMEM Copy Operation.
514
+
515
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=tcgen05#tcgen05-instructions-tcgen05-cp>`__.
516
+ This Operation corresponds to the ``.128x256b`` qualifier.
517
+ """
518
+
519
+ def _make_trait(
520
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
521
+ ) -> "Cp128x256bTrait":
522
+ ty = _cute_nvgpu_ir.CopyAtomSM100CopyS2TType.get(
523
+ copy_internal_type.mlir_type,
524
+ 128,
525
+ 256,
526
+ self.cta_group.value,
527
+ _cute_nvgpu_ir.CopyS2TBroadcast.none,
528
+ )
529
+ return Cp128x256bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
530
+
531
+
532
+ class Cp128x256bTrait(Trait):
533
+ pass
534
+
535
+
536
+ @dataclass(frozen=True)
537
+ class Cp128x128bOp(_S2TCopyBase):
538
+ """
539
+ 128x128b SMEM to TMEM Copy Operation.
540
+
541
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=tcgen05#tcgen05-instructions-tcgen05-cp>`__.
542
+ This Operation corresponds to the ``.128x128b`` qualifier.
543
+ """
544
+
545
+ def _make_trait(
546
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
547
+ ) -> "Cp128x128bTrait":
548
+ ty = _cute_nvgpu_ir.CopyAtomSM100CopyS2TType.get(
549
+ copy_internal_type.mlir_type,
550
+ 128,
551
+ 128,
552
+ self.cta_group.value,
553
+ _cute_nvgpu_ir.CopyS2TBroadcast.none,
554
+ )
555
+ return Cp128x128bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
556
+
557
+
558
+ class Cp128x128bTrait(Trait):
559
+ pass
560
+
561
+
562
+ @dataclass(frozen=True)
563
+ class Cp4x256bOp(_S2TCopyBase):
564
+ """
565
+ 4x256b SMEM to TMEM Copy Operation.
566
+
567
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=tcgen05#tcgen05-instructions-tcgen05-cp>`__.
568
+ This Operation corresponds to the ``.4x256b`` qualifier.
569
+ """
570
+
571
+ def _make_trait(
572
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
573
+ ) -> "Cp4x256bTrait":
574
+ ty = _cute_nvgpu_ir.CopyAtomSM100CopyS2TType.get(
575
+ copy_internal_type.mlir_type,
576
+ 4,
577
+ 256,
578
+ self.cta_group.value,
579
+ _cute_nvgpu_ir.CopyS2TBroadcast.none,
580
+ )
581
+ return Cp4x256bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
582
+
583
+
584
+ class Cp4x256bTrait(Trait):
585
+ pass
586
+
587
+
588
+ @dataclass(frozen=True)
589
+ class Cp4x32x128bOp(_S2TCopyBase):
590
+ """
591
+ 32x128b SMEM to TMEM Copy Operation.
592
+
593
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=tcgen05#tcgen05-instructions-tcgen05-cp>`__.
594
+ This Operation corresponds to the ``.32x128b`` qualifier with ``warpx4`` broadcast qualifier enabled.
595
+ """
596
+
597
+ def _make_trait(
598
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
599
+ ) -> "Cp4x32x128bTrait":
600
+ ty = _cute_nvgpu_ir.CopyAtomSM100CopyS2TType.get(
601
+ copy_internal_type.mlir_type,
602
+ 32,
603
+ 128,
604
+ self.cta_group.value,
605
+ _cute_nvgpu_ir.CopyS2TBroadcast.x4,
606
+ )
607
+ return Cp4x32x128bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
608
+
609
+
610
+ class Cp4x32x128bTrait(Trait):
611
+ pass
612
+
613
+
614
+ @dataclass(frozen=True)
615
+ class Cp2x64x128b0213Op(_S2TCopyBase):
616
+ """
617
+ 64x128b SMEM to TMEM Copy Operation.
618
+
619
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=tcgen05#tcgen05-instructions-tcgen05-cp>`__.
620
+ This Operation corresponds to the ``.64x128b`` qualifier with ``.warpx2::02_13`` broadcast qualifier enabled.
621
+ """
622
+
623
+ def _make_trait(
624
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
625
+ ) -> "Cp2x64x128b0213Trait":
626
+ ty = _cute_nvgpu_ir.CopyAtomSM100CopyS2TType.get(
627
+ copy_internal_type.mlir_type,
628
+ 64,
629
+ 128,
630
+ self.cta_group.value,
631
+ _cute_nvgpu_ir.CopyS2TBroadcast.lw_0213,
632
+ )
633
+ return Cp2x64x128b0213Trait(_cute_ir.atom(ty, loc=loc, ip=ip))
634
+
635
+
636
+ class Cp2x64x128b0213Trait(Trait):
637
+ pass
638
+
639
+
640
+ @dataclass(frozen=True)
641
+ class Cp2x64x128b0123Op(_S2TCopyBase):
642
+ """
643
+ 64x128b SMEM to TMEM Copy Operation.
644
+
645
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=tcgen05#tcgen05-instructions-tcgen05-cp>`__.
646
+ This Operation corresponds to the ``.64x128b`` qualifier with ``.warpx2::01_23`` broadcast qualifier enabled.
647
+ """
648
+
649
+ def _make_trait(
650
+ self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
651
+ ) -> "Cp2x64x128b0123Trait":
652
+ ty = _cute_nvgpu_ir.CopyAtomSM100CopyS2TType.get(
653
+ copy_internal_type.mlir_type,
654
+ 64,
655
+ 128,
656
+ self.cta_group.value,
657
+ _cute_nvgpu_ir.CopyS2TBroadcast.lw_0123,
658
+ )
659
+ return Cp2x64x128b0123Trait(_cute_ir.atom(ty, loc=loc, ip=ip))
660
+
661
+
662
+ class Cp2x64x128b0123Trait(Trait):
663
+ pass
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from typing import overload, Type, Tuple, Union
13
+
14
+ from cutlass.cutlass_dsl import dsl_user_op
15
+
16
+ import cutlass._mlir.dialects.cute as _cute_ir
17
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
18
+ from cutlass._mlir.dialects import nvvm
19
+
20
+ from ...typing import (
21
+ Shape,
22
+ IntTuple,
23
+ Layout,
24
+ Tensor,
25
+ Int,
26
+ Numeric,
27
+ NumericMeta,
28
+ Int16,
29
+ Int32,
30
+ )
31
+ from ... import core
32
+ from .mma import SmemLayoutAtomKind, CtaGroup
33
+ from .copy import (
34
+ Pack,
35
+ Unpack,
36
+ Ld16x64bOp,
37
+ Ld16x128bOp,
38
+ Ld16x256bOp,
39
+ Ld16x32bx2Op,
40
+ Ld32x32bOp,
41
+ St16x64bOp,
42
+ St16x128bOp,
43
+ St16x256bOp,
44
+ St16x32bx2Op,
45
+ St32x32bOp,
46
+ )
47
+
48
+
49
+ ####################################################################################################
50
+ #
51
+ # Helper functions for MMA
52
+ #
53
+ ####################################################################################################
54
+
55
+
56
+ @dsl_user_op
57
+ def make_smem_layout_atom(
58
+ kind: SmemLayoutAtomKind, element_type: Type[Numeric], *, loc=None, ip=None
59
+ ) -> core.ComposedLayout:
60
+ """
61
+ Makes a SMEM layout Atom.
62
+
63
+ This function creates a composed layout in unit of elements consistent with the requested layout
64
+ Atom kind and element data type.
65
+
66
+ :param kind: The kind of layout Atom
67
+ :type kind: SmemLayoutAtomKind
68
+ :param element_type: The element data type to construct the layout for
69
+ :type element_type: Type[Numeric]
70
+ :return: The SMEM layout atom
71
+ :rtype: core.ComposedLayout
72
+ """
73
+ if not isinstance(element_type, NumericMeta):
74
+ raise TypeError(f"element_type must be a Numeric, but got {element_type}")
75
+
76
+ if kind in (SmemLayoutAtomKind.MN_INTER, SmemLayoutAtomKind.K_INTER):
77
+ num_contiguous_bits = 128
78
+ sw = core.make_swizzle(0, 4, 3)
79
+ elif kind in (SmemLayoutAtomKind.MN_SW32, SmemLayoutAtomKind.K_SW32):
80
+ num_contiguous_bits = 256
81
+ sw = core.make_swizzle(1, 4, 3)
82
+ elif kind in (SmemLayoutAtomKind.MN_SW64, SmemLayoutAtomKind.K_SW64):
83
+ num_contiguous_bits = 512
84
+ sw = core.make_swizzle(2, 4, 3)
85
+ elif kind in (SmemLayoutAtomKind.MN_SW128, SmemLayoutAtomKind.K_SW128):
86
+ num_contiguous_bits = 1024
87
+ sw = core.make_swizzle(3, 4, 3)
88
+ elif kind == SmemLayoutAtomKind.MN_SW128_32B:
89
+ num_contiguous_bits = 1024
90
+ sw = core.make_swizzle(2, 5, 2)
91
+ else:
92
+ raise ValueError("unrecognized SMEM layout atom kind")
93
+ num_contiguous_elems = num_contiguous_bits // element_type.width
94
+
95
+ if kind in (
96
+ SmemLayoutAtomKind.MN_INTER,
97
+ SmemLayoutAtomKind.MN_SW32,
98
+ SmemLayoutAtomKind.MN_SW64,
99
+ SmemLayoutAtomKind.MN_SW128,
100
+ SmemLayoutAtomKind.MN_SW128_32B,
101
+ ):
102
+ # M/N-major layout
103
+ return core.make_composed_layout(
104
+ sw,
105
+ 0,
106
+ core.make_layout(
107
+ (num_contiguous_elems, 8), stride=(1, num_contiguous_elems)
108
+ ),
109
+ loc=loc,
110
+ ip=ip,
111
+ )
112
+ else:
113
+ # K-major layout
114
+ return core.make_composed_layout(
115
+ sw,
116
+ 0,
117
+ core.make_layout(
118
+ (8, num_contiguous_elems), stride=(num_contiguous_elems, 1)
119
+ ),
120
+ loc=loc,
121
+ ip=ip,
122
+ )
123
+
124
+
125
+ @overload
126
+ def tile_to_mma_shape(
127
+ atom: Layout, mma_tile_shape: Shape, order: IntTuple = None, *, loc=None, ip=None
128
+ ) -> Layout: ...
129
+
130
+
131
+ @overload
132
+ def tile_to_mma_shape(
133
+ atom: core.ComposedLayout,
134
+ mma_tile_shape: Shape,
135
+ order: IntTuple = None,
136
+ *,
137
+ loc=None,
138
+ ip=None,
139
+ ) -> core.ComposedLayout: ...
140
+
141
+
142
+ @dsl_user_op
143
+ def tile_to_mma_shape(
144
+ atom, mma_tile_shape: Shape, order: IntTuple = None, *, loc=None, ip=None
145
+ ):
146
+ """
147
+ Tiles a layout to an MMA shape.
148
+ """
149
+ # Default order is colexicographical
150
+ if order is None:
151
+ order = tuple(range(core.rank(mma_tile_shape) - 1))
152
+ if core.rank(order) != core.rank(mma_tile_shape) - 1:
153
+ raise ValueError(
154
+ f"rank(order)={core.rank(order)} must be equal to "
155
+ f"rank(mma_tile_shape)-1={core.rank(mma_tile_shape)-1}"
156
+ )
157
+ order_val = core._pack_int_tuple(order, loc=loc, ip=ip)
158
+ mma_tile_shape_val = core._pack_shape(mma_tile_shape, loc=loc, ip=ip)
159
+
160
+ if not (
161
+ core.is_static(atom)
162
+ and core.is_static(mma_tile_shape_val)
163
+ and core.is_static(order_val)
164
+ ):
165
+ raise ValueError("tile_to_mma_shape only supports static inputs")
166
+
167
+ res_ty = _cute_nvgpu_ir.tile_to_mma_shape(atom, mma_tile_shape_val, order_val)
168
+ return _cute_ir.static(res_ty, loc=loc, ip=ip)
169
+
170
+
171
+ @dsl_user_op
172
+ def commit(
173
+ mbar_ptr: core.Pointer,
174
+ mask=None,
175
+ cta_group: CtaGroup = CtaGroup.ONE,
176
+ *,
177
+ loc=None,
178
+ ip=None,
179
+ ) -> None:
180
+ """
181
+ Perform an arrive operation on a mbarrier upon completion of previous MMA operations.
182
+
183
+ :param mbar_ptr: A pointer to the mbarrier in SMEM
184
+ :type mbar_ptr: Pointer
185
+ :param mask: An optional multicast mask for the CTAs in the cluster to signal arrival to
186
+ :type mask: Int
187
+ """
188
+ if cta_group == CtaGroup.ONE:
189
+ group = nvvm.Tcgen05GroupKind.CTA_1
190
+ else:
191
+ assert cta_group == CtaGroup.TWO
192
+ group = nvvm.Tcgen05GroupKind.CTA_2
193
+
194
+ mbar_ptr = mbar_ptr.llvm_ptr
195
+ if mask is not None:
196
+ mask = Int16(mask).ir_value(loc=loc, ip=ip)
197
+ nvvm.tcgen05_commit_arrive(
198
+ mbar_ptr, multicast_mask=mask, group=group, loc=loc, ip=ip
199
+ )
200
+ else:
201
+ nvvm.tcgen05_commit_arrive(mbar_ptr, group=group, loc=loc, ip=ip)
202
+ return
203
+
204
+
205
+ ####################################################################################################
206
+ #
207
+ # Helper functions for Copies
208
+ #
209
+ ####################################################################################################
210
+
211
+
212
+ def is_tmem_load(atom: core.CopyAtom) -> bool:
213
+ """
214
+ Returns whether a CopyAtom instance is a TMEM load.
215
+ """
216
+ return isinstance(
217
+ atom.op,
218
+ (
219
+ Ld16x64bOp,
220
+ Ld16x128bOp,
221
+ Ld16x256bOp,
222
+ Ld16x32bx2Op,
223
+ Ld32x32bOp,
224
+ ),
225
+ )
226
+
227
+
228
+ def is_tmem_store(atom: core.CopyAtom) -> bool:
229
+ """
230
+ Returns whether a CopyAtom instance is a TMEM store.
231
+ """
232
+ return isinstance(
233
+ atom.op,
234
+ (
235
+ St16x64bOp,
236
+ St16x128bOp,
237
+ St16x256bOp,
238
+ St16x32bx2Op,
239
+ St32x32bOp,
240
+ ),
241
+ )
242
+
243
+
244
+ def get_tmem_copy_properties(
245
+ atom: core.CopyAtom,
246
+ ) -> Tuple[int, int, int, Union[Pack, Unpack]]:
247
+ """
248
+ Returns the properties of a TMEM copy atom (number of data paths, bits, repetitions,
249
+ and whether packing/unpacking is used).
250
+ """
251
+ if isinstance(atom.op, (Ld16x64bOp, St16x64bOp)):
252
+ num_dp, num_bits = 16, 64
253
+ elif isinstance(atom.op, (Ld16x128bOp, St16x128bOp)):
254
+ num_dp, num_bits = 16, 128
255
+ elif isinstance(atom.op, (Ld16x256bOp, St16x256bOp)):
256
+ num_dp, num_bits = 16, 256
257
+ elif isinstance(atom.op, (Ld16x32bx2Op, St16x32bx2Op)):
258
+ num_dp, num_bits = 16, 32
259
+ elif isinstance(atom.op, (Ld32x32bOp, St32x32bOp)):
260
+ num_dp, num_bits = 32, 32
261
+ else:
262
+ raise ValueError(f"expects 'atom' to be a TMEM copy, but got {atom}")
263
+ if is_tmem_load(atom):
264
+ return num_dp, num_bits, atom.op.repeat.value, atom.op.pack
265
+ else:
266
+ assert is_tmem_store(atom), "atom must be a TMEM store"
267
+ return num_dp, num_bits, atom.op.repeat.value, atom.op.unpack
268
+
269
+
270
+ @dsl_user_op
271
+ def find_tmem_tensor_col_offset(tmem_tensor: Tensor, *, loc=None, ip=None) -> Int:
272
+ """
273
+ Computes the TMEM column offset given a TMEM tensor.
274
+
275
+ :param tmem_tensor: The TMEM tensor to use to compute the columns offset
276
+ :type tmem_tensor: Tensor
277
+ :return: The columns offset
278
+ :rtype: Int
279
+ """
280
+ tmem_col_mask = 0x0000FFFF
281
+ offset = (
282
+ core.cosize(core.recast_tensor(tmem_tensor, Int32).layout, loc=loc, ip=ip)
283
+ & tmem_col_mask
284
+ )
285
+ if isinstance(offset, int):
286
+ return offset
287
+ return Int32(offset, loc=loc, ip=ip)
288
+
289
+
290
+ @dsl_user_op
291
+ def make_tmem_copy(
292
+ atom: core.CopyAtom, tmem_tensor: Tensor, *, loc=None, ip=None
293
+ ) -> core.TiledCopy:
294
+ """
295
+ Makes a Tiled Copy instance from a TMEM Copy Atom and a TMEM tensor.
296
+ """
297
+ tiled_copy_val = _cute_nvgpu_ir.atom_make_tmem_copy(
298
+ atom._trait.value, tmem_tensor.value, loc=loc, ip=ip
299
+ )
300
+ new_trait = type(atom._trait)(tiled_copy_val)
301
+ return core.TiledCopy(atom.op, new_trait)
302
+
303
+
304
+ @dsl_user_op
305
+ def make_s2t_copy(
306
+ atom: core.CopyAtom, tmem_tensor: Tensor, *, loc=None, ip=None
307
+ ) -> core.TiledCopy:
308
+ """
309
+ Makes a Tiled Copy instance from a TMEM Copy Atom and a TMEM tensor.
310
+ """
311
+ tiled_copy_val = _cute_nvgpu_ir.atom_make_s2t_copy(
312
+ atom._trait.value, tmem_tensor.value, loc=loc, ip=ip
313
+ )
314
+ new_trait = type(atom._trait)(tiled_copy_val)
315
+ return core.TiledCopy(atom.op, new_trait)
316
+
317
+
318
+ @dsl_user_op
319
+ def get_s2t_smem_desc_tensor(
320
+ atom: core.CopyAtom, smem_tensor: Tensor, *, loc=None, ip=None
321
+ ) -> Tensor:
322
+ """
323
+ Returns the SMEM descriptor tensor from a S2T copy atom and a SMEM tensor.
324
+ """
325
+ smem_desc_tensor = _cute_nvgpu_ir.atom_get_copy_s2t_smem_desc_view(
326
+ atom._trait.value, smem_tensor.value, loc=loc, ip=ip
327
+ )
328
+ return smem_desc_tensor
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py ADDED
@@ -0,0 +1,1041 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import enum
13
+ from dataclasses import dataclass
14
+ from typing import Type
15
+
16
+ from cutlass.cutlass_dsl import CuTeDSL, T
17
+
18
+ import cutlass._mlir.dialects.cute as _cute_ir
19
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
20
+ from cutlass._mlir import ir
21
+
22
+ from ..common import OpError
23
+ from ... import core
24
+ from ...core import Trait, _pack_shape, rank, depth, _Tensor
25
+ from ...typing import (
26
+ Shape,
27
+ Float4E2M1FN,
28
+ Float8E8M0FNU,
29
+ Float8E5M2,
30
+ Float8E4M3FN,
31
+ Float16,
32
+ BFloat16,
33
+ Float32,
34
+ TFloat32,
35
+ Boolean,
36
+ Int8,
37
+ Uint8,
38
+ Int32,
39
+ Numeric,
40
+ AddressSpace,
41
+ Pointer,
42
+ )
43
+
44
+
45
+ ####################################################################################################
46
+ #
47
+ # MMA Ops and Traits
48
+ #
49
+ ####################################################################################################
50
+
51
+
52
+ class OperandMajorMode(enum.Enum):
53
+ """
54
+ An enumeration for the majorness of the input operands of the MMA.
55
+ """
56
+
57
+ MN = _cute_ir.MajorMode.mn
58
+ K = _cute_ir.MajorMode.k
59
+
60
+ def __str__(self) -> str:
61
+ return f"{self.__class__.__name__}.{self.name}"
62
+
63
+ def __repr__(self) -> str:
64
+ return f"<{self.__class__.__name__}.{self.name}>"
65
+
66
+ @classmethod
67
+ def _missing_(cls, value):
68
+ if isinstance(value, str):
69
+ value = value.upper()
70
+ if value == "MN":
71
+ return OperandMajorMode.MN
72
+ elif value == "K":
73
+ return OperandMajorMode.K
74
+
75
+ def _to_ir(self) -> _cute_ir.MajorMode:
76
+ return self.value
77
+
78
+
79
+ class OperandSource(enum.Enum):
80
+ """
81
+ An enumeration for the source memory location of the A input operand of the MMA.
82
+ """
83
+
84
+ TMEM = _cute_ir.MmaFragKind.tmem
85
+ SMEM = _cute_ir.MmaFragKind.smem_desc
86
+
87
+ def __str__(self) -> str:
88
+ return f"{self.__class__.__name__}.{self.name}"
89
+
90
+ def __repr__(self) -> str:
91
+ return f"<{self.__class__.__name__}.{self.name}>"
92
+
93
+ def _to_ir(self) -> _cute_ir.MmaFragKind:
94
+ return self.value
95
+
96
+
97
+ class CtaGroup(enum.Enum):
98
+ """
99
+ An enumeration for the ``cta_group`` qualifier of the MMA.
100
+ """
101
+
102
+ ONE = 1
103
+ TWO = 2
104
+
105
+ def __str__(self) -> str:
106
+ return f"{self.__class__.__name__}.{self.name}"
107
+
108
+ def __repr__(self) -> str:
109
+ return f"<{self.__class__.__name__}.{self.name}>"
110
+
111
+ class Field(enum.Enum):
112
+ """
113
+ An enumeration for the fields of the MMA Atom that can be modified at runtime.
114
+ """
115
+
116
+ NEGATE_A = "neg_a"
117
+ NEGATE_B = "neg_b"
118
+ ACCUMULATE = "accum_c"
119
+ SFA = "sf_a"
120
+ SFB = "sf_b"
121
+
122
+ def __str__(self) -> str:
123
+ return f"{self.__class__.__name__}.{self.name}"
124
+
125
+ def __repr__(self) -> str:
126
+ return f"<{self.__class__.__name__}.{self.name}>"
127
+
128
+ def _to_ir_field_name(self) -> str:
129
+ return self.value
130
+
131
+
132
+ # Base class for all tcgen05 MMA Ops with syntax `tcgen05.mma.cta_group.kind` used to factor out some internal code
133
+ @dataclass(frozen=True)
134
+ class MmaOp(core.MmaOp):
135
+ a_dtype: Type[Numeric]
136
+ b_dtype: Type[Numeric]
137
+ acc_dtype: Type[Numeric]
138
+ shape_mnk: Shape
139
+ cta_group: CtaGroup
140
+ a_src: OperandSource
141
+ a_major_mode: OperandMajorMode
142
+ b_major_mode: OperandMajorMode
143
+
144
+ admissible_archs = [
145
+ "sm_100a",
146
+ "sm_100f",
147
+ ]
148
+
149
+ def __post_init__(self) -> None:
150
+ # Verify arch
151
+ arch = CuTeDSL._get_dsl().envar.arch
152
+ if arch not in self.admissible_archs:
153
+ raise OpError(
154
+ self,
155
+ f"expects arch to be one of {self.admissible_archs}, but got {arch}",
156
+ suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
157
+ )
158
+ # Verify that the user provided enum values
159
+ if not isinstance(self.cta_group, CtaGroup):
160
+ raise OpError(
161
+ self,
162
+ "expects the 'cta_group' Op parameter to be a tcgen05.CtaGroup instance",
163
+ )
164
+ if not isinstance(self.a_src, OperandSource):
165
+ raise OpError(
166
+ self,
167
+ "expects the 'a_src' Op parameter to be a tcgen05.OperandSource instance",
168
+ )
169
+ if not isinstance(self.a_major_mode, OperandMajorMode):
170
+ raise OpError(
171
+ self,
172
+ "expects the 'a_major_mode' Op parameter to be a tcgen05.OperandMajorMode instance",
173
+ )
174
+ if not isinstance(self.b_major_mode, OperandMajorMode):
175
+ raise OpError(
176
+ self,
177
+ "expects the 'b_major_mode' Op parameter to be a tcgen05.OperandMajorMode instance",
178
+ )
179
+ # Verify the instruction shape
180
+ if (rank(self.shape_mnk) not in [2, 3]) or (depth(self.shape_mnk) != 1):
181
+ raise OpError(
182
+ self,
183
+ f"expected a flat rank 2 or 3 tuple for the 'shape_mnk' Op parameter, "
184
+ f"but got {self.shape_mnk}",
185
+ )
186
+ m, n = self.shape_mnk[0], self.shape_mnk[1]
187
+ if self.cta_group == CtaGroup.ONE:
188
+ if m not in [64, 128]:
189
+ raise OpError(self, f"expects the M-mode to be 64 or 128, but got {m}")
190
+ if m == 64:
191
+ if (n < 8) or (n > 256) or (n % 8 != 0):
192
+ raise OpError(
193
+ self,
194
+ f"expects the N-mode to satisfy 8 <= N <= 256 and N % 8 == 0, but got {n}",
195
+ )
196
+ elif m == 128:
197
+ if (n < 16) or (n > 256) or (n % 16 != 0):
198
+ raise OpError(
199
+ self,
200
+ f"expects the N-mode to satisfy 8 <= N <= 256 and N % 16 == 0, but got {n}",
201
+ )
202
+ else:
203
+ if m not in [128, 256]:
204
+ raise OpError(self, f"expects the M-mode to be 128 or 256, but got {m}")
205
+ if (n < 32) or (n > 256) or (n % 32 != 0):
206
+ raise OpError(
207
+ self,
208
+ f"expects the N-mode to satisfy 32 <= N <= 256 and N % 32 == 0, but got {n}",
209
+ )
210
+
211
+ def __str__(self) -> str:
212
+ return (
213
+ self.__class__.descriptive_name # type: ignore
214
+ + f"\n A data type = {self.a_dtype}"
215
+ + f"\n B data type = {self.b_dtype}"
216
+ + f"\n Accumulator data type = {self.acc_dtype}"
217
+ + f"\n CTA group = {self.cta_group}"
218
+ + f"\n A source location = {self.a_src}"
219
+ + f"\n A major mode = {self.a_major_mode}"
220
+ + f"\n B major mode = {self.b_major_mode}"
221
+ + f"\n Instruction shape MNK = {self.shape_mnk}"
222
+ )
223
+
224
+ def _verify_fragment_A(self, input: _Tensor, *, loc=None, ip=None):
225
+ if input.memspace == AddressSpace.smem and isinstance(
226
+ input.layout.type, _cute_ir.ComposedLayoutType
227
+ ):
228
+ raise OpError(
229
+ self,
230
+ f"Expected affine layout for {self._make_trait()}'s operand A, "
231
+ f"but got composed layout instead: {input.layout}"
232
+ f"\nPlease use recast_ptr(ptr, {input.layout.inner}, element_type) operation to move swizzle to the ptr",
233
+ )
234
+ return True
235
+
236
+ def _verify_fragment_B(self, input: _Tensor, *, loc=None, ip=None):
237
+ if input.memspace == AddressSpace.smem and isinstance(
238
+ input.layout.type, _cute_ir.ComposedLayoutType
239
+ ):
240
+ raise OpError(
241
+ self,
242
+ f"Expected affine layout for {self._make_trait()}'s operand B, "
243
+ f"but got composed layout instead: {input.layout}"
244
+ f"\nPlease use recast_ptr(ptr, {input.layout.inner}, element_type) operation to move swizzle to the ptr",
245
+ )
246
+ return True
247
+
248
+
249
+ class MmaTrait(Trait):
250
+ admissible_fields = [Field.ACCUMULATE, Field.NEGATE_A, Field.NEGATE_B]
251
+
252
+ def set(self, field, value, *, loc=None, ip=None) -> None:
253
+ if field not in self.admissible_fields:
254
+ raise ValueError(
255
+ f"expects field to be one of {self.admissible_fields}, but got {field}"
256
+ )
257
+ field_name = f"#cute_nvgpu.atom_mma_field_sm100<{field._to_ir_field_name()}>"
258
+ attr = ir.Attribute.parse(field_name)
259
+ self.value = _cute_nvgpu_ir.atom_set_value(
260
+ self.value, attr, Boolean(value).ir_value(loc=loc, ip=ip), loc=loc, ip=ip
261
+ )
262
+
263
+
264
+ # Base class for all tcgen05 BlockScaled MMA Ops with syntax `tcgen05.mma.cta_group.kind.block_scale` used to factor out some internal code
265
+ @dataclass(frozen=True)
266
+ class BlockScaledMmaOp(core.MmaOp):
267
+ a_dtype: Type[Numeric]
268
+ b_dtype: Type[Numeric]
269
+ acc_dtype: Float32
270
+ sf_dtype: Type[Numeric]
271
+ sf_vec_size: int
272
+ shape_mnk: Shape
273
+ cta_group: CtaGroup
274
+ a_src: OperandSource
275
+ a_major_mode: OperandMajorMode
276
+ b_major_mode: OperandMajorMode
277
+
278
+ admissible_archs = [
279
+ "sm_100a",
280
+ ]
281
+
282
+ def __post_init__(self) -> None:
283
+ # Verify arch
284
+ arch = CuTeDSL._get_dsl().envar.arch
285
+ if arch not in self.admissible_archs:
286
+ raise OpError(
287
+ self,
288
+ f"expects arch to be one of {self.admissible_archs}, but got {arch}",
289
+ suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
290
+ )
291
+ # Verify that the user provided enum values
292
+ if not isinstance(self.cta_group, CtaGroup):
293
+ raise OpError(
294
+ self,
295
+ "expects the 'cta_group' Op parameter to be a tcgen05.CtaGroup instance",
296
+ )
297
+ if not isinstance(self.a_src, OperandSource):
298
+ raise OpError(
299
+ self,
300
+ "expects the 'a_src' Op parameter to be a tcgen05.OperandSource instance",
301
+ )
302
+ if not isinstance(self.a_major_mode, OperandMajorMode):
303
+ raise OpError(
304
+ self,
305
+ "expects the 'a_major_mode' Op parameter to be a tcgen05.OperandMajorMode instance",
306
+ )
307
+ if not isinstance(self.b_major_mode, OperandMajorMode):
308
+ raise OpError(
309
+ self,
310
+ "expects the 'b_major_mode' Op parameter to be a tcgen05.OperandMajorMode instance",
311
+ )
312
+ # Verify the instruction shape
313
+ if (rank(self.shape_mnk) not in [2, 3]) or (depth(self.shape_mnk) != 1):
314
+ raise OpError(
315
+ self,
316
+ f"expected a flat rank 2 or 3 tuple for the 'shape_mnk' Op parameter, "
317
+ f"but got {self.shape_mnk}",
318
+ )
319
+ m, n = self.shape_mnk[0], self.shape_mnk[1]
320
+ if self.cta_group == CtaGroup.ONE:
321
+ if m != 128:
322
+ raise OpError(self, f"expects the M-mode to be 128, but got {m}")
323
+
324
+ if (n < 8) or (n > 256) or (n % 8 != 0):
325
+ raise OpError(
326
+ self,
327
+ f"expects the N-mode to satisfy 8 <= N <= 256 and N % 8 == 0, but got {n}",
328
+ )
329
+ else:
330
+ if m not in [128, 256]:
331
+ raise OpError(self, f"expects the M-mode to be 128 or 256, but got {m}")
332
+ if (n < 16) or (n > 256) or (n % 16 != 0):
333
+ raise OpError(
334
+ self,
335
+ f"expects the N-mode to satisfy 16 <= N <= 256 and N % 16 == 0, but got {n}",
336
+ )
337
+ if self.sf_vec_size not in [16, 32]:
338
+ raise OpError(
339
+ self,
340
+ f"expects the scale factor vector size to be 16 or 32, but got {self.sf_vec_size}",
341
+ )
342
+
343
+ def __str__(self) -> str:
344
+ return (
345
+ self.__class__.descriptive_name # type: ignore
346
+ + f"\n A data type = {self.a_dtype}"
347
+ + f"\n B data type = {self.b_dtype}"
348
+ + f"\n Accumulator data type = {self.acc_dtype}"
349
+ + f"\n Scale factor data type = {self.sf_dtype}"
350
+ + f"\n Scale factor vector size = {self.sf_vec_size}"
351
+ + f"\n CTA group = {self.cta_group}"
352
+ + f"\n A source location = {self.a_src}"
353
+ + f"\n A major mode = {self.a_major_mode}"
354
+ + f"\n B major mode = {self.b_major_mode}"
355
+ + f"\n Instruction shape MNK = {self.shape_mnk}"
356
+ )
357
+
358
+ def _verify_fragment_A(self, input: _Tensor, *, loc=None, ip=None):
359
+ if input.memspace == AddressSpace.smem and isinstance(
360
+ input.layout.type, _cute_ir.ComposedLayoutType
361
+ ):
362
+ raise OpError(
363
+ self,
364
+ f"Expected affine layout for {self._make_trait()}'s operand A, "
365
+ f"but got composed layout instead: {input.layout}"
366
+ f"\nPlease use recast_ptr(ptr, {input.layout.inner}, element_type) operation to move swizzle to the ptr",
367
+ )
368
+ return True
369
+
370
+ def _verify_fragment_B(self, input: _Tensor, *, loc=None, ip=None):
371
+ if input.memspace == AddressSpace.smem and isinstance(
372
+ input.layout.type, _cute_ir.ComposedLayoutType
373
+ ):
374
+ raise OpError(
375
+ self,
376
+ f"Expected affine layout for {self._make_trait()}'s operand B, "
377
+ f"but got composed layout instead: {input.layout}"
378
+ f"\nPlease use recast_ptr(ptr, {input.layout.inner}, element_type) operation to move swizzle to the ptr",
379
+ )
380
+ return True
381
+
382
+
383
+ class BlockScaledMmaTraits(Trait):
384
+ admissible_fields = [
385
+ Field.ACCUMULATE,
386
+ Field.NEGATE_A,
387
+ Field.NEGATE_B,
388
+ Field.SFA,
389
+ Field.SFB,
390
+ ]
391
+
392
+ def set(self, field, value, *, loc=None, ip=None) -> None:
393
+ if field not in self.admissible_fields:
394
+ raise ValueError(
395
+ f"expects field to be one of {self.admissible_fields}, but got {field}"
396
+ )
397
+ if field in [Field.ACCUMULATE, Field.NEGATE_A, Field.NEGATE_B]:
398
+ value = Boolean(value).ir_value(loc=loc, ip=ip)
399
+ elif field in [Field.SFA, Field.SFB]:
400
+ if not isinstance(value, Pointer):
401
+ raise ValueError(
402
+ f"expects value to be a pointer for {field}, but got {type(value).__name__}"
403
+ )
404
+ value = value.value
405
+
406
+ field_name = f"#cute_nvgpu.atom_mma_field_sm100_block_scaled<{field._to_ir_field_name()}>"
407
+ attr = ir.Attribute.parse(field_name)
408
+ self.value = _cute_nvgpu_ir.atom_set_value(
409
+ self.value, attr, value, loc=loc, ip=ip
410
+ )
411
+
412
+
413
+ #
414
+ # TF32 MMA
415
+ #
416
+
417
+
418
+ @dataclass(frozen=True)
419
+ class MmaTF32Op(MmaOp):
420
+ """
421
+ TF32 tcgen05 MMA Operation.
422
+
423
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma>`__.
424
+ This Operation corresponds to the ``.kind::tf32`` qualifier.
425
+ """
426
+
427
+ descriptive_name = "tcgen05 TF32 MMA Operation"
428
+
429
+ def __init__(
430
+ self,
431
+ instruction_shape: Shape,
432
+ cta_group: CtaGroup,
433
+ a_src: OperandSource,
434
+ a_major_mode: OperandMajorMode,
435
+ b_major_mode: OperandMajorMode,
436
+ ) -> None:
437
+ super().__init__(
438
+ TFloat32,
439
+ TFloat32,
440
+ Float32,
441
+ instruction_shape,
442
+ cta_group,
443
+ a_src,
444
+ a_major_mode,
445
+ b_major_mode,
446
+ )
447
+ self._verify()
448
+
449
+ def _verify(self) -> None:
450
+ # Verify the instruction shape
451
+ instruction_k = 8
452
+ if rank(self.shape_mnk) == 2:
453
+ object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
454
+ if self.shape_mnk[2] != instruction_k:
455
+ raise OpError(
456
+ self,
457
+ f"expects the instruction extent in the K-mode to be {instruction_k}, "
458
+ f"but got {self.shape_mnk[2]}",
459
+ )
460
+
461
+ def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaTF32Trait":
462
+ shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
463
+ ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get(
464
+ shape_mnk.type.attribute,
465
+ self.cta_group.value,
466
+ self.a_major_mode._to_ir(),
467
+ self.b_major_mode._to_ir(),
468
+ self.a_dtype.mlir_type,
469
+ self.b_dtype.mlir_type,
470
+ self.acc_dtype.mlir_type,
471
+ self.a_src._to_ir(),
472
+ 0,
473
+ )
474
+ return MmaTF32Trait(
475
+ _cute_nvgpu_ir.make_sm100_mma(
476
+ ty,
477
+ Boolean(False).ir_value(loc=loc, ip=ip),
478
+ Boolean(False).ir_value(loc=loc, ip=ip),
479
+ Boolean(False).ir_value(loc=loc, ip=ip),
480
+ loc=loc,
481
+ ip=ip,
482
+ )
483
+ )
484
+
485
+
486
+ class MmaTF32Trait(MmaTrait):
487
+ pass
488
+
489
+
490
+ #
491
+ # F16/BF16 MMA
492
+ #
493
+
494
+
495
+ @dataclass(frozen=True)
496
+ class MmaF16BF16Op(MmaOp):
497
+ """
498
+ F16/BF16 tcgen05 MMA Operation.
499
+
500
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma>`__.
501
+ This Operation corresponds to the ``.kind::f16`` qualifier.
502
+ """
503
+
504
+ descriptive_name = "tcgen05 F16/BF16 MMA Operation"
505
+
506
+ def __init__(
507
+ self,
508
+ ab_dtype: Type[Numeric],
509
+ acc_dtype: Type[Numeric],
510
+ instruction_shape: Shape,
511
+ cta_group: CtaGroup,
512
+ a_src: OperandSource,
513
+ a_major_mode: OperandMajorMode,
514
+ b_major_mode: OperandMajorMode,
515
+ ) -> None:
516
+ super().__init__(
517
+ ab_dtype,
518
+ ab_dtype,
519
+ acc_dtype,
520
+ instruction_shape,
521
+ cta_group,
522
+ a_src,
523
+ a_major_mode,
524
+ b_major_mode,
525
+ )
526
+ self._verify()
527
+
528
+ def _verify(self) -> None:
529
+ # Input data type verification
530
+ if self.a_dtype not in [Float16, BFloat16]:
531
+ raise OpError(
532
+ self,
533
+ "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16",
534
+ )
535
+ assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same"
536
+ # Accumulator data type verification
537
+ if self.acc_dtype not in [Float16, Float32]:
538
+ raise OpError(
539
+ self,
540
+ "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32",
541
+ )
542
+ # Instruction shape verification
543
+ instruction_k = 16
544
+ if rank(self.shape_mnk) == 2:
545
+ object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
546
+ if self.shape_mnk[2] != instruction_k:
547
+ raise OpError(
548
+ self,
549
+ f"expects the instruction extent in the K-mode to be {instruction_k}, "
550
+ f"but got {self.shape_mnk[2]}",
551
+ )
552
+
553
+ def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait":
554
+ shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
555
+ ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get(
556
+ shape_mnk.type.attribute,
557
+ self.cta_group.value,
558
+ self.a_major_mode._to_ir(),
559
+ self.b_major_mode._to_ir(),
560
+ self.a_dtype.mlir_type,
561
+ self.b_dtype.mlir_type,
562
+ self.acc_dtype.mlir_type,
563
+ self.a_src._to_ir(),
564
+ 0,
565
+ )
566
+ return MmaF16BF16Trait(
567
+ _cute_nvgpu_ir.make_sm100_mma(
568
+ ty,
569
+ Boolean(False).ir_value(loc=loc, ip=ip),
570
+ Boolean(False).ir_value(loc=loc, ip=ip),
571
+ Boolean(False).ir_value(loc=loc, ip=ip),
572
+ loc=loc,
573
+ ip=ip,
574
+ )
575
+ )
576
+
577
+
578
+ class MmaF16BF16Trait(MmaTrait):
579
+ pass
580
+
581
+
582
+ #
583
+ # I8 MMA
584
+ #
585
+
586
+
587
+ @dataclass(frozen=True)
588
+ class MmaI8Op(MmaOp):
589
+ """
590
+ I8 tcgen05 MMA Operation.
591
+
592
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma>`__.
593
+ This Operation corresponds to the ``.kind::i8`` qualifier.
594
+ """
595
+
596
+ descriptive_name = "tcgen05 I8 MMA Operation"
597
+
598
+ def __init__(
599
+ self,
600
+ ab_dtype: Type[Numeric],
601
+ instruction_shape: Shape,
602
+ cta_group: CtaGroup,
603
+ a_src: OperandSource,
604
+ a_major_mode: OperandMajorMode,
605
+ b_major_mode: OperandMajorMode,
606
+ ) -> None:
607
+ super().__init__(
608
+ ab_dtype,
609
+ ab_dtype,
610
+ Int32,
611
+ instruction_shape,
612
+ cta_group,
613
+ a_src,
614
+ a_major_mode,
615
+ b_major_mode,
616
+ )
617
+ self._verify()
618
+
619
+ def _verify(self) -> None:
620
+ # Input data type verification
621
+ if self.a_dtype not in [Int8, Uint8]:
622
+ raise OpError(
623
+ self,
624
+ "expects the 'ab_dtype' Op parameter to be one of Int8 or Uint8",
625
+ )
626
+ assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same"
627
+ # Instruction shape verification
628
+ instruction_k = 32
629
+ if rank(self.shape_mnk) == 2:
630
+ object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
631
+ if self.shape_mnk[2] != instruction_k:
632
+ raise OpError(
633
+ self,
634
+ f"expects the instruction extent in the K-mode to be {instruction_k}, "
635
+ f"but got {self.shape_mnk[2]}",
636
+ )
637
+
638
+ def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaI8Trait":
639
+ shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
640
+ ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get(
641
+ shape_mnk.type.attribute,
642
+ self.cta_group.value,
643
+ self.a_major_mode._to_ir(),
644
+ self.b_major_mode._to_ir(),
645
+ (T.si8() if self.a_dtype.signed else T.ui8()),
646
+ (T.si8() if self.b_dtype.signed else T.ui8()),
647
+ T.si32(),
648
+ self.a_src._to_ir(),
649
+ 0,
650
+ )
651
+ return MmaI8Trait(
652
+ _cute_nvgpu_ir.make_sm100_mma(
653
+ ty,
654
+ Boolean(False).ir_value(loc=loc, ip=ip),
655
+ Boolean(False).ir_value(loc=loc, ip=ip),
656
+ Boolean(False).ir_value(loc=loc, ip=ip),
657
+ loc=loc,
658
+ ip=ip,
659
+ )
660
+ )
661
+
662
+
663
+ class MmaI8Trait(MmaTrait):
664
+ pass
665
+
666
+
667
+ #
668
+ # F8F6F4 MMA
669
+ #
670
+
671
+
672
+ @dataclass(frozen=True)
673
+ class MmaFP8Op(MmaOp):
674
+ """
675
+ F8 tcgen05 MMA Operation.
676
+
677
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma>`__.
678
+ """
679
+
680
+ descriptive_name = "tcgen05 F8 MMA Operation"
681
+
682
+ def __init__(
683
+ self,
684
+ ab_dtype: Type[Numeric],
685
+ acc_dtype: Type[Numeric],
686
+ instruction_shape: Shape,
687
+ cta_group: CtaGroup,
688
+ a_src: OperandSource,
689
+ a_major_mode: OperandMajorMode,
690
+ b_major_mode: OperandMajorMode,
691
+ ) -> None:
692
+
693
+ super().__init__(
694
+ ab_dtype,
695
+ ab_dtype,
696
+ acc_dtype,
697
+ instruction_shape,
698
+ cta_group,
699
+ a_src,
700
+ a_major_mode,
701
+ b_major_mode,
702
+ )
703
+ self._verify()
704
+
705
+ def _verify(self) -> None:
706
+ # Input data type verification
707
+ if self.a_dtype not in [Float8E5M2, Float8E4M3FN]:
708
+ raise OpError(
709
+ self,
710
+ "expects the 'ab_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN",
711
+ )
712
+ assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same"
713
+ # Accumulator data type verification
714
+ if self.acc_dtype not in [Float16, Float32]:
715
+ raise OpError(
716
+ self,
717
+ "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32",
718
+ )
719
+ # Instruction shape verification
720
+ instruction_k = 32
721
+ if rank(self.shape_mnk) == 2:
722
+ object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
723
+ if self.shape_mnk[2] != instruction_k:
724
+ raise OpError(
725
+ self,
726
+ f"expects the instruction extent in the K-mode to be {instruction_k}, "
727
+ f"but got {self.shape_mnk[2]}",
728
+ )
729
+
730
+ def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaFP8Trait":
731
+ shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
732
+ ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get(
733
+ shape_mnk.type.attribute,
734
+ self.cta_group.value,
735
+ self.a_major_mode._to_ir(),
736
+ self.b_major_mode._to_ir(),
737
+ self.a_dtype.mlir_type,
738
+ self.b_dtype.mlir_type,
739
+ self.acc_dtype.mlir_type,
740
+ self.a_src._to_ir(),
741
+ 0,
742
+ )
743
+ return MmaFP8Trait(
744
+ _cute_nvgpu_ir.make_sm100_mma(
745
+ ty,
746
+ Boolean(False).ir_value(loc=loc, ip=ip),
747
+ Boolean(False).ir_value(loc=loc, ip=ip),
748
+ Boolean(False).ir_value(loc=loc, ip=ip),
749
+ loc=loc,
750
+ ip=ip,
751
+ )
752
+ )
753
+
754
+
755
+ class MmaFP8Trait(MmaTrait):
756
+ pass
757
+
758
+
759
+ #
760
+ # MXF8F6F4 MMA
761
+ #
762
+
763
+
764
+ @dataclass(frozen=True)
765
+ class MmaMXF8Op(BlockScaledMmaOp):
766
+ """
767
+ MXF8 tcgen05 BlockScaled MMA Operation.
768
+
769
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma>`__.
770
+ This Operation corresponds to the ``.kind::mxf8f6f4`` qualifier.
771
+ """
772
+
773
+ descriptive_name = "tcgen05 MXF8 BlockScaled MMA Operation"
774
+
775
+ def __init__(
776
+ self,
777
+ ab_dtype: Type[Numeric],
778
+ instruction_shape: Shape,
779
+ cta_group: CtaGroup,
780
+ a_src: OperandSource,
781
+ a_major_mode: OperandMajorMode,
782
+ b_major_mode: OperandMajorMode,
783
+ ) -> None:
784
+ super().__init__(
785
+ ab_dtype,
786
+ ab_dtype,
787
+ Float32,
788
+ Float8E8M0FNU,
789
+ 32,
790
+ instruction_shape,
791
+ cta_group,
792
+ a_src,
793
+ a_major_mode,
794
+ b_major_mode,
795
+ )
796
+ self._verify()
797
+
798
+ def _verify(self) -> None:
799
+ # Input data type verification
800
+ if self.a_dtype not in [Float8E5M2, Float8E4M3FN]:
801
+ raise OpError(
802
+ self,
803
+ "expects the 'ab_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN",
804
+ )
805
+ assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same"
806
+ # Instruction shape verification
807
+ instruction_k = 32
808
+ if rank(self.shape_mnk) == 2:
809
+ object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
810
+ if self.shape_mnk[2] != instruction_k:
811
+ raise OpError(
812
+ self,
813
+ f"expects the instruction extent in the K-mode to be {instruction_k}, "
814
+ f"but got {self.shape_mnk[2]}",
815
+ )
816
+
817
+ def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaMXF8Trait":
818
+ shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
819
+ ty = _cute_nvgpu_ir.MmaAtomSM100UMMABlockScaledType.get(
820
+ shape_mnk.type.attribute,
821
+ self.cta_group.value,
822
+ self.a_major_mode._to_ir(),
823
+ self.b_major_mode._to_ir(),
824
+ self.a_dtype.mlir_type,
825
+ self.b_dtype.mlir_type,
826
+ self.acc_dtype.mlir_type,
827
+ self.sf_dtype.mlir_type,
828
+ self.a_src._to_ir(),
829
+ self.sf_vec_size,
830
+ )
831
+ return MmaMXF8Trait(
832
+ _cute_nvgpu_ir.make_sm100_mma_bs(
833
+ ty,
834
+ Boolean(False).ir_value(loc=loc, ip=ip),
835
+ Boolean(False).ir_value(loc=loc, ip=ip),
836
+ Boolean(False).ir_value(loc=loc, ip=ip),
837
+ core.make_ptr(self.sf_dtype, 0, _cute_ir.AddressSpace.tmem).value,
838
+ core.make_ptr(self.sf_dtype, 0, _cute_ir.AddressSpace.tmem).value,
839
+ loc=loc,
840
+ ip=ip,
841
+ )
842
+ )
843
+
844
+
845
+ class MmaMXF8Trait(BlockScaledMmaTraits):
846
+ pass
847
+
848
+
849
+ #
850
+ # MXF4 MMA
851
+ #
852
+
853
+
854
+ @dataclass(frozen=True)
855
+ class MmaMXF4Op(BlockScaledMmaOp):
856
+ """
857
+ MXF4 tcgen05 BlockScaled MMA Operation.
858
+
859
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma>`__.
860
+ This Operation corresponds to the ``.kind::mxf4`` qualifier.
861
+ """
862
+
863
+ descriptive_name = "tcgen05 MXF4 BlockScaled MMA Operation"
864
+
865
+ def __init__(
866
+ self,
867
+ instruction_shape: Shape,
868
+ cta_group: CtaGroup,
869
+ a_src: OperandSource,
870
+ ) -> None:
871
+ super().__init__(
872
+ Float4E2M1FN,
873
+ Float4E2M1FN,
874
+ Float32,
875
+ Float8E8M0FNU,
876
+ 32,
877
+ instruction_shape,
878
+ cta_group,
879
+ a_src,
880
+ OperandMajorMode.K,
881
+ OperandMajorMode.K,
882
+ )
883
+ self._verify()
884
+
885
+ def _verify(self) -> None:
886
+ # Instruction shape verification
887
+ instruction_k = 64
888
+ if rank(self.shape_mnk) == 2:
889
+ object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
890
+ if self.shape_mnk[2] != instruction_k:
891
+ raise OpError(
892
+ self,
893
+ f"expects the instruction extent in the K-mode to be {instruction_k}, "
894
+ f"but got {self.shape_mnk[2]}",
895
+ )
896
+
897
+ def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaMXF8Trait":
898
+ shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
899
+ ty = _cute_nvgpu_ir.MmaAtomSM100UMMABlockScaledType.get(
900
+ shape_mnk.type.attribute,
901
+ self.cta_group.value,
902
+ self.a_major_mode._to_ir(),
903
+ self.b_major_mode._to_ir(),
904
+ self.a_dtype.mlir_type,
905
+ self.b_dtype.mlir_type,
906
+ self.acc_dtype.mlir_type,
907
+ self.sf_dtype.mlir_type,
908
+ self.a_src._to_ir(),
909
+ self.sf_vec_size,
910
+ )
911
+ return MmaMXF4Trait(
912
+ _cute_nvgpu_ir.make_sm100_mma_bs(
913
+ ty,
914
+ Boolean(False).ir_value(loc=loc, ip=ip),
915
+ Boolean(False).ir_value(loc=loc, ip=ip),
916
+ Boolean(False).ir_value(loc=loc, ip=ip),
917
+ core.make_ptr(self.sf_dtype, 0, _cute_ir.AddressSpace.tmem).value,
918
+ core.make_ptr(self.sf_dtype, 0, _cute_ir.AddressSpace.tmem).value,
919
+ loc=loc,
920
+ ip=ip,
921
+ )
922
+ )
923
+
924
+
925
+ class MmaMXF4Trait(BlockScaledMmaTraits):
926
+ pass
927
+
928
+
929
+ #
930
+ # MXF4NVF4 MMA
931
+ #
932
+
933
+
934
+ @dataclass(frozen=True)
935
+ class MmaMXF4NVF4Op(BlockScaledMmaOp):
936
+ """
937
+ MXF4NVF4 tcgen05 BlockScaled MMA Operation.
938
+
939
+ See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-instructions-mma>`__.
940
+ This Operation corresponds to the ``.kind::mxf4nvf4`` qualifier.
941
+ """
942
+
943
+ descriptive_name = "tcgen05 MXF4NVF4 BlockScaled MMA Operation"
944
+
945
+ def __init__(
946
+ self,
947
+ sf_dtype: Type[Numeric],
948
+ instruction_shape: Shape,
949
+ cta_group: CtaGroup,
950
+ a_src: OperandSource,
951
+ ) -> None:
952
+ super().__init__(
953
+ Float4E2M1FN,
954
+ Float4E2M1FN,
955
+ Float32,
956
+ sf_dtype,
957
+ 16,
958
+ instruction_shape,
959
+ cta_group,
960
+ a_src,
961
+ OperandMajorMode.K,
962
+ OperandMajorMode.K,
963
+ )
964
+ self._verify()
965
+
966
+ def _verify(self) -> None:
967
+ # Scale Factor data type verification
968
+ if self.sf_dtype not in [Float8E8M0FNU, Float8E4M3FN]:
969
+ raise OpError(
970
+ self,
971
+ "expects the 'sf_dtype' Op parameter to be one of Float8E8M0FNU",
972
+ )
973
+ # Instruction shape verification
974
+ instruction_k = 64
975
+ if rank(self.shape_mnk) == 2:
976
+ object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
977
+ if self.shape_mnk[2] != instruction_k:
978
+ raise OpError(
979
+ self,
980
+ f"expects the instruction extent in the K-mode to be {instruction_k}, "
981
+ f"but got {self.shape_mnk[2]}",
982
+ )
983
+
984
+ def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaMXF8Trait":
985
+ shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
986
+ ty = _cute_nvgpu_ir.MmaAtomSM100UMMABlockScaledType.get(
987
+ shape_mnk.type.attribute,
988
+ self.cta_group.value,
989
+ self.a_major_mode._to_ir(),
990
+ self.b_major_mode._to_ir(),
991
+ self.a_dtype.mlir_type,
992
+ self.b_dtype.mlir_type,
993
+ self.acc_dtype.mlir_type,
994
+ self.sf_dtype.mlir_type,
995
+ self.a_src._to_ir(),
996
+ self.sf_vec_size,
997
+ )
998
+ return MmaMXF4NVF4Trait(
999
+ _cute_nvgpu_ir.make_sm100_mma_bs(
1000
+ ty,
1001
+ Boolean(False).ir_value(loc=loc, ip=ip),
1002
+ Boolean(False).ir_value(loc=loc, ip=ip),
1003
+ Boolean(False).ir_value(loc=loc, ip=ip),
1004
+ core.make_ptr(self.sf_dtype, 0, _cute_ir.AddressSpace.tmem).value,
1005
+ core.make_ptr(self.sf_dtype, 0, _cute_ir.AddressSpace.tmem).value,
1006
+ loc=loc,
1007
+ ip=ip,
1008
+ )
1009
+ )
1010
+
1011
+
1012
+ class MmaMXF4NVF4Trait(BlockScaledMmaTraits):
1013
+ pass
1014
+
1015
+ ####################################################################################################
1016
+ #
1017
+ # SMEM layout atoms
1018
+ #
1019
+ ####################################################################################################
1020
+
1021
+
1022
+ class SmemLayoutAtomKind(enum.Enum):
1023
+ """
1024
+ Enum class for the kinds of SMEM layout atoms for SM100.
1025
+
1026
+ Given a swizzle kind, an SMEM layout atom is the compact layout of smallest size that can be
1027
+ used to construct an SMEM layout using blocked product for operand A or B such that the
1028
+ resulting layout is legal for both TMA and UMMA.
1029
+
1030
+ Note that there are other ways of creating legal layouts for operand A and B.
1031
+ """
1032
+
1033
+ MN_INTER = enum.auto()
1034
+ MN_SW32 = enum.auto()
1035
+ MN_SW64 = enum.auto()
1036
+ MN_SW128 = enum.auto()
1037
+ MN_SW128_32B = enum.auto()
1038
+ K_INTER = enum.auto()
1039
+ K_SW32 = enum.auto()
1040
+ K_SW64 = enum.auto()
1041
+ K_SW128 = enum.auto()
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
# Re-export the warp-level copy and MMA Ops as this package's public API.
from .copy import *
from .mma import *


# __all__ is required here for documentation generation
__all__ = [
    # mma.py
    "MmaF16BF16Op",
    # copy.py
    "LdMatrix8x8x16bOp",
    "LdMatrix16x16x8bOp",
    "StMatrix8x8x16bOp",
    "StMatrix16x8x8bOp",
]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from dataclasses import dataclass
13
+ from typing import Type
14
+
15
+ import cutlass._mlir.dialects.cute as _cute_ir
16
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
17
+ from cutlass._mlir import ir
18
+
19
+ from ..common import OpError
20
+ from ...core import CopyOp, Trait, _pack_shape
21
+ from ...typing import Numeric
22
+
23
+
24
@dataclass(frozen=True)
class BaseOp(CopyOp):
    """
    Shared base for the warp-wide ``ldmatrix``/``stmatrix`` copy Operations.

    :param transpose:    whether the matrix-transposing instruction variant is used
    :param num_matrices: number of matrices moved per instruction
    """

    transpose: bool = False
    num_matrices: int = 1

    def __post_init__(self) -> None:
        # Only 'transpose' is validated here; subclasses constrain 'num_matrices'.
        if not isinstance(self.transpose, bool):
            raise OpError(
                self,
                "expects the 'transpose' Op parameter to be a bool instance",
            )

    def __str__(self) -> str:
        # Strip the trailing "Op" from the class name for display purposes.
        description = f"{self.__class__.__name__[:-2]} Copy Operation"
        description += f"\n number of matrices = {self.num_matrices}"
        if self.transpose:
            description += f"\n transposed"
        return description
44
+
45
+
46
@dataclass(frozen=True)
class LdMatrix8x8x16bOp(BaseOp):
    """
    8x8 ``ldmatrix`` Operation.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-load-instruction-ldmatrix>`__.
    This operation corresponds to the ``.m8n8`` qualifier.
    """

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.num_matrices not in (1, 2, 4):
            raise OpError(
                self,
                "expects the 'num_matrices' Op parameter to be one of [1,2,4]",
            )

    def _make_trait(
        self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
    ) -> "LdMatrix8x8x16bTrait":
        # The instruction operates on 8x8 tiles with a 16b element pattern.
        tile = _pack_shape((8, 8), loc=loc, ip=ip)
        atom_ty = _cute_nvgpu_ir.CopyAtomLdsmType.get(
            copy_internal_type.mlir_type,
            tile.type.attribute,
            _cute_nvgpu_ir.LdsmSzPattern.u16,
            self.num_matrices,
            # The transposed variant is encoded as an optional unit attribute.
            ir.UnitAttr.get() if self.transpose else None,
        )
        return LdMatrix8x8x16bTrait(_cute_ir.atom(atom_ty, loc=loc, ip=ip))
75
+
76
+
77
class LdMatrix8x8x16bTrait(Trait):
    """Trait produced by :class:`LdMatrix8x8x16bOp`; behavior inherited from Trait."""

    pass
79
+
80
+
81
@dataclass(frozen=True)
class LdMatrix16x16x8bOp(BaseOp):
    """
    16x16 8-bit ``ldmatrix`` Operation.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-load-instruction-ldmatrix>`__.
    This operation corresponds to the ``.m16n16`` and the ``.b16`` qualifiers.
    """

    def __init__(self, num_matrices: int) -> None:
        # This variant only exists in transposed form, so 'transpose' is
        # forced to True rather than exposed to the caller.
        super().__init__(transpose=True, num_matrices=num_matrices)
        self._verify()

    def _verify(self):
        assert self.transpose, "transpose must be True"
        if self.num_matrices in (1, 2):
            return
        raise OpError(
            self,
            "expects the 'num_matrices' Op parameter to be one of [1,2]",
        )

    def _make_trait(
        self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
    ) -> "LdMatrix16x16x8bTrait":
        # 16x16 tiles with an 8b element pattern; always transposed.
        tile = _pack_shape((16, 16), loc=loc, ip=ip)
        atom_ty = _cute_nvgpu_ir.CopyAtomLdsmType.get(
            copy_internal_type.mlir_type,
            tile.type.attribute,
            _cute_nvgpu_ir.LdsmSzPattern.u8,
            self.num_matrices,
            ir.UnitAttr.get(),
        )
        return LdMatrix16x16x8bTrait(_cute_ir.atom(atom_ty, loc=loc, ip=ip))
114
+
115
+
116
class LdMatrix16x16x8bTrait(Trait):
    """Trait produced by :class:`LdMatrix16x16x8bOp`; behavior inherited from Trait."""

    pass
118
+
119
+
120
@dataclass(frozen=True)
class StMatrix8x8x16bOp(BaseOp):
    """
    8x8 ``stmatrix`` Operation.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-stmatrix>`__.
    This operation corresponds to the ``m8n8`` qualifier.
    """

    def __post_init__(self) -> None:
        super().__post_init__()
        if self.num_matrices not in (1, 2, 4):
            raise OpError(
                self,
                "expects the 'num_matrices' Op parameter to be one of [1,2,4]",
            )

    def _make_trait(
        self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
    ) -> "StMatrix8x8x16bTrait":
        # stmatrix stores 8x8 tiles.
        tile = _pack_shape((8, 8), loc=loc, ip=ip)
        atom_ty = _cute_nvgpu_ir.CopyAtomStsmType.get(
            copy_internal_type.mlir_type,
            tile.type.attribute,
            self.num_matrices,
            # The transposed variant is encoded as an optional unit attribute.
            ir.UnitAttr.get() if self.transpose else None,
        )
        return StMatrix8x8x16bTrait(_cute_ir.atom(atom_ty, loc=loc, ip=ip))
148
+
149
+
150
class StMatrix8x8x16bTrait(Trait):
    """Trait produced by :class:`StMatrix8x8x16bOp`; behavior inherited from Trait."""

    pass
152
+
153
+
154
@dataclass(frozen=True)
class StMatrix16x8x8bOp(BaseOp):
    """
    16x8 ``stmatrix`` Operation.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-stmatrix>`__.
    This operation corresponds to the ``m16n8`` qualifier.
    """

    def __init__(self, num_matrices: int) -> None:
        # The m16n8 variant only exists in transposed form, so 'transpose' is
        # forced to True rather than exposed to the caller.
        super().__init__(transpose=True, num_matrices=num_matrices)
        self._verify()

    def _verify(self):
        # BUGFIX: the transpose invariant must be checked unconditionally, not
        # only when 'num_matrices' is invalid (mirrors LdMatrix16x16x8bOp._verify,
        # where the assert precedes the num_matrices check).
        assert self.transpose, "transpose must be True"
        if self.num_matrices not in [1, 2, 4]:
            raise OpError(
                self,
                "expects the 'num_matrices' Op parameter to be one of [1,2,4]",
            )

    def _make_trait(
        self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs
    ) -> "StMatrix16x8x8bTrait":
        mode = _pack_shape((16, 8), loc=loc, ip=ip)
        ty = _cute_nvgpu_ir.CopyAtomStsmType.get(
            copy_internal_type.mlir_type,
            mode.type.attribute,
            self.num_matrices,
            # Always the transposed variant for m16n8.
            ir.UnitAttr.get(),
        )
        return StMatrix16x8x8bTrait(_cute_ir.atom(ty, loc=loc, ip=ip))
186
+
187
+
188
class StMatrix16x8x8bTrait(Trait):
    """Trait produced by :class:`StMatrix16x8x8bOp`; behavior inherited from Trait."""

    pass
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from dataclasses import dataclass
13
+ from typing import Type
14
+
15
+ import cutlass._mlir.dialects.cute as _cute_ir
16
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
17
+
18
+ from ..common import OpError
19
+ from ...core import MmaOp, Trait, _pack_shape, _Tensor
20
+ from ...typing import Shape, Float16, BFloat16, Float32, Numeric, AddressSpace
21
+
22
+
23
@dataclass(frozen=True)
class MmaF16BF16Op(MmaOp):
    """
    F16/BF16 warp-level MMA Operation.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma>`__.
    This Operation covers the instructions using the ``.f16`` or ``.bf16`` qualifiers for the input operands.
    """

    # A/B operand data type (Float16 or BFloat16; A and B share one dtype)
    ab_dtype: Type[Numeric]
    # Accumulator data type (Float16 or Float32)
    acc_dtype: Type[Numeric]
    # Instruction shape as an (M, N, K) tuple
    shape_mnk: Shape

    def __post_init__(self) -> None:
        # Validate dtypes and shape at construction time so invalid Ops fail early.
        if self.ab_dtype not in [Float16, BFloat16]:
            raise OpError(
                self,
                "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16",
            )
        if self.acc_dtype not in [Float16, Float32]:
            raise OpError(
                self,
                "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32",
            )
        # BF16 inputs only support F32 accumulation.
        if (self.ab_dtype == BFloat16) and (self.acc_dtype != Float32):
            raise OpError(
                self,
                "expects the 'acc_dtype' Op parameter to be Float32 when 'ab_dtype' is BFloat16",
            )
        if self.shape_mnk not in [(16, 8, 8), (16, 8, 16)]:
            raise OpError(
                self,
                "expects the 'shape_mnk' Op parameter to be one of (16,8,8) or (16,8,16)",
            )

    def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait":
        # Lower to an SM80 MMA atom with the validated shape and dtypes
        # (ab_dtype is used for both the A and B operand types).
        shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
        ty = _cute_nvgpu_ir.MmaAtomSM80Type.get(
            shape_mnk.type.attribute,
            self.ab_dtype.mlir_type,
            self.ab_dtype.mlir_type,
            self.acc_dtype.mlir_type,
        )
        return MmaF16BF16Trait(_cute_ir.atom(ty, loc=loc, ip=ip))

    def __str__(self) -> str:
        return (
            "warp-level F16/BF16 MMA Operation"
            + f"\n A/B data type = {self.ab_dtype}"
            + f"\n Accumulator data type = {self.acc_dtype}"
            + f"\n Instruction shape MNK = {self.shape_mnk}"
        )

    def _verify_fragment_A(self, input: _Tensor, *, loc=None, ip=None):
        # No additional constraints on the A fragment for this Op.
        pass

    def _verify_fragment_B(self, input: _Tensor, *, loc=None, ip=None):
        # No additional constraints on the B fragment for this Op.
        pass
81
+
82
class MmaF16BF16Trait(Trait):
    """Trait produced by the warp-level :class:`MmaF16BF16Op`; behavior inherited from Trait."""

    pass
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
# Re-export the warpgroup (SM90) MMA Ops and SMEM-layout/wgmma helpers.
from .mma import *
from .helpers import *

# __all__ is required here for documentation generation
__all__ = [
    # mma.py
    "OperandMajorMode",
    "OperandSource",
    "Field",
    "MmaF16BF16Op",
    "MmaF8Op",
    "SmemLayoutAtomKind",
    # helpers.py
    "make_smem_layout_atom",
    "fence",
    "commit_group",
    "wait_group",
]
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from typing import Type
13
+
14
+ from cutlass.cutlass_dsl import dsl_user_op
15
+
16
+ from cutlass._mlir.dialects import nvvm
17
+
18
+ from ...typing import Numeric, NumericMeta
19
+ from ... import core
20
+ from .mma import SmemLayoutAtomKind
21
+
22
+
23
@dsl_user_op
def make_smem_layout_atom(
    kind: SmemLayoutAtomKind, element_type: Type[Numeric], *, loc=None, ip=None
) -> core.ComposedLayout:
    """
    Makes a SMEM layout Atom.

    The result is a composed layout, expressed in units of elements, that is
    consistent with the requested layout Atom kind and element data type.

    :param kind: The kind of layout Atom
    :type kind: SmemLayoutAtomKind
    :param element_type: The element data type to construct the layout for
    :type element_type: Type[Numeric]
    :return: The SMEM layout atom
    :rtype: core.ComposedLayout
    """
    if not isinstance(element_type, NumericMeta):
        raise TypeError(f"element_type must be a Numeric, but got {element_type}")

    # Map each kind to its swizzle exponent B; the contiguous swizzled region
    # covers 128 * 2^B bits (INTER = no swizzle, SW32/64/128 = 32B/64B/128B).
    swizzle_exponent = {
        SmemLayoutAtomKind.MN_INTER: 0,
        SmemLayoutAtomKind.K_INTER: 0,
        SmemLayoutAtomKind.MN_SW32: 1,
        SmemLayoutAtomKind.K_SW32: 1,
        SmemLayoutAtomKind.MN_SW64: 2,
        SmemLayoutAtomKind.K_SW64: 2,
        SmemLayoutAtomKind.MN_SW128: 3,
        SmemLayoutAtomKind.K_SW128: 3,
    }
    if kind not in swizzle_exponent:
        raise ValueError("unrecognized SMEM layout atom kind")
    b = swizzle_exponent[kind]
    sw = core.make_swizzle(b, 4, 3)
    num_contiguous_elems = (128 << b) // element_type.width

    # MN_* kinds are M/N-major; K_* kinds are K-major. The atom is always
    # (contiguous extent) x 8 with the contiguous mode carrying stride 1.
    mn_major = kind in (
        SmemLayoutAtomKind.MN_INTER,
        SmemLayoutAtomKind.MN_SW32,
        SmemLayoutAtomKind.MN_SW64,
        SmemLayoutAtomKind.MN_SW128,
    )
    if mn_major:
        shape = (num_contiguous_elems, 8)
        stride = (1, num_contiguous_elems)
    else:
        shape = (8, num_contiguous_elems)
        stride = (num_contiguous_elems, 1)
    return core.make_composed_layout(
        sw,
        0,
        core.make_layout(shape, stride=stride),
        loc=loc,
        ip=ip,
    )
86
+
87
+
88
@dsl_user_op
def fence(*, loc=None, ip=None) -> None:
    """
    Issues a ``wgmma.fence`` for the calling warpgroup.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-multiply-and-accumulate-instruction-wgmma-fence>`__.
    """
    # BUGFIX: forward the caller's loc/ip instead of passing literal None
    # (previously `loc=None, ip=None`, which dropped source-location info).
    nvvm.wgmma_fence_aligned(loc=loc, ip=ip)
94
+
95
+
96
@dsl_user_op
def commit_group(*, loc=None, ip=None) -> None:
    """
    Commits the outstanding wgmma operations into a wgmma-group (``wgmma.commit_group``).

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group>`__.
    """
    nvvm.wgmma_commit_group_sync_aligned(loc=loc, ip=ip)
102
+
103
+
104
@dsl_user_op
def wait_group(group, *, loc=None, ip=None) -> None:
    """
    Waits on wgmma-groups (``wgmma.wait_group``).

    :param group: value forwarded to the NVVM op; per the PTX doc this is the
                  number of most recent wgmma-groups allowed to remain pending.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-multiply-and-accumulate-instruction-wgmma-wait-group>`__.
    """
    nvvm.wgmma_wait_group_sync_aligned(group, loc=loc, ip=ip)
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import enum
13
+ from dataclasses import dataclass
14
+ from typing import Type
15
+
16
+ from cutlass.cutlass_dsl import CuTeDSL
17
+
18
+ import cutlass._mlir.dialects.cute as _cute_ir
19
+ import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir
20
+ from cutlass._mlir import ir
21
+
22
+ from ..common import OpError
23
+ from ...core import MmaOp, Trait, _pack_shape, rank, depth, _Tensor
24
+ from ...typing import (
25
+ Shape,
26
+ Float16,
27
+ BFloat16,
28
+ Float32,
29
+ Boolean,
30
+ Float8E5M2,
31
+ Float8E4M3FN,
32
+ Numeric,
33
+ AddressSpace,
34
+ )
35
+
36
+
37
+ ####################################################################################################
38
+ #
39
+ # MMA Ops and Traits
40
+ #
41
+ ####################################################################################################
42
+
43
+
44
class OperandMajorMode(enum.Enum):
    """
    An enumeration for the majorness of the input operands of the MMA.

    Members wrap the corresponding CuTe IR MajorMode values; construction from
    the strings "MN"/"K" (case-insensitive) is supported via ``_missing_``.
    """

    MN = _cute_ir.MajorMode.mn
    K = _cute_ir.MajorMode.k

    def __str__(self) -> str:
        return f"{self.__class__.__name__}.{self.name}"

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}.{self.name}>"

    @classmethod
    def _missing_(cls, value):
        # Allow OperandMajorMode("mn") / OperandMajorMode("K") style lookups.
        if isinstance(value, str):
            normalized = value.upper()
            if normalized == "MN":
                return cls.MN
            if normalized == "K":
                return cls.K
        # Anything else: signal "not found" to the enum machinery.
        return None

    def _to_ir(self) -> _cute_ir.MajorMode:
        return self.value
69
+
70
+
71
class OperandSource(enum.Enum):
    """
    An enumeration for the source memory location of the A input operand of the MMA.
    """

    # A supplied as a register fragment (IR: rmem)
    RMEM = _cute_ir.MmaFragKind.rmem
    # A supplied via an SMEM descriptor (IR: smem_desc)
    SMEM = _cute_ir.MmaFragKind.smem_desc

    def __str__(self) -> str:
        return f"{self.__class__.__name__}.{self.name}"

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}.{self.name}>"

    def _to_ir(self) -> _cute_ir.MmaFragKind:
        # Expose the wrapped IR fragment kind.
        return self.value
87
+
88
+
89
class Field(enum.Enum):
    """
    An enumeration for the fields of the MMA Atom that can be modified at runtime.
    """

    # IR-level field name used when building the atom_mma_field attribute
    ACCUMULATE = "accum_c"

    def __str__(self) -> str:
        return f"{self.__class__.__name__}.{self.name}"

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}.{self.name}>"

    def _to_ir_field_name(self) -> str:
        # The enum value doubles as the IR attribute field name.
        return self.value
104
+
105
+
106
@dataclass(frozen=True)
class MmaOp(MmaOp):
    """
    Base class for warpgroup (SM90) MMA Operations.

    NOTE: this class deliberately shadows the imported ``MmaOp`` base from
    ``...core`` — subclasses in this module derive from this specialization.
    """

    # A operand data type
    a_dtype: Type[Numeric]
    # B operand data type
    b_dtype: Type[Numeric]
    # Accumulator data type
    acc_dtype: Type[Numeric]
    # Instruction shape, rank-2 (M, N) or rank-3 (M, N, K)
    shape_mnk: Shape
    # Memory location of the A operand (RMEM or SMEM)
    a_src: OperandSource
    # Majorness of A
    a_major_mode: OperandMajorMode
    # Majorness of B
    b_major_mode: OperandMajorMode

    # Only sm_90a supports these wgmma Ops
    admissible_archs = ["sm_90a"]

    def __post_init__(self) -> None:
        # Verify arch
        arch = CuTeDSL._get_dsl().envar.arch
        if arch not in self.admissible_archs:
            raise OpError(
                self,
                f"expects arch to be one of {self.admissible_archs}, but got {arch}",
                suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
            )
        # Verify that the user provided enum values
        if not isinstance(self.a_src, OperandSource):
            raise OpError(
                self,
                "expects the 'a_src' Op parameter to be a warpgroup.OperandSource instance",
            )
        if not isinstance(self.a_major_mode, OperandMajorMode):
            raise OpError(
                self,
                "expects the 'a_major_mode' Op parameter to be a warpgroup.OperandMajorMode instance",
            )
        if not isinstance(self.b_major_mode, OperandMajorMode):
            raise OpError(
                self,
                "expects the 'b_major_mode' Op parameter to be a warpgroup.OperandMajorMode instance",
            )
        # Verify instruction shape: flat (depth-1) tuple of rank 2 or 3
        if (rank(self.shape_mnk) not in [2, 3]) or (depth(self.shape_mnk) != 1):
            raise OpError(
                self,
                f"expected a flat rank 2 or 3 tuple for the 'shape_mnk' Op parameter, "
                f"but got {self.shape_mnk}",
            )
        # M is fixed at 64; N must be a multiple of 8 in [8, 256]
        m, n = self.shape_mnk[0], self.shape_mnk[1]
        if m != 64:
            raise OpError(self, f"expects the M-mode to be 64, but got {m}")
        if (n < 8) or (n > 256) or (n % 8 != 0):
            raise OpError(
                self,
                f"expects the N-mode to satisfy 8 <= N <= 256 and N % 8 == 0. but got {n}",
            )

    def __str__(self) -> str:
        return (
            self.__class__.descriptive_name  # type: ignore
            + f"\n A data type = {self.a_dtype}"
            + f"\n B data type = {self.b_dtype}"
            + f"\n Accumulator data type = {self.acc_dtype}"
            + f"\n A source location = {self.a_src}"
            + f"\n A major mode = {self.a_major_mode}"
            + f"\n B major mode = {self.b_major_mode}"
            + f"\n Instruction shape MNK = {self.shape_mnk}"
        )

    def _verify_fragment_A(self, input: _Tensor, *, loc=None, ip=None):
        # Composed (swizzled) SMEM layouts are rejected: the swizzle must be
        # moved onto the pointer via recast_ptr, as the error message explains.
        if input.memspace == AddressSpace.smem and isinstance(
            input.layout.type, _cute_ir.ComposedLayoutType
        ):
            raise OpError(
                self,
                f"Expected affine layout for {self._make_trait()}'s operand A, "
                f"but got composed layout instead: {input.layout}"
                f"\nPlease use recast_ptr(ptr, {input.layout.inner}, element_type) operation to move swizzle to the ptr",
            )
        return True

    def _verify_fragment_B(self, input: _Tensor, *, loc=None, ip=None):
        # Same constraint as for operand A.
        if input.memspace == AddressSpace.smem and isinstance(
            input.layout.type, _cute_ir.ComposedLayoutType
        ):
            raise OpError(
                self,
                f"Expected affine layout for {self._make_trait()}'s operand B, "
                f"but got composed layout instead: {input.layout}"
                f"\nPlease use recast_ptr(ptr, {input.layout.inner}, element_type) operation to move swizzle to the ptr",
            )
        return True
194
+
195
+
196
class MmaTrait(Trait):
    """Runtime-modifiable SM90 MMA atom trait; only ACCUMULATE may be set."""

    admissible_fields = [Field.ACCUMULATE]

    def set(self, field, value, *, loc=None, ip=None) -> None:
        # Reject anything other than the accumulate flag.
        if field not in self.admissible_fields:
            raise ValueError(
                f"invalid field, must be {Field.ACCUMULATE}, but got {field}"
            )
        attr = ir.Attribute.parse(
            f"#cute_nvgpu.atom_mma_field_sm90<{field._to_ir_field_name()}>"
        )
        self.value = _cute_nvgpu_ir.atom_set_value(
            self.value, attr, Boolean(value).ir_value(loc=loc, ip=ip), loc=loc, ip=ip
        )
210
+
211
@dataclass(frozen=True)
class MmaF16BF16Op(MmaOp):
    """
    F16/BF16 warpgroup MMA Operation.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async>`__.
    This Operation covers the instructions using the ``.f16`` or ``.bf16`` qualifiers for the input operands.
    """

    descriptive_name = "warpgroup F16/BF16 MMA Operation"

    def __init__(
        self,
        ab_dtype: Type[Numeric],
        acc_dtype: Type[Numeric],
        instruction_shape: Shape,
        a_src: OperandSource,
        a_major_mode: OperandMajorMode,
        b_major_mode: OperandMajorMode,
    ) -> None:
        # A and B share one dtype for this Op, so ab_dtype is passed for both.
        super().__init__(
            ab_dtype,
            ab_dtype,
            acc_dtype,
            instruction_shape,
            a_src,
            a_major_mode,
            b_major_mode,
        )
        self._verify()

    def _verify(self) -> None:
        # Input data type verification
        if self.a_dtype not in [Float16, BFloat16]:
            raise OpError(
                self,
                "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16",
            )
        assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same"
        # Accumulator data type verification
        if self.acc_dtype not in [Float16, Float32]:
            raise OpError(
                self,
                "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32",
            )
        if (self.a_dtype == BFloat16) and (self.acc_dtype != Float32):
            raise OpError(
                self,
                "expects the 'acc_dtype' Op parameter to be Float32 when 'ab_dtype' is BFloat16",
            )
        # Verify the instruction shape; K is fixed at 16 for F16/BF16 wgmma.
        instruction_k = 16
        if rank(self.shape_mnk) == 2:
            # Frozen dataclass: append the default K extent via object.__setattr__.
            object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
        if self.shape_mnk[2] != instruction_k:
            raise OpError(
                self,
                f"expects the instruction extent in the K-mode to be {instruction_k}, "
                f"but got {self.shape_mnk[2]}",
            )

    def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait":
        # Lower to an SM90 MMA atom; the trailing Boolean(False) initializes
        # the runtime-settable accumulate flag (see MmaTrait.set).
        shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
        ty = _cute_nvgpu_ir.MmaAtomSM90Type.get(
            shape_mnk.type.attribute,
            self.a_major_mode._to_ir(),
            self.b_major_mode._to_ir(),
            self.a_dtype.mlir_type,
            self.b_dtype.mlir_type,
            self.acc_dtype.mlir_type,
            self.a_src._to_ir(),
        )
        return MmaF16BF16Trait(
            _cute_nvgpu_ir.make_sm90_mma(
                ty,
                Boolean(False).ir_value(loc=loc, ip=ip),
                loc=loc,
                ip=ip,
            )
        )
291
+
292
+
293
class MmaF16BF16Trait(MmaTrait):
    """Trait produced by the warpgroup :class:`MmaF16BF16Op`; behavior inherited from MmaTrait."""

    pass
295
+
296
+
297
@dataclass(frozen=True)
class MmaF8Op(MmaOp):
    """
    F8 warpgroup MMA Operation.

    See the `PTX documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-multiply-and-accumulate-instruction-wgmma-mma-async>`__.
    This Operation covers the instructions using the ``.e4m3`` or ``.e5m2`` qualifiers for the input operands.
    """

    descriptive_name = "warpgroup F8 MMA Operation"

    def __init__(
        self,
        a_dtype: Type[Numeric],
        b_dtype: Type[Numeric],
        acc_dtype: Type[Numeric],
        instruction_shape: Shape,
        a_src: OperandSource,
        a_major_mode: OperandMajorMode,
        b_major_mode: OperandMajorMode,
    ) -> None:
        # Unlike the F16/BF16 Op, A and B may use different F8 dtypes.
        super().__init__(
            a_dtype,
            b_dtype,
            acc_dtype,
            instruction_shape,
            a_src,
            a_major_mode,
            b_major_mode,
        )
        self._verify()

    def _verify(self):
        # Input data type verification
        if self.a_dtype not in [Float8E5M2, Float8E4M3FN]:
            raise OpError(
                self,
                "expects the 'a_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN",
            )
        if self.b_dtype not in [Float8E5M2, Float8E4M3FN]:
            raise OpError(
                self,
                "expects the 'b_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN",
            )
        # Accumulator data type verification
        if self.acc_dtype not in [Float16, Float32]:
            raise OpError(
                self,
                "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32",
            )
        # Verify the instruction shape; K is fixed at 32 for F8 wgmma.
        instruction_k = 32
        if rank(self.shape_mnk) == 2:
            # Frozen dataclass: append the default K extent via object.__setattr__.
            object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k))
        if self.shape_mnk[2] != instruction_k:
            raise OpError(
                self,
                f"expects the instruction extent in the K-mode to be {instruction_k}, "
                f"but got {self.shape_mnk[2]}",
            )

    def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF8Trait":
        # Lower to an SM90 MMA atom; Boolean(False) initializes the
        # runtime-settable accumulate flag (see MmaTrait.set).
        shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip)
        ty = _cute_nvgpu_ir.MmaAtomSM90Type.get(
            shape_mnk.type.attribute,
            self.a_major_mode._to_ir(),
            self.b_major_mode._to_ir(),
            self.a_dtype.mlir_type,
            self.b_dtype.mlir_type,
            self.acc_dtype.mlir_type,
            self.a_src._to_ir(),
        )
        return MmaF8Trait(
            _cute_nvgpu_ir.make_sm90_mma(
                ty, Boolean(False).ir_value(loc=loc, ip=ip), loc=loc, ip=ip
            )
        )
374
+
375
+
376
class MmaF8Trait(MmaTrait):
    """Trait produced by :class:`MmaF8Op`; behavior inherited from MmaTrait."""

    pass
378
+
379
+
380
+ ####################################################################################################
381
+ #
382
+ # SMEM layout atoms
383
+ #
384
+ ####################################################################################################
385
+
386
+
387
class SmemLayoutAtomKind(enum.Enum):
    """
    Enum class for the kinds of SMEM layout atoms for SM90.

    Given a swizzle kind, an SMEM layout atom is the compact layout of smallest size that can
    be used to construct an SMEM layout using blocked product for operand A or B such that the
    resulting layout is legal for both TMA and UMMA.

    Note that there are other ways of creating legal layouts for operand A and B.
    """

    # M/N-major kinds: no swizzle, then 32B/64B/128B swizzles
    # (see make_smem_layout_atom in helpers.py for the mapping).
    MN_INTER = enum.auto()
    MN_SW32 = enum.auto()
    MN_SW64 = enum.auto()
    MN_SW128 = enum.auto()
    # K-major kinds: no swizzle, then 32B/64B/128B swizzles.
    K_INTER = enum.auto()
    K_SW32 = enum.auto()
    K_SW64 = enum.auto()
    K_SW128 = enum.auto()
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/runtime.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import ctypes
13
+ from functools import lru_cache
14
+ import itertools
15
+ import operator
16
+ from time import time
17
+ from typing import Union
18
+
19
+ # MLIR modules imports
20
+ from cutlass._mlir import ir
21
+ import cutlass._mlir.dialects.cute as _cute_ir
22
+
23
+ from cutlass.base_dsl.dsl import is_dynamic_expression
24
+ from cutlass.cutlass_dsl import JitArgAdapterRegistry
25
+
26
+ # Local modules imports
27
+ from .typing import (
28
+ AddressSpace,
29
+ Tensor,
30
+ Type,
31
+ Pointer,
32
+ Boolean,
33
+ Numeric,
34
+ Float4E2M1FN,
35
+ Int64,
36
+ Int32,
37
+ Int16,
38
+ Int8,
39
+ Uint64,
40
+ Uint32,
41
+ Uint16,
42
+ Uint8,
43
+ Float64,
44
+ Float32,
45
+ Float16,
46
+ BFloat16,
47
+ Float8E5M2,
48
+ )
49
+ from . import core
50
+ from .core import _Tensor as CoreTensor
51
+
52
+
53
class _Pointer(Pointer):
    """Runtime representation of a pointer that can inter-operate with various data structures,
    including numpy arrays and device memory.

    :param pointer: The pointer to the data
    :type pointer: int or pointer-like object
    :param dtype: Data type of the elements pointed to
    :type dtype: Type
    :param mem_space: Memory space where the pointer resides, defaults to generic
    :type mem_space: _cute_ir.AddressSpace, optional
    :param assumed_align: Assumed alignment of input pointer in bytes, defaults to None
    :type assumed_align: int, optional

    :ivar _pointer: The underlying pointer
    :ivar _dtype: Data type of the elements
    :ivar _addr_space: Memory space of the pointer
    :ivar _assumed_align: Alignment of the pointer in bytes
    :ivar _desc: C-type descriptor for the pointer
    :ivar _c_pointer: C-compatible pointer representation
    """

    def __init__(
        self,
        pointer,
        dtype,
        mem_space: _cute_ir.AddressSpace = _cute_ir.AddressSpace.generic,
        assumed_align=None,
    ):
        self._pointer = pointer
        self._dtype = dtype
        self._addr_space = mem_space

        # Default alignment is the natural (element-size) alignment in bytes.
        if assumed_align is None:
            self._assumed_align = dtype.width // 8
        else:
            self._assumed_align = assumed_align

        # C descriptor is built lazily in __c_pointers__().
        self._c_pointer = None
        # NOTE(review): `assert` is stripped under `python -O`; alignment would
        # then go unchecked — consider raising ValueError instead.
        assert (
            int(self._pointer) % self._assumed_align == 0
        ), f"pointer must be {self._assumed_align} bytes aligned"

    def size_in_bytes(self) -> int:
        """Return the size in bytes of the C descriptor (a ``void*``).

        Side effect: (re)builds ``self._desc`` from the current address.
        """
        self._desc = ctypes.c_void_p(int(self._pointer))
        return ctypes.sizeof(self._desc)

    def __get_mlir_types__(self):
        # Single MLIR type: the cute pointer type for this dtype/space/align.
        return [self.mlir_type]

    def __c_pointers__(self):
        # Lazily build the ctypes descriptor and cache the address of it;
        # the returned value is the address of the void* slot, not the data.
        if self._c_pointer is None:
            self._desc = ctypes.c_void_p(int(self._pointer))
            self._c_pointer = ctypes.addressof(self._desc)
        return [self._c_pointer]

    def __new_from_mlir_values__(self, values):
        assert len(values) == 1
        return values[0]

    def __extract_mlir_values__(self):
        # NOTE(review): returns None inside the list until __c_pointers__()
        # has been called at least once — confirm callers rely on that order.
        return [self._c_pointer]

    # Move mlir Type out of __init__ to decouple with mlir Context
    @property
    def mlir_type(self) -> ir.Type:
        return _cute_ir.PtrType.get(
            self._dtype.mlir_type, self._addr_space, self._assumed_align
        )

    @property
    def dtype(self) -> Type[Numeric]:
        """Element type pointed to."""
        return self._dtype

    @property
    def memspace(self):
        """Address space of the pointer."""
        return self._addr_space

    def align(self, min_align: int, *, loc=None, ip=None) -> Pointer:
        # Alignment refinement only makes sense on IR values, not at runtime.
        raise NotImplementedError("align is not supported in runtime")

    def verify(self, expected_py_type):
        """Return True iff ``expected_py_type`` denotes a Pointer."""
        if expected_py_type is Pointer:
            return True
        elif isinstance(expected_py_type, ir.Value) and expected_py_type.ty is Pointer:
            return True

        return False

    def __str__(self) -> str:
        return f"Ptr<0x{int(self._pointer):016x}@{self._addr_space}>"

    def __repr__(self):
        return self.__str__()
146
+
147
+
148
class _Tensor(Tensor):
    """Runtime tensor backed by DLPack data.

    Wraps an object obtained via the DLPack protocol and lazily builds a
    ``DLTensorWrapper`` the first time a wrapper-backed member is accessed,
    keeping the construction cost out of the critical JIT-call path.

    :param tensor: DLPack-compatible object (or already-exported DLPack data)
    :param assumed_align: Assumed alignment in bytes, defaults to None
    """

    def __init__(
        self,
        tensor,
        assumed_align=None,
    ):
        # If tensor is already a DLPack object, use it directly
        # NOTE(review): the check keeps objects with __dlpack_device__ but
        # without __dlpack__; a bare DLPack capsule has neither attribute and
        # would hit the else branch — confirm the intended input set.
        if hasattr(tensor, "__dlpack_device__") and not hasattr(tensor, "__dlpack__"):
            self._dlpack_data = tensor
        else:
            self._dlpack_data = tensor.__dlpack__()
        self._dltensor_wrapper = None  # built lazily, see lazily_load_dltensor
        self._assumed_align = assumed_align
        self._is_dynamic = False
        self._memref_desc = None  # built in __c_pointers__
        self._dtype = None  # cached/overridden element type

    @property
    def __class__(self) -> Type[Tensor]:
        # Cheat to let `type(_Tensor())` to return cute.Tensor
        return Tensor

    @staticmethod
    def lazily_load_dltensor(func):
        """Decorator to lazily load the DLTensorWrapper.

        This decorator loads the DLTensorWrapper when needed,
        avoiding overhead in the critical path of calling JIT functions.
        """

        def wrapper(self, *args, **kwargs):
            if self._dltensor_wrapper is None:
                self._dltensor_wrapper = _cute_ir.DLTensorWrapper(self._dlpack_data)
            return func(self, *args, **kwargs)

        return wrapper

    @lazily_load_dltensor
    def mark_layout_dynamic(self, leading_dim: int | None = None):
        """Marks the tensor layout as dynamic based on the leading dimension.

        :param leading_dim: The leading dimension of the layout, defaults to None
        :type leading_dim: int, optional

        When ``leading_dim`` is None, automatically deduces the leading dimension from the tensor layout.
        The layout can be deduced only when exactly one dimension has a stride of 1. Raises an error
        if the layout cannot be automatically deduced.

        When ``leading_dim`` is explicitly specified, marks the layout as dynamic while setting the
        stride at ``leading_dim`` to 1. Also validates that the specified ``leading_dim`` is consistent
        with the existing layout by checking that the corresponding stride of that dimension is 1.

        Limitation: only support flat layout for now. Will work on supporting nested layout in the future.

        :return: The tensor with dynamic layout
        :rtype: _Tensor
        """
        self._dltensor_wrapper.mark_layout_dynamic(leading_dim)
        return self

    @lazily_load_dltensor
    def mark_compact_shape_dynamic(
        self,
        mode: int,
        stride_order: tuple[int, ...] | None = None,
        divisibility: int = 1,
    ):
        """Marks the tensor shape as dynamic and propagates dynamic and divisibility information to the corresponding strides.

        :param mode: The mode of the compact shape, defaults to 0
        :type mode: int
        :param stride_order: Consistent with `torch.Tensor.dim_order`. Defaults to None.
            Indicates the order of the modes (dimensions) if the current layout were converted to row-major order.
            It starts from the outermost to the innermost dimension.
        :type stride_order: tuple[int, ...], optional
        :param divisibility: The divisibility constraint for the compact shape, defaults to 1
        :type divisibility: int, optional
        :return: The tensor with dynamic compact shape
        :rtype: _Tensor

        If ``stride_order`` is not provided, the stride ordering will be automatically deduced from the layout.
        Automatic deduction is only possible when exactly one dimension has a stride of 1 (compact layout).
        An error is raised if automatic deduction fails.

        If ``stride_order`` is explicitly specified, it does the consistency check with the layout.

        For example:
        - Layout: (4,2):(1,4) has stride_order: (1,0) indicates the innermost dimension is 0(`4:1`), the outermost dimension is 1(`2:4`)
        - Layout: (5,3,2,4):(3,1,15,30) has stride_order: (3,2,0,1) indicates the innermost dimension is 1(`3:1`), the outermost dimension is 3(`4:30`).

        Using `torch.Tensor.dim_order()` to get the stride order of the torch tensor.
        .. code-block:: python
            a = torch.empty(3, 4)
            t = cute.runtime.from_dlpack(a)
            t = t.mark_compact_shape_dynamic(mode=0, stride_order=a.dim_order())
        """
        self._dltensor_wrapper.mark_compact_shape_dynamic(
            mode, stride_order, divisibility
        )
        return self

    @property
    @lazily_load_dltensor
    def element_type(self) -> Type[Numeric]:
        """Element type, read from the DLTensor on first access and cached."""
        if self._dtype is None:
            self._dtype = self._dltensor_wrapper.dtype
        return self._dtype

    @element_type.setter
    def element_type(self, new_type):
        """Set the element type of the tensor.

        :warning: This API is added for narrow precision before we have a clean `recast_tensor` story.

        :note: It is only used for the case that frameworks don't natively support narrow precision but we get tensor
            from frameworks with storage type like uint8.

        **Example**:

        .. code-block:: python

            # Create a tensor from a numpy array
            import numpy as np
            from cutlass.cute import from_dlpack

            # Create a tensor with Float32 elements
            a = np.zeros(shape, dtype=np.uint8)
            tensor = from_dlpack(a)

            # Change the element type to Float4E2M1FN even storage type is uint8
            tensor.element_type = cutlass.Float4E2M1FN

            src = from_dlpack(... data tensor ...)
            # convert and initialize narrow precision tensor
            cute.testing.convert(src, tensor)
        """
        self._dtype = new_type

    @property
    @lazily_load_dltensor
    def memspace(self):
        """Address space reported by the DLTensor wrapper."""
        return self._dltensor_wrapper.address_space

    @property
    @lazily_load_dltensor
    def size_in_bytes(self) -> int:
        """Total size in bytes reported by the DLTensor wrapper."""
        return self._dltensor_wrapper.size_in_bytes()

    @property
    @lazily_load_dltensor
    def mlir_type(self) -> ir.Type:
        """MLIR type for this tensor, using the (possibly overridden) dtype."""
        return self._dltensor_wrapper.get_type(
            self.element_type.mlir_type, self._assumed_align
        )

    @lazily_load_dltensor
    def __str__(self) -> str:
        return f"Tensor<0x{self._dltensor_wrapper.str}>"

    def __repr__(self):
        return self.__str__()

    def __setitem__(self, crd, value):
        # Host-side runtime tensors are opaque handles, not indexable.
        raise TypeError(f"runtime._Tensor is not indexable")

    def __getitem__(self, crd):
        raise TypeError(f"runtime._Tensor is not indexable")

    @property
    @lazily_load_dltensor
    def iterator(self):
        """A ``_Pointer`` over the data pointer with this tensor's dtype,
        memory space, and assumed alignment."""
        return _Pointer(
            self._dltensor_wrapper.data_ptr,
            self.element_type,
            self.memspace,
            self._assumed_align,
        )

    @property
    def layout(self):
        raise NotImplementedError(
            f"layout property is not supported in runtime, support in future"
        )

    @property
    @lazily_load_dltensor
    def shape(self):
        """Shape tuple reported by the DLTensor wrapper."""
        return self._dltensor_wrapper.shape

    @property
    @lazily_load_dltensor
    def stride(self):
        """Strides from the wrapper; when absent (None), synthesize compact
        row-major strides from the shape via a reversed running product."""
        strides = self._dltensor_wrapper.stride
        if strides is None:
            strides = itertools.accumulate(
                reversed(self.shape), func=operator.mul, initial=1
            )
            strides = tuple(reversed(list(strides)[:-1]))

        return strides

    @property
    @lru_cache(maxsize=128, typed=True)
    def leading_dim(self):
        """Get the leading dimension of this Tensor.

        :return: The leading dimension index or indices
        :rtype: int or tuple or None

        The return value depends on the tensor's stride pattern:

        * If a single leading dimension is found, returns an integer index
        * If nested leading dimensions are found, returns a tuple of indices
        * If no leading dimension is found, returns None

        .. NOTE(review): lru_cache on a method keys on ``self`` and keeps up
           to 128 instances alive (flake8-bugbear B019); consider a
           per-instance cache instead.
        """
        return core.leading_dim(self.shape, self.stride)

    def fill(self, value: Numeric):
        raise TypeError(f"fill function is not supported in runtime")

    @property
    @lazily_load_dltensor
    def data_ptr(self):
        """Raw data pointer reported by the DLTensor wrapper."""
        return self._dltensor_wrapper.data_ptr

    @lazily_load_dltensor
    def __c_pointers__(self):
        # Build (and keep alive via self._memref_desc) the memref descriptor
        # capsule, then expose its raw pointer for the JIT calling convention.
        self._memref_desc = self._dltensor_wrapper.build_memref_desc(
            self._assumed_align
        )
        return [_cute_ir.pycapsule_get_pointer(self._memref_desc)]

    def __get_mlir_types__(self):
        return [self.mlir_type]

    def __new_from_mlir_values__(self, values):
        # NOTE(review): self._dtype may still be None here if element_type was
        # never read or set — confirm CoreTensor accepts that.
        assert len(values) == 1
        assert isinstance(values[0], CoreTensor)
        return CoreTensor(values[0].value, self._dtype)
387
+
388
+
389
def from_dlpack(tensor_dlpack, assumed_align=None) -> Tensor:
    """Wrap a DLPack-protocol object as a CuTe Tensor.

    :param tensor_dlpack: Tensor object that supports the DLPack protocol
    :type tensor_dlpack: object
    :param assumed_align: Assumed alignment of the tensor in bytes; when None,
        the element size in bytes is used as the alignment.
    :type assumed_align: int, optional
    :return: A CuTe Tensor object
    :rtype: Tensor

    Examples:
    .. code-block:: python

        import torch
        from cutlass.cute.runtime import from_dlpack
        x = torch.randn(100, 100)
        y = from_dlpack(x)
        y.shape
        # (100, 100)
        type(y)
        # <class 'cutlass.cute.Tensor'>
    """
    return _Tensor(tensor_dlpack, assumed_align=assumed_align)
419
+
420
+
421
def make_ptr(
    dtype: Type[Numeric],
    value: Union[int, ctypes._Pointer],
    mem_space: AddressSpace = AddressSpace.generic,
    assumed_align=None,
) -> Pointer:
    """Create a pointer from a memory address

    :param dtype: Data type of the pointer elements
    :type dtype: Type[Numeric]
    :param value: Memory address as integer or ctypes pointer
    :type value: Union[int, ctypes._Pointer]
    :param mem_space: Memory address space, defaults to AddressSpace.generic
    :type mem_space: AddressSpace, optional
    :param assumed_align: Assumed alignment in bytes, defaults to None
        (the element size in bytes is then used)
    :type assumed_align: int, optional
    :return: A pointer object
    :rtype: Pointer

    .. code-block:: python

        import numpy as np
        import ctypes

        from cutlass import Float32
        from cutlass.cute.runtime import make_ptr

        # Create a numpy array
        a = np.random.randn(16, 32).astype(np.float32)

        # Get pointer address as integer
        ptr_address = a.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

        # Create pointer from address
        y = make_ptr(Float32, ptr_address)

        # Check properties
        print(y.element_type)
        print(type(y))  # <class 'cutlass.cute.Pointer'>
    """
    # check if value is int or ctypes.POINTER
    if isinstance(value, int):
        address_value = value
    elif isinstance(value, ctypes._Pointer):
        # get address value
        address_value = ctypes.cast(value, ctypes.c_void_p).value
        assert address_value is not None, "Pointer address is None"
    else:
        raise TypeError(
            f"Expect int or ctypes.POINTER for value but got {type(value)=}"
        )

    return _Pointer(address_value, dtype, mem_space, assumed_align=assumed_align)
474
+
475
+
476
class TensorAdapter:
    """
    Convert a DLPack protocol supported tensor/array to a cute tensor.

    Registered as a JIT-argument adapter: wraps the framework tensor once and
    delegates the JIT argument protocol to the converted CuTe tensor.
    """

    def __init__(self, arg):
        # Convert eagerly and mark the layout dynamic so the compiled function
        # is not specialized to this particular tensor's strides.
        self._arg = from_dlpack(arg).mark_layout_dynamic()

    def __new_from_mlir_values__(self, values):
        # Delegate reconstruction to the wrapped tensor.
        return self._arg.__new_from_mlir_values__(values)

    def __c_pointers__(self):
        # Delegate the C calling-convention pointers to the wrapped tensor.
        return self._arg.__c_pointers__()

    def __get_mlir_types__(self):
        # Delegate MLIR type computation to the wrapped tensor.
        return self._arg.__get_mlir_types__()
492
+
493
+
494
# -------------------------------------------------------------------------
# Try to register_jit_arg_adapter for TensorAdapter
# -------------------------------------------------------------------------

# Register TensorAdapter for each optional framework tensor type that is
# importable. Frameworks that are not installed are skipped silently so that
# neither numpy nor torch becomes a hard dependency of this module.
for _module_name, _attr_name in (("numpy", "ndarray"), ("torch", "Tensor")):
    try:
        _module = __import__(_module_name)
    except ImportError:
        continue  # silent attempt, suppress error
    JitArgAdapterRegistry.register_jit_arg_adapter(getattr(_module, _attr_name))(
        TensorAdapter
    )
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/testing.py ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ import functools
13
+ import inspect
14
+ import logging
15
+ import os
16
+ from enum import Enum
17
+ from inspect import isclass
18
+ from itertools import product
19
+ from time import time
20
+ from typing import Any, Callable, Dict, List, Optional, Type, Union
21
+
22
+ import cuda.bindings.driver as cuda_driver
23
+ import cuda.bindings.runtime as cuda_runtime
24
+ import numpy as np
25
+
26
+ import cutlass._mlir.ir as ir
27
+ import cutlass.base_dsl.jit_executor
28
+ import cutlass.cute as cute
29
+ from cutlass._mlir.dialects import builtin, cf, nvvm, vector
30
+ from cutlass.cute import core, nvgpu
31
+ from cutlass.cutlass_dsl import Constexpr, CuTeDSL, T, t, dsl_user_op
32
+
33
+
34
@dsl_user_op
def assert_(cond, msg=None, *, loc=None, ip=None):
    """Emit an assertion into the generated IR via the ``cf`` dialect.

    :param cond: Condition to check; converted to a Boolean IR value
    :param msg: Optional message for the assertion (empty string when None)
    :param loc: Optional MLIR location
    :param ip: Optional MLIR insertion point
    """
    cf.assert_(t.Boolean(cond).ir_value(), msg if msg else "", loc=loc, ip=ip)
37
+
38
+
39
def _maybe_recast_tensor_from_f4(src: core.Tensor, tv_layout: core.Layout):
    """Reinterpret a 4-bit tensor as Int8, adjusting its TV layout to match.

    If ``src`` holds 4-bit elements, the TV layout is recast accordingly and
    the tensor is recast to Int8 storage; otherwise both are returned
    unchanged.

    :param src: Tensor to (possibly) recast
    :param tv_layout: Thread-value layout paired with ``src``
    :return: ``(src, tv_layout)``, recast when the element width is 4 bits
    """
    if src.element_type.width == 4:
        tv_layout = core.recast_layout(8, 4, tv_layout)
        src = core.recast_tensor(src, dtype=t.Int8)
    return src, tv_layout
44
+
45
+
46
def _maybe_recast_to_f4(input: core.TensorSSA, dtype: Type[core.Numeric]):
    """Conditionally recasts the tensor to 4-bit type if the destination type is 4-bit.

    :param input: The input tensor to recast.
    :param dtype: The target numeric type to potentially recast to.
    :raises TypeError: If dtype is not a subclass of Numeric.
    :return: A new tensor recast to 4-bit if dtype is 4-bit, otherwise returns self unchanged.
    """
    if not isclass(dtype) or not issubclass(dtype, core.Numeric):
        raise TypeError(f"dst_ty must be a type of Numeric, but got {dtype}")

    if dtype.width == 4:
        recast_shape = core.recast_layout(4, 8, core.make_layout(input.shape)).shape
        # Bitcast the vector to twice as many 4-bit lanes, then cast the i4
        # lanes to the destination element type via an unrealized conversion.
        i4_vec = vector.bitcast(
            T.vector(input.type.shape[0] * 2, T.i(4)), input.maybe_downcast()
        )
        res_vect = builtin.unrealized_conversion_cast(
            [T.vector(i4_vec.type.shape[0], dtype.mlir_type)], [i4_vec]
        )
        return core.TensorSSA(res_vect, recast_shape, dtype)
    return input
67
+
68
+
69
def _maybe_recast_from_f4(input: core.TensorSSA, src_dtype: Type[core.Numeric]):
    """Conditionally recasts the tensor from 4-bit type if the source type is 4-bit.

    :param input: The input tensor to recast.
    :param src_dtype: The source numeric type to potentially recast from.
    :raises TypeError: If src_dtype is not a subclass of Numeric.
    :return: A new tensor recast from 4-bit if src_dtype is 4-bit, otherwise returns self unchanged.
    """
    if not isclass(src_dtype) or not issubclass(src_dtype, core.Numeric):
        raise TypeError(f"src_ty must be a type of Numeric, but got {src_dtype}")

    if src_dtype.width == 4:
        recast_shape = core.recast_layout(8, 4, core.make_layout(input.shape)).shape
        # Reinterpret the lanes as i4, then pack pairs of i4 into i8 lanes;
        # the result is an Int8-typed TensorSSA (inverse of _maybe_recast_to_f4).
        i4_vec = builtin.unrealized_conversion_cast(
            [T.vector(input.type.shape[0], T.i(4))], [input.maybe_downcast()]
        )
        res_vect = vector.bitcast(T.vector(i4_vec.type.shape[0] // 2, T.i8()), i4_vec)
        return core.TensorSSA(res_vect, recast_shape, core.Int8)
    return input
88
+
89
+
90
@CuTeDSL.kernel
def _convert_kernel(
    gSrc: core.Tensor,
    gDst: core.Tensor,
    cSrc: core.Tensor,
    src_tv_layout: core.Layout,
    dst_tv_layout: core.Layout,
    src_shape: core.Shape,
    src_ty,
    dst_ty,
):
    """Elementwise dtype-conversion device kernel.

    Each CTA handles one tile (selected by block index) of the zipped-divided
    tensors; each thread copies its fragment from global memory to registers,
    converts to the destination type (with f4 recast fixups on either side),
    and copies the result back. ``cSrc`` is an identity (coordinate) tensor
    used to predicate out-of-bounds tiles against ``src_shape``.
    """
    tidx = nvvm.read_ptx_sreg_tid_x(T.i32())
    bidx = nvvm.read_ptx_sreg_ctaid_x(T.i32())

    # Select this CTA's tile in the second (rest) mode.
    cta_coord = (None, bidx)
    # logical idx -> address
    ctaSrc = gSrc[cta_coord]  # (...,TileV,...)
    ctaDst = gDst[cta_coord]  # (...,TileV,...)
    ctaCSrc = cSrc[cta_coord]  # (...,TileV,...)

    # compose with CTA TV layout
    # tid, vid -> address
    tidfrgSrc = core.composition(ctaSrc, src_tv_layout)  # (T,V)
    tidfrgDst = core.composition(ctaDst, dst_tv_layout)  # (T,V)
    tidfrgCSrc = core.composition(ctaCSrc, src_tv_layout)  # (T,V)

    # slice for threads
    thr_coord = (tidx, None)
    thrSrc = tidfrgSrc[thr_coord]  # (V)
    thrDst = tidfrgDst[thr_coord]  # (V)
    thrCSrc = tidfrgCSrc[thr_coord]  # (V)

    # predicate: only threads whose first coordinate is in bounds participate
    if core.elem_less(thrCSrc[0], src_shape):
        # allocate fragments for gmem->rmem
        frgSrc = core.make_fragment(
            core.get(src_tv_layout, mode=[1]), gSrc.element_type
        )  # (V)
        frgDst = core.make_fragment(
            core.get(dst_tv_layout, mode=[1]), gDst.element_type
        )  # (V)

        # Move data to reg address space
        copy_atom_load = core.make_copy_atom(nvgpu.CopyUniversalOp(), gSrc.element_type)
        core.copy(copy_atom_load, thrSrc, frgSrc)

        # Convert in registers; f4 types are carried in Int8 storage, so
        # recast before and after the element conversion when needed.
        vec_src = frgSrc.load()
        vec_src = _maybe_recast_to_f4(vec_src, src_ty)
        vec_dst = vec_src.to(dst_ty)
        vec_dst = _maybe_recast_from_f4(vec_dst, dst_ty)
        frgDst.store(vec_dst)

        # Copy the results back to c
        copy_atom_stg = core.make_copy_atom(nvgpu.CopyUniversalOp(), gDst.element_type)
        core.copy(copy_atom_stg, frgDst, thrDst)
149
+
150
+
151
@CuTeDSL.jit(preprocess=False)
def _convert(
    src: core.Tensor,
    dst: core.Tensor,
    leading_mode: Constexpr,
    elem_per_copy: Constexpr,
):
    """Host-side JIT driver for elementwise conversion.

    Tiles ``src``/``dst`` along ``leading_mode`` into chunks of
    ``elem_per_copy`` elements per thread across 128 threads, then launches
    ``_convert_kernel`` with one CTA per tile.

    :param src: Source tensor
    :param dst: Destination tensor (same logical shape as ``src``)
    :param leading_mode: Index of the stride-1 mode used for tiling
    :param elem_per_copy: Elements converted per thread per copy
    """

    # Step 1. figure proper tv_layout
    src_ty = src.element_type
    dst_ty = dst.element_type

    # 128 threads, each owning elem_per_copy contiguous elements.
    tv_layout = core.make_layout((128, elem_per_copy), stride=(elem_per_copy, 1))

    # Step 2. maybe recast from f4 tensor
    src, src_tv_layout = _maybe_recast_tensor_from_f4(src, tv_layout)
    dst, dst_tv_layout = _maybe_recast_tensor_from_f4(dst, tv_layout)
    src_shape = src.shape
    # predicate tensor (coordinates used for bounds checks in the kernel)
    idA = core.make_identity_tensor(src.shape)

    # Step 3. select a proper tiling pattern as (...,TileV, ...)
    src_cta_tiler = [
        1,
    ] * core.rank(src.layout)
    src_cta_tiler[leading_mode] = core.size(src_tv_layout)  # (...,TileV,...)
    dst_cta_tiler = [
        1,
    ] * core.rank(dst.layout)
    dst_cta_tiler[leading_mode] = core.size(dst_tv_layout)  # (...,TileV,...)

    # Step 4. partition input and output tensor by cta tiler.
    gS = core.zipped_divide(
        src, tuple(src_cta_tiler)
    )  # ((...,TileV,...),(...,RestV,...))
    cS = core.zipped_divide(
        idA, tuple(src_cta_tiler)
    )  # ((...,TileV,...),(...,RestV,...))
    gD = core.zipped_divide(
        dst, tuple(dst_cta_tiler)
    )  # ((...,TileV,...),(...,RestV,...))

    # One CTA per rest-mode tile; 128 threads per CTA (mode 0 of the TV layout).
    _convert_kernel(
        gS,
        gD,
        cS,
        src_tv_layout,
        dst_tv_layout,
        src_shape,
        src_ty,
        dst_ty,
    ).launch(
        grid=[core.size(gS, mode=[1]), 1, 1],
        block=[core.size(src_tv_layout, mode=[0]), 1, 1],
    )
207
+
208
+
209
def convert(src: core.Tensor, dst: core.Tensor):
    """Convert ``src`` into ``dst`` elementwise, possibly changing dtype.

    The logical shapes of ``src`` and ``dst`` are required to be the same.
    When either dtype is narrow precision (Float4E2M1FN/Float8E8M0FNU/
    Float8E4M3FN), the extent of the leading dimension must be a multiple of
    4 (fp8) / 8 (fp4) elements, because ``nvgpu.cvt_fptrunc``/``cvt_fpext``
    need 32-bit aligned input/output.

    :param src: Source tensor
    :param dst: Destination tensor, same rank and shape as ``src``
    :raises ValueError: If the leading mode (extent > 1 and stride == 1)
        is not unique
    """
    assert len(src.shape) == len(
        dst.shape
    ), "Shape of src and dst tensors should be the same rank."
    # find leading mode: the unique mode with extent > 1 and stride 1
    leading_modes = [
        idx
        for idx, (shape, stride) in enumerate(zip(src.shape, src.stride))
        if shape > 1 and stride == 1
    ]
    if len(leading_modes) != 1:
        raise ValueError(f"Leading mode should be unique, but got {leading_modes}")
    leading_mode = leading_modes[0]

    # Elements per copy, sized so each copy is at least 32 bits wide for
    # narrow-precision element types.
    elem_per_copy = 2
    if src.element_type.width == 4 or dst.element_type.width == 4:
        elem_per_copy = 8
    elif src.element_type.width == 8 or dst.element_type.width == 8:
        elem_per_copy = 4
    assert (
        src.shape[leading_mode] % elem_per_copy == 0
        and dst.shape[leading_mode] % elem_per_copy == 0
    ), (
        f"leading-dim extents must be divisible by {elem_per_copy}, "
        f"but got {src.shape[leading_mode]} and {dst.shape[leading_mode]}"
    )
    _convert(src, dst, leading_mode, elem_per_copy)
238
+
239
+
240
+ #########################################
241
+ # Testing utilities
242
+ #########################################
243
+
244
+
245
def sample_pytest(rand_cfg=None):
    """
    Decorator to randomly sample pytest parametrized tests.

    :param rand_cfg: ``(random_seed, sample_ratio)`` tuple, or None to
        disable sampling entirely
    Sampling is disabled when:
    - ``rand_cfg`` is None
    - A specific test is selected (via -k or direct test path)
    - Not running under pytest

    Bug fix: the original unconditionally unpacked ``rand_cfg`` and raised
    TypeError when called with the default ``rand_cfg=None``; the unpacking
    and seeding are now guarded.
    """
    import functools
    import os
    import random
    import sys

    import pytest

    if rand_cfg is not None:
        seed, sample_ratio = rand_cfg
        random.seed(seed)

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if rand_cfg is not None and "PYTEST_CURRENT_TEST" in os.environ:
                # Check if test was explicitly selected like ::test_name[param1-param2-...]
                if "-k" in sys.argv or any(".py::" in arg for arg in sys.argv):
                    # Test was explicitly selected, don't skip
                    return func(*args, **kwargs)

                if random.uniform(0.0, 1.0) > sample_ratio:
                    pytest.skip(f"Randomly skipped (sampling ratio: {sample_ratio})")
            return func(*args, **kwargs)

        return wrapper

    return decorator
279
+
280
+
281
+ #########################################
282
+ # Benchmarking utilities
283
+ #########################################
284
+
285
+
286
class JitArguments:
    """
    Bundle of positional and keyword arguments for a kernel invocation,
    used when passing call arguments around during benchmarking.
    """

    def __init__(self, *args, **kwargs):
        # Store both argument groups untouched; consumers unpack them when
        # the kernel is actually invoked.
        self.args, self.kwargs = args, kwargs
294
+
295
+
296
def _cuda_success(
    err: Union[tuple, cuda_runtime.cudaError_t, cuda_driver.CUresult], message: str
):
    """
    Helper function to check CUDA API errors.

    Accepts a raw ``cudaError_t`` / ``CUresult`` or the ``(err, ...)`` tuple
    returned by the cuda-python bindings (recurses on the first element).

    :param err: Error code or result tuple to check
    :param message: Context message prefixed to the raised error
    :raises RuntimeError: If ``err`` indicates failure
    :raises TypeError: If ``err`` is not a recognized error type
    """
    if isinstance(err, tuple):
        _cuda_success(err[0], message)
    elif isinstance(err, cuda_runtime.cudaError_t):
        # Fetch the error string only on failure (the original fetched it
        # unconditionally, unlike the CUresult branch below).
        if err != cuda_runtime.cudaError_t.cudaSuccess:
            error_message = cuda_runtime.cudaGetErrorString(err)[1].decode("utf-8")
            raise RuntimeError(f"{message} : {error_message}")
    elif isinstance(err, cuda_driver.CUresult):
        if err != cuda_driver.CUresult.CUDA_SUCCESS:
            error_message = cuda_driver.cuGetErrorString(err)[1].decode("utf-8")
            raise RuntimeError(f"{message} : {error_message}")
    else:
        raise TypeError(
            f"{err} is an unexpected type : it should be a cudaError_t or CUresult"
        )
316
+
317
+
318
def _does_kernel_use_stream(
    kernel: Callable, stream: cuda_driver.CUstream, *args, **kwargs
):
    """
    This function checks if the kernel uses the provided non-default stream.
    It does this by capturing the stream and then checking if any kernels were launched.
    :param kernel: The kernel to check
    :type kernel: Callable
    :param stream: The stream to check
    :type stream: cuda_driver.CUstream
    :return: True if the kernel uses the stream, False otherwise
    :rtype: bool
    """

    # Stream capture is only legal/meaningful on a non-default stream.
    # NOTE(review): `assert` is stripped under `python -O`.
    assert int(stream) != int(
        cuda_driver.CUstream_flags.CU_STREAM_DEFAULT
    ), "Stream must be a non-default stream"

    # Begin capturing work submitted to `stream` into a CUDA graph
    # (thread-local mode: only this thread's captures are affected).
    err = cuda_runtime.cudaStreamBeginCapture(
        stream, cuda_runtime.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal
    )
    _cuda_success(err, "Error on stream capture")

    # Run the kernel while capture is active; any launch on `stream`
    # becomes a node in the captured graph instead of executing.
    kernel(*args, **kwargs)

    err, graph = cuda_runtime.cudaStreamEndCapture(stream)
    _cuda_success(err, "Error on stream capture")

    # Get number of nodes in warmup graph to check it matches what is expected
    err, _, num_nodes = cuda_runtime.cudaGraphGetNodes(graph)
    _cuda_success(err, "Error on querying graph")
    # Any captured node means the kernel submitted work to `stream`.
    return num_nodes > 0
350
+
351
+
352
def benchmark(
    callable: Callable,
    *,
    warmup_iterations: int = 10,
    iterations: int = 100,
    stream: Optional[cuda_driver.CUstream] = None,
    kernel_arguments: Optional[JitArguments] = None,
    workspace_generator: Optional[Callable[[], JitArguments]] = None,
    workspace_count: int = 1,
    use_cuda_graphs: bool = False,
) -> float:
    """Benchmarks a callable function with the specified parameters.

    For example,

    .. code-block:: python

        from cutlass.cute.testing import benchmark

        @cute.jit
        def user_function(a: cute.Tensor, b: cute.Tensor, c: cute.Tensor, stream: cuda_driver.CUstream):
            # contents of the function
            pass

        time_us = benchmark(user_function,
                            kernel_arguments=JitArguments(a, b, c, stream),
                            warmup_iterations=10,
                            iterations=100,
                            stream=stream)

    To prevent skewing results by repeatedly accessing the L2 cache, use the
    workspace_count and workspace_generator parameters to cycle through a number
    of different workspaces.

    .. code-block:: python

        from cutlass.cute.testing import benchmark

        @cute.jit
        def user_function(a: cute.Tensor, b: cute.Tensor, c: cute.Tensor):
            # contents of the function
            pass

        def workspace_generator():
            # create a, b, and c
            return JitArguments(a, b, c)

        time_us = benchmark(user_function,
                            workspace_generator=workspace_generator,
                            workspace_count=10,
                            warmup_iterations=10000,
                            iterations=1000)

    To benchmark you may always configure the function being profiled (callable),
    the warmup iterations, and the number of profiling iterations.

    Whenever the kernel being benchmarked runs in a non-default stream, the stream
    must be provided through the stream parameter.

    To use CUDA graphs, the callable must be a compiled @cute.jit annotated function.
    When using CUDA graphs, the kernel must be launched in a non-default stream.

    :param callable: The function to benchmark
    :type callable: Callable
    :param warmup_iterations: Number of warmup iterations, defaults to 10
    :type warmup_iterations: int, optional
    :param iterations: Number of benchmark iterations, defaults to 100
    :type iterations: int, optional
    :param stream: Stream kernel is launched in, defaults to CUDA stream default
    :type stream: CUstream, None
    :param kernel_arguments: Kernel arguments to launch callable with, defaults to None
    :type kernel_arguments: JitArguments, None
    :param workspace_generator: Function that returns kernel arguments, defaults to None
    :type workspace_generator: Callable
    :param workspace_count: Number of workspaces (arguments) to loop through, looping through enough workspaces will keep the L2 cache cold
    :type workspace_count: int, optional
    :param use_cuda_graphs: Whether to use cuda graphs, defaults to False
    :type use_cuda_graphs: bool, optional

    :return: The benchmark time in microseconds
    :rtype: float
    """

    if stream is None:
        stream = cuda_driver.CUstream(cuda_driver.CUstream_flags.CU_STREAM_DEFAULT)

    if workspace_count < 1:
        raise ValueError("workspace_count must be at least 1")

    if workspace_generator is None:
        # If no workspace generator is provided, we need a single workspace
        if workspace_count != 1:
            raise ValueError("Need a single workspace if not providing a generator")

        # If no workspace generator is provided, we need a kernel_argument
        if kernel_arguments is None:
            raise ValueError(
                "Please pass a kernel argument if not providing a generator"
            )
        workspace_generator = lambda: kernel_arguments

    workspaces = [workspace_generator() for _ in range(workspace_count)]

    for workspace in workspaces:
        if not isinstance(workspace, JitArguments):
            raise TypeError(
                "workspace_generator and/or kernel_arguments should use JitArguments type"
            )

    def _loop_and_call_kernel(iterations: int, workspace_index: int = 0):
        # Launch `iterations` calls, cycling through the workspaces so the
        # same buffers are not reused back-to-back (keeps L2 cold).
        for _ in range(iterations):
            current_workspace = workspaces[workspace_index]
            callable(*current_workspace.args, **current_workspace.kwargs)
            workspace_index = (workspace_index + 1) % workspace_count
        return workspace_index

    # Create CUDA events for timing; destroyed in the finally block below so
    # they are not leaked if anything raises during benchmarking.
    err, start_event = cuda_driver.cuEventCreate(
        cuda_driver.CUevent_flags.CU_EVENT_DEFAULT
    )
    _cuda_success(err, "Error on creating event")
    err, end_event = cuda_driver.cuEventCreate(
        cuda_driver.CUevent_flags.CU_EVENT_DEFAULT
    )
    _cuda_success(err, "Error on creating event")

    elapsed_time = float("nan")

    try:
        if use_cuda_graphs:
            # CUDA graphs require a precompiled JIT executor so that stream
            # capture does not pick up host-side compilation work.
            if not isinstance(callable, cutlass.base_dsl.jit_executor.JitExecutor):
                raise TypeError(
                    "Function must be precompiled to be used with CUDA Graphs"
                )

            # Stream capture is not legal on the default stream.
            if int(stream) == int(cuda_driver.CUstream_flags.CU_STREAM_DEFAULT):
                raise ValueError(
                    "Measuring with CUDA Graphs requires executing in a non-default stream"
                )

            workspace_index = 0

            # Capture warmup graph
            err = cuda_runtime.cudaStreamBeginCapture(
                stream,
                cuda_runtime.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal,
            )
            _cuda_success(err, "Error on stream capture")

            workspace_index = _loop_and_call_kernel(warmup_iterations)
            err, gwarm = cuda_runtime.cudaStreamEndCapture(stream)
            _cuda_success(err, "Error on stream capture")

            # Get number of nodes in warmup graph to check it matches what is expected
            err, _, num_nodes = cuda_runtime.cudaGraphGetNodes(gwarm)
            _cuda_success(err, "Error on querying graph")
            # Check is >= since we may launch multiple kernels in one host function
            if num_nodes < warmup_iterations:
                raise ValueError(
                    "CUDA stream passed to benchmark does not match the stream the kernel was launched in"
                )

            # Capture profiling graph
            err = cuda_runtime.cudaStreamBeginCapture(
                stream,
                cuda_runtime.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal,
            )
            _cuda_success(err, "Error on stream capture")
            _loop_and_call_kernel(iterations, workspace_index)
            err, gprofile = cuda_runtime.cudaStreamEndCapture(stream)
            _cuda_success(err, "Error on stream capture")

            # Instantiate executable graphs, then free the capture graphs.
            # (The original code rebound gwarm/gprofile to the exec handles,
            # leaking the captured graphs.)
            err, gwarm_exec = cuda_runtime.cudaGraphInstantiate(gwarm, 0)
            _cuda_success(err, "Error on graph instantiation")
            err, gprofile_exec = cuda_runtime.cudaGraphInstantiate(gprofile, 0)
            _cuda_success(err, "Error on graph instantiation")
            err = cuda_runtime.cudaGraphDestroy(gwarm)
            _cuda_success(err, "Error on destroying graph")
            err = cuda_runtime.cudaGraphDestroy(gprofile)
            _cuda_success(err, "Error on destroying graph")

            # Launch warmup graph
            err = cuda_runtime.cudaGraphLaunch(gwarm_exec, stream)
            _cuda_success(err, "Error on graph launch")

            # Record start time
            err = cuda_driver.cuEventRecord(start_event, stream)
            _cuda_success(err, "Error on recording event")

            # Launch profiling graph
            err = cuda_runtime.cudaGraphLaunch(gprofile_exec, stream)
            _cuda_success(err, "Error on graph launch")

            # Record end time and wait for completion
            err = cuda_driver.cuEventRecord(end_event, stream)
            _cuda_success(err, "Error on recording event")
            err = cuda_driver.cuEventSynchronize(end_event)
            _cuda_success(err, "Error on synchronizing event")

            # Get elapsed time (milliseconds)
            err, elapsed_time = cuda_driver.cuEventElapsedTime(start_event, end_event)
            _cuda_success(err, "Error on querying event")

            # Destroy executable graphs
            err = cuda_runtime.cudaGraphExecDestroy(gwarm_exec)
            _cuda_success(err, "Error on destroying graph")
            err = cuda_runtime.cudaGraphExecDestroy(gprofile_exec)
            _cuda_success(err, "Error on destroying graph")

        else:
            # Sanity check: the kernel must actually run in the stream we time.
            if int(stream) != int(
                cuda_driver.CUstream_flags.CU_STREAM_DEFAULT
            ) and not _does_kernel_use_stream(
                callable, stream, *workspaces[0].args, **workspaces[0].kwargs
            ):
                raise ValueError(
                    "CUDA stream passed to benchmark does not match the stream the kernel was launched in"
                )

            # Not using graphs
            # Warmup
            workspace_index = _loop_and_call_kernel(warmup_iterations)
            # Record start event
            err = cuda_driver.cuEventRecord(start_event, stream)
            _cuda_success(err, "Error on recording event")
            _loop_and_call_kernel(iterations, workspace_index)
            # Record end event
            err = cuda_driver.cuEventRecord(end_event, stream)
            _cuda_success(err, "Error on recording event")
            # Synchronize end event
            err = cuda_driver.cuEventSynchronize(end_event)
            _cuda_success(err, "Error on synchronizing event")
            err, elapsed_time = cuda_driver.cuEventElapsedTime(start_event, end_event)
            _cuda_success(err, "Error on querying event")
    finally:
        # Destroy events
        err = cuda_driver.cuEventDestroy(start_event)
        _cuda_success(err, "Error on destroying event")
        err = cuda_driver.cuEventDestroy(end_event)
        _cuda_success(err, "Error on destroying event")

    # cuEventElapsedTime reports milliseconds; convert to microseconds per call.
    return elapsed_time / iterations * 1e3
585
+
586
+
587
def get_workspace_count(
    one_workspace_bytes: int, warmup_iterations: int, iterations: int
) -> int:
    """Calculate the number of workspaces needed to fill L2 cache.

    Cycling through at least this many distinct workspaces keeps the L2
    cache cold between benchmark iterations.

    :param one_workspace_bytes: Size of one workspace in bytes (must be positive)
    :type one_workspace_bytes: int
    :param warmup_iterations: Number of warmup iterations
    :type warmup_iterations: int
    :param iterations: Number of iterations
    :type iterations: int
    :return: Number of workspaces needed (at least 1, at most
        ``warmup_iterations + iterations``)
    :rtype: int
    :raises ValueError: If ``one_workspace_bytes`` is not positive.
    """
    # Guard against ZeroDivisionError (and nonsensical negative sizes) in the
    # ceiling division below.
    if one_workspace_bytes <= 0:
        raise ValueError("one_workspace_bytes must be positive")
    num_l2_cache_bytes = cutlass.utils.HardwareInfo().get_l2_cache_size_in_bytes()
    # Ceiling division: how many workspaces are needed to cover the L2 cache.
    needed_to_fill_l2 = (
        num_l2_cache_bytes + one_workspace_bytes - 1
    ) // one_workspace_bytes
    return max(
        1,
        min(
            warmup_iterations + iterations,  # Don't create more workspaces than needed
            needed_to_fill_l2,
        ),
    )
610
+
build/torch210-cxx11-cu126-aarch64-linux/include/third-party/cutlass/python/CuTeDSL/cutlass/cute/typing.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: LicenseRef-NvidiaProprietary
3
+ #
4
+ # Use of this software is governed by the terms and conditions of the
5
+ # NVIDIA End User License Agreement (EULA), available at:
6
+ # https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
7
+ #
8
+ # Any use, reproduction, disclosure, or distribution of this software
9
+ # and related documentation outside the scope permitted by the EULA
10
+ # is strictly prohibited.
11
+
12
+ from abc import ABC, abstractmethod
13
+ from typing import ForwardRef, Tuple, Union, Any, Type, List
14
+
15
+ from cutlass.base_dsl.typing import *
16
+
17
+ from cutlass._mlir import ir
18
+ import cutlass._mlir.extras.types as T
19
+ from cutlass._mlir.dialects.cute import AddressSpace
20
+
21
+
22
# Scalar integer: either a plain Python int or a DSL Integer value.
Int = Union[int, Integer]


# Forward reference only; the ScaledBasis class is defined elsewhere in the DSL.
ScaledBasis = ForwardRef("ScaledBasis")


# Recursive (arbitrarily nested) tuple types used throughout CuTe.
IntTuple = Union[Int, Tuple["IntTuple", ...]]
Shape = Union[Int, Tuple["Shape", ...]]
# Strides may additionally contain ScaledBasis entries.
Stride = Union[Int, ScaledBasis, Tuple["Stride", ...]]
# Coordinates may contain None entries.
Coord = Union[Int, None, Tuple["Coord", ...]]
32
+
33
+
34
class Layout(ir.Value):
    """An MLIR value representing a CuTe layout (a shape/stride pair).

    Method bodies here are stubs (``...``); concrete behavior is supplied by
    the runtime implementation elsewhere in the DSL.
    """

    def __init__(self, op_result):
        super().__init__(op_result)

    def __str__(self): ...

    def get_hier_coord(self, idx) -> Coord:
        """Return the (hierarchical) ND logical coordinate corresponding to the linear index"""
        ...

    # NOTE(review): properties with extra keyword-only parameters (loc/ip)
    # cannot receive them through attribute access; presumably these mirror a
    # functional API elsewhere — confirm against the implementation.
    @property
    def shape(self, *, loc=None, ip=None) -> Shape: ...

    @property
    def stride(self, *, loc=None, ip=None) -> Stride: ...
49
+
50
+
51
+ Tile = Union[Int, None, Layout, Tuple["Tile", ...]]
52
+
53
+ # XTuple is super set of above types
54
+ XTuple = Union[IntTuple, Shape, Stride, Coord, Tile]
55
+
56
+ Tiler = Union[Shape, Layout, Tile]
57
+
58
+
59
class Pointer(ABC):
    """
    Abstract base class for CuTe jit function and runtime _Pointer
    """

    @property
    def value_type(self) -> Type[Numeric]:
        # Alias of `dtype`.
        return self.dtype

    @property
    def dtype(self) -> Type[Numeric]:
        """Element type this pointer points to (stub; implemented by subclasses)."""
        ...

    def align(self, min_align: int) -> "Pointer":
        """Return a Pointer from this one given a minimum alignment (stub)."""
        ...

    # The three dunder methods below form the value-marshalling protocol used
    # by the DSL to convert Python objects to/from MLIR types and values.
    def __get_mlir_types__(self) -> List[ir.Type]: ...

    def __extract_mlir_values__(self) -> List[ir.Value]: ...

    def __new_from_mlir_values__(self, values) -> "Pointer": ...
78
+
79
+
80
class Tensor(ABC):
    """
    Abstract base class for CuTe jit function and runtime _Tensor

    A CuTe Tensor is iterator with layout

    :Examples:

    Create tensor from torch.tensor with Host Runtime:

    .. code-block:: python

        >>> import torch
        >>> from cutlass.cute.runtime import from_dlpack
        >>> mA = from_dlpack(torch.tensor([1, 3, 5], dtype=torch.int32))
        >>> mA.shape
        (3,)
        >>> mA.stride
        (1,)
        >>> mA.layout
        (3,):(1,)

    Define JIT function:

    .. code-block:: python

        @cute.jit
        def add(a: Tensor, b: Tensor, res: Tensor): ...

    Call JIT function from python:

    .. code-block:: python

        >>> import torch
        >>> a = torch.tensor([1, 3, 5], dtype=torch.int32)
        >>> b = torch.tensor([2, 4, 6], dtype=torch.int32)
        >>> c = torch.zeros([3], dtype=torch.int32)
        >>> mA = from_dlpack(a)
        >>> mB = from_dlpack(b)
        >>> mC = from_dlpack(c)
        >>> add(mA, mB, mC)
        >>> c
        tensor([3, 7, 11], dtype=torch.int32)
    """

    def __str__(self): ...

    # Element access; concrete subclasses must implement indexing.
    @abstractmethod
    def __getitem__(self, idx) -> Union["Tensor", ir.Value, IntTuple]: ...

    @abstractmethod
    def __setitem__(self, idx, value): ...

    # Element type of the tensor; settable (e.g. for reinterpretation).
    @property
    @abstractmethod
    def element_type(self) -> Union[Type[Numeric], Type[IntTuple]]: ...

    @element_type.setter
    def element_type(self, new_type): ...

    # Address space the tensor's data lives in.
    @property
    @abstractmethod
    def memspace(self) -> AddressSpace: ...

    # The underlying iterator (pointer-like engine) of the tensor.
    @property
    @abstractmethod
    def iterator(self): ...

    @property
    def layout(self) -> Union[Layout, "ComposedLayout"]: ...

    @property
    def shape(self) -> Shape: ...

    # Bulk load/store of the tensor's contents as a TensorSSA value (stubs).
    def load(self, *, loc=None, ip=None) -> "TensorSSA": ...

    def store(self, data: "TensorSSA", *, loc=None, ip=None): ...

    # Mark the layout (or one compact-shape mode) as dynamic for JIT
    # specialization purposes (stubs; implemented by concrete subclasses).
    def mark_layout_dynamic(self, leading_dim: int | None = None) -> "Tensor": ...

    def mark_compact_shape_dynamic(
        self,
        mode: int,
        stride_order: tuple[int, ...] | None = None,
        divisibility: int = 1,
    ) -> "Tensor": ...

    @abstractmethod
    def fill(self, value: Numeric) -> None: ...
169
+
170
+
171
# Public API of this module: the CuTe algebra types defined here plus the
# numeric scalar types re-exported from cutlass.base_dsl.typing. Grouping is
# cosmetic only; `from ... import *` semantics are unchanged.
__all__ = [
    # CuTe algebra types
    "Coord",
    "IntTuple",
    "Layout",
    "Pointer",
    "Shape",
    "Stride",
    "Tensor",
    "Tile",
    "Tiler",
    "XTuple",
    # Numeric scalar types
    "Numeric",
    "Integer",
    "Boolean",
    "Int8",
    "Int16",
    "Int32",
    "Int64",
    "Uint8",
    "Uint16",
    "Uint32",
    "Uint64",
    "Float",
    "Float16",
    "BFloat16",
    "TFloat32",
    "Float32",
    "Float64",
    "Float8E5M2",
    "Float8E4M3FN",
    "Float8E4M3B11FNUZ",
    "Float8E4M3",
    "Float8E8M0FNU",
    "Float4E2M1FN",
    "Float6E2M3FN",
]