danieldk HF Staff commited on
Commit
f25859d
·
verified ·
1 Parent(s): d2d3257

Build uploaded using `kernels`.

Browse files
build/torch-cuda/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .triton_a100 import kernel_a100
2
+ from .triton_h100 import kernel_h100
3
+ from .triton_b200 import kernel_b200
4
+ from .trimul_mi300 import kernel_mi300
5
+ from .trimul_global import kernel_global
6
+
7
+ __all__ = ["kernel_a100", "kernel_h100", "kernel_b200", "kernel_mi300", "kernel_global"]
build/torch-cuda/_ops.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ ops = torch.ops._trimul_gpumode_176b4e4
3
+
4
def add_op_namespace_prefix(op_name: str):
    """Return *op_name* qualified with this build's ``torch.ops`` namespace.

    The result is in the ``namespace::op`` form that the dispatcher expects.
    """
    qualified = f"_trimul_gpumode_176b4e4::{op_name}"
    return qualified
build/torch-cuda/metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"python-depends":[]}
build/torch-cuda/task.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Type definitions for TriMul task.
3
+
4
+ Input: Tuple of (input_tensor, mask, weights, config)
5
+ - input_tensor: Input tensor of shape [batch_size, seq_len, seq_len, dim]
6
+ - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
7
+ - weights: Dictionary containing model weights
8
+ - config: Dictionary containing model configuration parameters
9
+
10
+ Output: Output tensor of shape [batch_size, seq_len, seq_len, dim]
11
+ """
12
+
13
+ import torch
14
+ from typing import Tuple, Dict, Any
15
+
16
+ # Input type: (input_tensor, mask, weights, config)
17
+ input_t = Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], Dict[str, Any]]
18
+
19
+ # Output type: output tensor
20
+ output_t = torch.Tensor
build/torch-cuda/trimul_global.py ADDED
@@ -0,0 +1,971 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from utils import make_match_reference, DisableCuDNNTF32
2
+ from .task import input_t, output_t
3
+
4
+ import torch
5
+ from torch import nn, einsum
6
+ import math
7
+ import os
8
+ import requests
9
+
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ # The flag below controls whether to allow TF32 on matmul. This flag defaults to False
14
+ # in PyTorch 1.12 and later.
15
+ torch.backends.cuda.matmul.allow_tf32 = True
16
+
17
+ # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
18
+ torch.backends.cudnn.allow_tf32 = True
19
+
20
+ # Set allocator for TMA descriptors (required for on-device TMA)
21
+ def alloc_fn(size: int, alignment: int, stream=None):
22
+ return torch.empty(size, device="cuda", dtype=torch.int8)
23
+
24
+ triton.set_allocator(alloc_fn)
25
+
26
+ # os.environ['TRITON_PRINT_AUTOTUNING'] = '1'
27
+ # os.environ['MLIR_ENABLE_DIAGNOSTICS'] = 'warnings,remarks'
28
+
29
+ # Reference code in PyTorch
30
class TriMul(nn.Module):
    """Triangle multiplicative update — reference PyTorch implementation.

    Based on
    https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
    ):
        super().__init__()

        # Pre-normalisation over the channel dimension.
        self.norm = nn.LayerNorm(dim)

        # Linear projections into the hidden dimension (no biases anywhere).
        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)

        # Sigmoid-gated branches.
        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)

        # Output normalisation and projection back to `dim`.
        self.to_out_norm = nn.LayerNorm(hidden_dim)
        self.to_out = nn.Linear(hidden_dim, dim, bias=False)

    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """
        x: [bs, seq_len, seq_len, dim]
        mask: [bs, seq_len, seq_len]

        Returns:
            output: [bs, seq_len, seq_len, dim]
        """
        normed = self.norm(x)

        # Left/right projections, zeroed where the pair mask is 0.
        pair_mask = mask.unsqueeze(-1)
        lhs = self.left_proj(normed) * pair_mask
        rhs = self.right_proj(normed) * pair_mask

        # Sigmoid gates, all computed from the normalised input.
        gate_left = self.left_gate(normed).sigmoid()
        gate_right = self.right_gate(normed).sigmoid()
        gate_out = self.out_gate(normed).sigmoid()

        lhs = lhs * gate_left
        rhs = rhs * gate_right

        # Pairwise contraction over the shared k axis:
        #   fused[b, i, j, d] = sum_k lhs[b, i, k, d] * rhs[b, j, k, d]
        fused = einsum('... i k d, ... j k d -> ... i j d', lhs, rhs)

        fused = self.to_out_norm(fused)
        fused = fused * gate_out
        return self.to_out(fused)
92
+
93
@triton.jit
def triton_sigmoid(x):
    """Logistic function computed as 1 / (1 + e^-x)."""
    denom = 1.0 + tl.exp(-x)
    return 1.0 / denom
99
+
100
def two_mm_kernel_configs_wrapper():
    """Select a per-GPU configuration provider for ``two_mm_kernel``.

    Returns one of two kinds of callable, depending on the current device:

    * capability 9 (H100): a selector ``(B, seq_len, dim) -> (BLOCK_M,
      BLOCK_N, BLOCK_K, num_stages, num_warps)`` of a pre-tuned
      configuration.  ``two_mm`` calls the returned callable with
      ``(B, seq_len, K)`` and unpacks a 5-tuple on this path, and
      ``two_mm_kernel_wrapper`` skips autotuning on capability 9, so the
      selector is the only provider used there.
    * any other device: a zero-argument function returning a list of
      ``triton.Config`` candidates for ``triton.autotune``.
    """
    if torch.cuda.get_device_capability() == (12, 0):
        # Small autotune search space for capability-12.0 devices.
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [16, 32]:
                for BLOCK_N in [16, 32, 64]:
                    for BLOCK_K in [16, 32, 64]:
                        for num_stages in [2, 3]:
                            configs.append(triton.Config({
                                'BLOCK_M': BLOCK_M,
                                'BLOCK_N': BLOCK_N,
                                'BLOCK_K': BLOCK_K,
                                'GROUP_SIZE_M': 8
                            }, num_stages=num_stages, num_warps=8))
            return configs

    elif torch.cuda.get_device_capability()[0] == 9:
        # H100: pre-tuned (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps)
        # keyed by (batch, seq_len, dim).
        def get_optimal_two_mm_config_h100(B, seq_len, dim):
            configs = {
                (1, 128, 128): (128, 64, 128, 2, 8),
                (1, 128, 256): (128, 64, 128, 2, 8),
                (1, 128, 384): (128, 64, 64, 3, 8),
                (1, 128, 512): (128, 64, 64, 3, 8),
                (1, 128, 768): (128, 64, 64, 3, 8),
                (1, 128, 1024): (128, 64, 64, 3, 8),
                (1, 256, 128): (128, 64, 128, 2, 8),
                (1, 256, 256): (128, 64, 128, 2, 8),
                (1, 256, 384): (128, 64, 64, 3, 8),
                (1, 256, 512): (128, 64, 64, 3, 8),
                (1, 256, 768): (128, 64, 64, 3, 8),
                (1, 256, 1024): (128, 64, 64, 3, 8),
                (1, 512, 128): (128, 64, 128, 2, 8),
                (1, 512, 256): (128, 64, 128, 2, 8),
                (1, 512, 384): (128, 64, 128, 2, 8),
                (1, 512, 512): (128, 64, 128, 2, 8),
                (1, 512, 768): (128, 64, 64, 3, 8),
                (1, 512, 1024): (128, 64, 64, 3, 8),
                (1, 1024, 128): (128, 64, 128, 2, 8),
                (1, 1024, 256): (128, 64, 64, 2, 8),
                (1, 1024, 384): (128, 64, 128, 2, 8),
                (1, 1024, 512): (128, 64, 128, 2, 8),
                (1, 1024, 768): (128, 64, 128, 2, 8),
                (1, 1024, 1024): (128, 64, 128, 2, 8),
                (2, 128, 128): (128, 64, 128, 2, 8),
                (2, 128, 256): (128, 64, 128, 2, 8),
                (2, 128, 384): (128, 64, 64, 3, 8),
                (2, 128, 512): (128, 64, 64, 3, 8),
                (2, 128, 768): (128, 64, 64, 3, 8),
                (2, 128, 1024): (128, 64, 64, 3, 8),
                (2, 256, 128): (128, 64, 128, 2, 8),
                (2, 256, 256): (128, 64, 128, 2, 8),
                (2, 256, 384): (128, 64, 128, 2, 8),
                (2, 256, 512): (128, 64, 128, 2, 8),
                (2, 256, 768): (128, 64, 64, 3, 8),
                (2, 256, 1024): (128, 64, 64, 3, 8),
                (2, 512, 128): (128, 64, 128, 2, 8),
                (2, 512, 256): (128, 64, 128, 2, 8),
                (2, 512, 384): (128, 64, 128, 2, 8),
                (2, 512, 512): (128, 64, 128, 2, 8),
                (2, 512, 768): (128, 64, 128, 2, 8),
                (2, 512, 1024): (128, 64, 128, 2, 8),
                (2, 1024, 128): (128, 64, 128, 2, 8),
                (2, 1024, 256): (128, 64, 128, 2, 8),
                (2, 1024, 384): (128, 64, 128, 2, 8),
                (2, 1024, 512): (128, 64, 128, 2, 8),
                (2, 1024, 768): (128, 64, 128, 2, 8),
                (2, 1024, 1024): (128, 64, 128, 2, 8),
            }
            return configs.get((B, seq_len, dim), (64, 64, 32, 2, 8))  # default fallback

        # BUGFIX: this branch previously fell through to the final
        # `return two_mm_kernel_configs`, handing the H100 caller a
        # zero-argument autotune-config builder even though `two_mm` invokes
        # the returned callable as `fn(B, seq_len, K)` and unpacks a 5-tuple
        # (a TypeError at launch time).  Return the per-shape selector
        # instead; autotuning is skipped for capability 9 in
        # `two_mm_kernel_wrapper`, so the list builder is never needed here.
        return get_optimal_two_mm_config_h100

    elif torch.cuda.get_device_capability()[0] == 10 and False:
        # B200: intentionally disabled via `and False`; kept for reference.
        def get_optimal_two_mm_config(B, seq_len, dim):
            configs = {
                (1, 128, 128): (64, 128, 64, 2, 8),
                (1, 128, 256): (128, 64, 128, 2, 8),
                (1, 128, 384): (128, 64, 128, 2, 8),
                (1, 128, 512): (128, 64, 128, 2, 8),
                (1, 128, 768): (128, 64, 64, 3, 8),
                (1, 128, 1024): (128, 64, 64, 3, 8),
                (1, 256, 128): (128, 64, 128, 2, 8),
                (1, 256, 256): (128, 64, 128, 2, 8),
                (1, 256, 384): (128, 64, 128, 2, 8),
                (1, 256, 512): (128, 64, 64, 3, 8),
                (1, 256, 768): (128, 64, 64, 3, 8),
                (1, 256, 1024): (128, 64, 64, 3, 8),
                (1, 512, 128): (128, 64, 128, 2, 8),
                (1, 512, 256): (128, 64, 128, 2, 8),
                (1, 512, 384): (128, 64, 128, 2, 8),
                (1, 512, 512): (128, 64, 128, 2, 8),
                (1, 512, 768): (128, 64, 64, 3, 8),
                (1, 512, 1024): (128, 64, 64, 3, 8),
                (1, 1024, 128): (128, 64, 128, 2, 8),
                (1, 1024, 256): (128, 64, 128, 2, 8),
                (1, 1024, 384): (128, 64, 128, 2, 8),
                (1, 1024, 512): (128, 64, 128, 2, 8),
                (1, 1024, 768): (128, 64, 64, 3, 8),
                (1, 1024, 1024): (128, 64, 64, 3, 8),
                (2, 128, 128): (128, 64, 128, 2, 8),
                (2, 128, 256): (128, 64, 128, 2, 8),
                (2, 128, 384): (128, 64, 128, 2, 8),
                (2, 128, 512): (128, 64, 64, 3, 8),
                (2, 128, 768): (128, 64, 64, 3, 8),
                (2, 128, 1024): (128, 64, 64, 3, 8),
                (2, 256, 128): (128, 64, 128, 2, 8),
                (2, 256, 256): (128, 64, 128, 2, 8),
                (2, 256, 384): (128, 64, 128, 2, 8),
                (2, 256, 512): (128, 64, 64, 3, 8),
                (2, 256, 768): (128, 64, 64, 3, 8),
                (2, 256, 1024): (128, 64, 64, 3, 8),
                (2, 512, 128): (128, 64, 128, 2, 8),
                (2, 512, 256): (128, 64, 128, 2, 8),
                (2, 512, 384): (128, 64, 128, 2, 8),
                (2, 512, 512): (128, 64, 128, 2, 8),
                (2, 512, 768): (128, 64, 64, 3, 8),
                (2, 512, 1024): (128, 64, 64, 3, 8),
                (2, 1024, 128): (128, 64, 128, 2, 8),
                (2, 1024, 256): (128, 64, 128, 2, 8),
                (2, 1024, 384): (128, 64, 128, 2, 8),
                (2, 1024, 512): (128, 64, 128, 2, 8),
                (2, 1024, 768): (128, 64, 64, 3, 8),
                (2, 1024, 1024): (128, 64, 64, 3, 8),
            }
            return configs.get((B, seq_len, dim), (64, 64, 32, 2, 8))  # default fallback

        def two_mm_kernel_configs():
            # This function is kept for compatibility but will be overridden
            return [
                triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
            ]
    elif torch.cuda.get_device_capability()[0] == 8:
        # A100
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [64]:
                for BLOCK_N in [64, 128]:
                    for BLOCK_K in [16]:
                        for num_stages in [3, 4]:
                            for num_warps in [4, 8]:
                                configs.append(triton.Config({
                                    'BLOCK_M': BLOCK_M,
                                    'BLOCK_N': BLOCK_N,
                                    'BLOCK_K': BLOCK_K,
                                    'GROUP_SIZE_M': 8
                                }, num_stages=num_stages, num_warps=num_warps))
            return configs
    else:
        # Generic fallback search space.
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [64, 128]:
                for BLOCK_N in [64, 128]:
                    for BLOCK_K in [64, 128]:
                        for num_stages in [2, 3]:
                            configs.append(triton.Config({
                                'BLOCK_M': BLOCK_M,
                                'BLOCK_N': BLOCK_N,
                                'BLOCK_K': BLOCK_K,
                                'GROUP_SIZE_M': 8
                            }, num_stages=num_stages, num_warps=8))
            return configs

    return two_mm_kernel_configs
271
+
272
def two_mm_kernel_wrapper():
    """Build (and, for most devices, autotune) the fused ``two_mm`` Triton kernel.

    Two variants of the same persistent kernel are defined:
      * capability 8 (A100): standard ``tl.load``/``tl.store`` addressing;
      * everything else: on-device TMA tensor descriptors for the loads.
    Both compute, per output tile, five A@B.T products at once (left/right
    projections, left/right gates, out gate), apply the pair mask and the
    sigmoid gating in the epilogue, and write C1, C2 and D.
    """
    if torch.cuda.get_device_capability()[0] == 8:
        @triton.jit
        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
            # Persistent kernel using standard tl.load operations
            start_pid = tl.program_id(axis=0)
            num_pid_m = tl.cdiv(M, BLOCK_M)
            num_pid_n = tl.cdiv(N, BLOCK_N)
            k_tiles = tl.cdiv(K, BLOCK_K)
            num_tiles = num_pid_m * num_pid_n

            # tile_id_c is used in the epilogue to break the dependency between
            # the prologue and the epilogue
            tile_id_c = start_pid - NUM_SMS
            num_pid_in_group = GROUP_SIZE_M * num_pid_n

            # Persistent loop over tiles
            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
                # Calculate PID for this tile using improved swizzling
                group_id = tile_id // num_pid_in_group
                first_pid_m = group_id * GROUP_SIZE_M
                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
                pid_m = first_pid_m + (tile_id % group_size_m)
                pid_n = (tile_id % num_pid_in_group) // group_size_m

                # Calculate block offsets
                offs_am = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
                offs_bn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
                offs_k = tl.arange(0, BLOCK_K)

                # Initialize accumulators for all outputs
                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

                # Main computation loop over K dimension
                for ki in range(k_tiles):
                    k_start = ki * BLOCK_K
                    k_offsets = k_start + offs_k

                    # Create pointers for A matrix (2D flattened view)
                    a_ptrs = a_ptr + offs_am[:, None] * stride_a2 + k_offsets[None, :] * stride_a3
                    a_mask = (offs_am[:, None] < M) & (k_offsets[None, :] < K)

                    # Create pointers for B matrices [N, K] layout
                    b1_ptrs = b1_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b2_ptrs = b2_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b3_ptrs = b3_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b4_ptrs = b4_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b5_ptrs = b5_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b_mask = (offs_bn[:, None] < N) & (k_offsets[None, :] < K)

                    # Load blocks from A and all weight matrices using standard tl.load
                    a = tl.load(a_ptrs, mask=a_mask, other=0.0)
                    b1 = tl.load(b1_ptrs, mask=b_mask, other=0.0)
                    b2 = tl.load(b2_ptrs, mask=b_mask, other=0.0)
                    b3 = tl.load(b3_ptrs, mask=b_mask, other=0.0)
                    b4 = tl.load(b4_ptrs, mask=b_mask, other=0.0)
                    b5 = tl.load(b5_ptrs, mask=b_mask, other=0.0)

                    # Perform matrix multiplications using TF32
                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T

                # Store results using separate tile_id_c for epilogue
                tile_id_c += NUM_SMS
                group_id = tile_id_c // num_pid_in_group
                first_pid_m = group_id * GROUP_SIZE_M
                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
                pid_m = first_pid_m + (tile_id_c % group_size_m)
                pid_n = (tile_id_c % num_pid_in_group) // group_size_m

                # Calculate output offsets and pointers
                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

                # Create masks for bounds checking
                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)

                # Calculate pointer addresses using 4D strides
                # NOTE(review): stride_cm/stride_cn are assigned but unused
                # below — c_offsets is built from the full 4D C strides.
                stride_cm = stride_c2  # Stride to next element in flattened M dimension
                stride_cn = stride_c3  # N is the innermost dimension

                # For D tensor: use separate D strides
                stride_dm = stride_d2  # Stride to next element in flattened M dimension
                stride_dn = stride_d3  # N is the innermost dimension

                # Decompose the flattened row index into (batch, i, j).
                off_c_batch = offs_cm // (seq_len * seq_len)
                off_c_sl1 = (offs_cm // seq_len) % seq_len
                off_c_sl2 = offs_cm % seq_len
                off_c_dim = offs_cn

                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
                c_mask = d_mask

                c1_ptrs = c1_ptr + c_offsets
                c2_ptrs = c2_ptr + c_offsets
                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]

                # One mask value per flattened (b, i, j) output row.
                # NOTE(review): load has no `other=`; out-of-range lanes are
                # undefined, but those rows are excluded by c_mask/d_mask at
                # store time — confirm no NaN propagation path exists.
                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))

                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
                # Apply masking only to left_proj and right_proj results (C1, C2)
                accumulator1 = tl.where(mask_2d, accumulator1, 0)
                accumulator2 = tl.where(mask_2d, accumulator2, 0)

                # Apply sigmoid to gate values
                left_gate_sigmoid = triton_sigmoid(accumulator3)
                right_gate_sigmoid = triton_sigmoid(accumulator4)
                accumulator_d = triton_sigmoid(accumulator_d)

                # Apply elementwise multiplication with gated values
                # C1 = left * left_gate, C2 = right * right_gate
                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate

                # Convert to appropriate output dtype and store with normal tl.store
                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
                d = accumulator_d.to(d_ptr.dtype.element_ty)

                tl.store(c1_ptrs, c1, mask=c_mask)
                tl.store(c2_ptrs, c2, mask=c_mask)
                tl.store(d_ptrs, d, mask=d_mask)
    else:
        @triton.jit
        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
            # Persistent kernel using on-device TMA descriptors
            start_pid = tl.program_id(axis=0)
            num_pid_m = tl.cdiv(M, BLOCK_M)
            num_pid_n = tl.cdiv(N, BLOCK_N)
            k_tiles = tl.cdiv(K, BLOCK_K)
            num_tiles = num_pid_m * num_pid_n

            # Create on-device TMA descriptors
            a_desc = tl._experimental_make_tensor_descriptor(
                a_ptr,
                shape=[M, K],
                strides=[stride_a2, stride_a3],
                block_shape=[BLOCK_M, BLOCK_K],
            )
            b1_desc = tl._experimental_make_tensor_descriptor(
                b1_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )
            b2_desc = tl._experimental_make_tensor_descriptor(
                b2_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )
            b3_desc = tl._experimental_make_tensor_descriptor(
                b3_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )
            b4_desc = tl._experimental_make_tensor_descriptor(
                b4_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )
            b5_desc = tl._experimental_make_tensor_descriptor(
                b5_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )

            # tile_id_c is used in the epilogue to break the dependency between
            # the prologue and the epilogue
            tile_id_c = start_pid - NUM_SMS
            num_pid_in_group = GROUP_SIZE_M * num_pid_n

            # Persistent loop over tiles
            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
                # Calculate PID for this tile using improved swizzling
                group_id = tile_id // num_pid_in_group
                first_pid_m = group_id * GROUP_SIZE_M
                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
                pid_m = first_pid_m + (tile_id % group_size_m)
                pid_n = (tile_id % num_pid_in_group) // group_size_m

                # Calculate block offsets
                offs_am = pid_m * BLOCK_M
                offs_bn = pid_n * BLOCK_N

                # Initialize accumulators for all outputs
                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

                # Main computation loop over K dimension
                for ki in range(k_tiles):
                    offs_k = ki * BLOCK_K
                    # Load blocks from A and all weight matrices using on-device TMA
                    a = a_desc.load([offs_am, offs_k])
                    b1 = b1_desc.load([offs_bn, offs_k])
                    b2 = b2_desc.load([offs_bn, offs_k])
                    b3 = b3_desc.load([offs_bn, offs_k])
                    b4 = b4_desc.load([offs_bn, offs_k])
                    b5 = b5_desc.load([offs_bn, offs_k])

                    # Perform matrix multiplications using TF32
                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T

                # Store results using separate tile_id_c for epilogue
                tile_id_c += NUM_SMS
                group_id = tile_id_c // num_pid_in_group
                first_pid_m = group_id * GROUP_SIZE_M
                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
                pid_m = first_pid_m + (tile_id_c % group_size_m)
                pid_n = (tile_id_c % num_pid_in_group) // group_size_m

                # Calculate output offsets and pointers
                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

                # Create masks for bounds checking
                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)

                # Calculate pointer addresses using 4D strides
                # For C tensors: compute effective 2D strides from 4D strides
                # Output tensor is [B, I, J, N], flattened to [M, N] where M = B*I*J
                # NOTE(review): stride_cm/stride_cn are assigned but unused
                # below — c_offsets is built from the full 4D C strides.
                stride_cm = stride_c2  # Stride to next element in flattened M dimension
                stride_cn = stride_c3  # N is the innermost dimension

                # For D tensor: use separate D strides
                stride_dm = stride_d2  # Stride to next element in flattened M dimension
                stride_dn = stride_d3  # N is the innermost dimension

                # Decompose the flattened row index into (batch, i, j).
                off_c_batch = offs_cm // (seq_len * seq_len)
                off_c_sl1 = (offs_cm // seq_len) % seq_len
                off_c_sl2 = offs_cm % seq_len
                off_c_dim = offs_cn

                # TODO update the mask_c so we don't IMA
                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
                # c_offsets = offs_cm[:, None] * stride_c2 + offs_cn[None, :] * stride_c3
                c_mask = d_mask

                c1_ptrs = c1_ptr + c_offsets
                c2_ptrs = c2_ptr + c_offsets
                # c1_ptrs = c1_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
                # c2_ptrs = c2_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]

                # One mask value per flattened (b, i, j) output row.
                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))

                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
                # Apply masking only to left_proj and right_proj results (C1, C2)
                accumulator1 = tl.where(mask_2d, accumulator1, 0)
                accumulator2 = tl.where(mask_2d, accumulator2, 0)

                # Apply sigmoid to gate values
                left_gate_sigmoid = triton_sigmoid(accumulator3)
                right_gate_sigmoid = triton_sigmoid(accumulator4)
                accumulator_d = triton_sigmoid(accumulator_d)

                # Apply elementwise multiplication with gated values
                # C1 = left * left_gate, C2 = right * right_gate
                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate

                # Convert to appropriate output dtype and store with normal tl.store
                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
                d = accumulator_d.to(d_ptr.dtype.element_ty)

                tl.store(c1_ptrs, c1, mask=c_mask)
                tl.store(c2_ptrs, c2, mask=c_mask)
                tl.store(d_ptrs, d, mask=d_mask)

    # NOTE(review): get_device_capability()[0] is an int major version, so it
    # can never equal 10.2 — as written, capability-10 devices DO get the
    # autotuner applied here even though two_mm's capability-10 launch path
    # passes explicit BLOCK_* meta-parameters.  Confirm whether `10` was
    # intended in this list.
    if torch.cuda.get_device_capability()[0] not in [9, 10.2]:
        two_mm_kernel = triton.autotune(
            (two_mm_kernel_configs_wrapper())(), key=["M", "N", "K"]
        )(two_mm_kernel)

    return two_mm_kernel
568
+
569
+
570
def two_mm(A, left_proj, right_proj, left_gate, right_gate, out_gate, mask):
    """
    Persistent matrix multiplication for all weight matrices using on-device TMA descriptors.

    Args:
        A: [..., K] tensor (arbitrary leading dimensions)
        left_proj: [N, K] matrix (will be transposed)
        right_proj: [N, K] matrix (will be transposed)
        left_gate: [N, K] left gate weight matrix
        right_gate: [N, K] right gate weight matrix
        out_gate: [N, K] output gate weight matrix
        mask: mask tensor

    Returns:
        (C1, C2, D): Tuple of result tensors [..., N] with same leading dims as A
        C1 = (A @ left_proj.T) * sigmoid(A @ left_gate.T) (masked)
        C2 = (A @ right_proj.T) * sigmoid(A @ right_gate.T) (masked)
        D = sigmoid(A @ out_gate.T) (unmasked)
    """
    # Check constraints
    assert A.shape[-1] == left_proj.shape[1] == right_proj.shape[1], "Incompatible K dimensions"
    assert A.dtype == left_proj.dtype == right_proj.dtype, "Incompatible dtypes"

    # Assert that all weight matrices have the same strides (same [N, K] shape)
    assert left_proj.stride() == right_proj.stride() == left_gate.stride() == right_gate.stride() == out_gate.stride(), \
        "All weight matrices must have identical strides"

    # Get dimensions
    original_shape = A.shape[:-1]  # All dimensions except the last
    K = A.shape[-1]
    N = left_proj.shape[0]
    B, seq_len, _, _ = A.shape
    # NOTE(review): `dtype` is captured but only referenced by the
    # commented-out allocations below; outputs are hard-coded to float16.
    dtype = A.dtype

    # Flatten A to 2D for kernel processing
    A_2d = A.view(-1, K)  # [M, K] where M is product of all leading dims
    M = A_2d.shape[0]

    # Get number of streaming multiprocessors
    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count

    # Launch persistent kernel with limited number of blocks
    grid = lambda META: (min(NUM_SMS, triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"])),)

    # Get original 4D strides for A and output tensors
    A_strides = A.stride()  # (stride_0, stride_1, stride_2, stride_3)

    # Create output tensors with proper 4D shape to get correct strides
    output_shape = original_shape + (N,)
    # C1 = torch.empty(output_shape, device=A.device, dtype=dtype)
    # C2 = torch.empty(output_shape, device=A.device, dtype=dtype)
    # C1/C2 are allocated [B, N, S, S] and permuted back to [B, S, S, N], so
    # their memory layout is channels-first while their logical shape matches
    # the other tensors — presumably for faster downstream reads; confirm
    # against the consumers of C1/C2.
    C1 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
    C2 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
    D = torch.empty(output_shape, device=A.device, dtype=torch.float16)

    C_strides = C1.stride()  # (stride_0, stride_1, stride_2, stride_3)
    D_strides = D.stride()  # (stride_0, stride_1, stride_2, stride_3)

    # Use optimal configuration for B200/H100 or fallback to autotuning for other GPUs
    if torch.cuda.get_device_capability()[0] == 10:
        # Get optimal configuration for B200
        # NOTE(review): the capability-10 selector in
        # two_mm_kernel_configs_wrapper is disabled (`and False`), so the
        # wrapper returns a zero-argument config builder and this 3-argument
        # call would raise TypeError on such devices; the dedicated B200
        # kernel module is presumably used instead of this global path.
        BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
        grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))

        two_mm_kernel_wrapper()[(grid_size,)](
            A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
            C1, C2, D, mask,
            M, N, K,
            *A_strides,  # 4D strides for A
            left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] shape strides
            *C_strides,  # 4D strides for C
            seq_len,
            *D_strides,  # 4D strides for D
            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
            num_stages=num_stages, num_warps=num_warps
        )
    elif torch.cuda.get_device_capability()[0] == 9:
        # Get optimal configuration for H100
        # NOTE(review): this call passes (B, seq_len, K) and unpacks a
        # 5-tuple, i.e. it expects the wrapper to return the per-shape
        # selector get_optimal_two_mm_config_h100 on capability 9 — confirm
        # the wrapper actually returns it (and not the zero-argument
        # autotune-config builder) on this branch.
        BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
        grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))

        two_mm_kernel_wrapper()[(grid_size,)](
            A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
            C1, C2, D, mask,
            M, N, K,
            *A_strides,  # 4D strides for A
            left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] shape strides
            *C_strides,  # 4D strides for C
            seq_len,
            *D_strides,  # 4D strides for D
            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
            num_stages=num_stages, num_warps=num_warps
        )
    else:
        # Use autotuning for other GPUs
        two_mm_kernel_wrapper()[grid](
            A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
            C1, C2, D, mask,
            M, N, K,
            *A_strides,  # 4D strides for A
            left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] shape strides
            *C_strides,  # 4D strides for C
            seq_len,
            *D_strides,  # 4D strides for D
            NUM_SMS=NUM_SMS
        )

    return C1, C2, D
678
+
679
+
680
def second_layernorm_mul(inp, hidden_dim, weight, bias, mul_operand):
    """Apply LayerNorm over the last `hidden_dim` channels of `inp`, then gate.

    The affine parameters are cast to `inp`'s dtype first, so mixed-precision
    weights do not force an upcast of the activation tensor.

    Args:
        inp: Activation tensor whose last dimension is `hidden_dim`.
        hidden_dim: Size of the normalized (last) dimension.
        weight, bias: LayerNorm affine parameters (any floating dtype).
        mul_operand: Gate tensor, broadcast-multiplied with the normed output.

    Returns:
        Normalized and gated tensor with the same shape as `inp`.
    """
    w = weight.to(inp.dtype)
    b = bias.to(inp.dtype)
    normed = torch.nn.functional.layer_norm(
        inp, (hidden_dim,), weight=w, bias=b, eps=1e-5
    )
    return normed * mul_operand
684
+
685
+ '''
686
+ @triton.autotune(
687
+ [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=4, num_stages=3)],
688
+ key=["R", "C"]
689
+ )
690
+ '''
691
# LayerNorm over the last (channel) dimension for a flattened [R, C] view.
# Each program normalizes ROW_BLOCK_SIZE rows; the whole channel dimension
# (C, padded to BLOCK_SIZE) is held in registers, so BLOCK_SIZE must cover C.
# NOTE(review): addressing assumes X and Y are row-major contiguous with a
# row stride of exactly C — no strides are passed in; confirm at call sites.
@triton.jit
def layernorm_kernel_first(
    X,      # *in*  [R, C] activations
    Y,      # *out* [R, C] normalized activations
    Weight, # [C] affine scale
    Bias,   # [C] affine shift
    R,
    C,  # aka "dim"
    eps,
    ROW_BLOCK_SIZE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,  # next power of two >= C
):
    # Rows handled by this program and the full (padded) column range.
    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
    cols = tl.arange(0, BLOCK_SIZE)

    mask_row = row < R
    mask_col = cols < C

    # Simple indexing for contiguous data
    x = tl.load(
        X + row[:, None] * C + cols[None, :],
        mask=mask_row[:, None] & mask_col[None, :],
        other=0.0
    ).to(tl.float32)

    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)

    # Statistics in fp32; masked lanes were loaded as 0 so the sums are exact.
    mean = tl.sum(x, axis=1) / C
    # Re-mask the centered values so padded columns do not pollute the variance.
    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
    var = tl.sum(diff * diff, axis=1) / C
    rstd = 1 / tl.sqrt(var + eps)

    y_hat = (x - mean[:, None]) * rstd[:, None]
    y = y_hat * weight[None, :] + bias[None, :]

    # Store (dtype conversion to Y's element type happens implicitly).
    tl.store(
        Y + row[:, None] * C + cols[None, :],
        y,
        mask=mask_row[:, None] & mask_col[None, :]
    )
732
+
733
+
734
def get_optimal_config_ln(dim):
    """Pick a (ROW_BLOCK_SIZE, num_warps) launch config for the LN kernel.

    Tuned thresholds exist only for Hopper (compute capability 9.x); every
    other architecture — and any `dim` above 1024 — falls back to (16, 4).

    Args:
        dim: Channel count being normalized.

    Returns:
        Tuple `(ROW_BLOCK_SIZE, num_warps)`.
    """
    if torch.cuda.get_device_capability()[0] == 9:
        # (upper dim bound, num_warps) pairs, checked in ascending order.
        for limit, warps in ((256, 1), (512, 2), (1024, 4)):
            if dim <= limit:
                return (16, warps)
    return (16, 4)
747
+
748
+
749
def triton_layernorm_first(x, weight, bias, eps=1e-5, num_warps=None, ROW_BLOCK_SIZE=None):
    """LayerNorm the last dim of a [B, S, S, dim] tensor, producing float16.

    The tensor is treated as `B*S*S` independent rows of length `dim` and
    handed to `layernorm_kernel_first`. `num_warps` / `ROW_BLOCK_SIZE` may be
    forced by the caller; otherwise a per-architecture heuristic picks them.
    """
    batch, rows, cols, dim = x.shape
    assert(rows == cols)

    total_rows = batch * rows * cols

    out = torch.empty_like(x, dtype=torch.float16)

    if not num_warps or not ROW_BLOCK_SIZE:
        ROW_BLOCK_SIZE, num_warps = get_optimal_config_ln(dim)

    # The kernel keeps a whole row in registers, so dim must fit one block.
    block = triton.next_power_of_2(dim)
    assert(block <= 1024)

    grid = lambda meta: (triton.cdiv(total_rows, meta["ROW_BLOCK_SIZE"]),)

    layernorm_kernel_first[grid](
        x, out, weight, bias,
        total_rows, dim, eps,
        ROW_BLOCK_SIZE=ROW_BLOCK_SIZE,
        BLOCK_SIZE=block,
        num_warps=num_warps,
        num_stages=3,
    )

    return out
777
+
778
+ '''
779
+ def triton_layernorm_first(x, weight, bias, eps=1e-5):
780
+ B, seq_len, seq_len2, dim = x.shape
781
+ assert(seq_len == seq_len2)
782
+
783
+ R = B * seq_len * seq_len
784
+ C = dim
785
+
786
+ out = torch.empty_like(x)
787
+
788
+ BLOCK_SIZE = triton.next_power_of_2(C)
789
+ assert(BLOCK_SIZE <= 1024)
790
+
791
+ def grid(meta):
792
+ return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
793
+
794
+ layernorm_kernel_first[grid](
795
+ x, out, weight, bias,
796
+ R, C, eps,
797
+ BLOCK_SIZE=BLOCK_SIZE
798
+ )
799
+
800
+ return out
801
+ '''
802
+
803
+
804
# Fused LayerNorm + out-gate multiply over a channel-major input.
# X is addressed through (stride_batch, stride_dim): within one batch, the
# flattened (seq, seq) position advances by 1 element and a channel step is
# stride_dim — i.e. the host passes a [B, S, S, C] tensor whose channel axis
# is outermost in memory (the launcher asserts x.stride(3) == S*S).
# OutGate and Y are plain row-major [R, C].
@triton.autotune(
    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=1, num_stages=3)],
    key=[]
)
@triton.jit
def layernorm_kernel_eltwise(
    X,         # *in*  strided activations (see layout note above)
    Y,         # *out* [R, C] contiguous, normed * gate
    Weight,    # [C] LayerNorm scale
    Bias,      # [C] LayerNorm shift
    OutGate,   # [R, C] contiguous multiplicative gate
    seq_len,
    stride_batch,
    stride_dim,
    R,
    C,  # aka "dim"
    eps,
    ROW_BLOCK_SIZE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,  # next power of two >= C
):
    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
    cols = tl.arange(0, BLOCK_SIZE)

    # Calculate base pointer for this batch of rows.
    # A row block must never straddle a batch boundary, otherwise the single
    # `batch` index below would be wrong for part of the block.
    tl.device_assert(seq_len*seq_len % ROW_BLOCK_SIZE == 0)
    # batch_offset = (row // (stride_seq1 // stride_dim)) * stride_batch
    batch = tl.program_id(0) * ROW_BLOCK_SIZE // (seq_len * seq_len)
    seqs_off = row % (seq_len * seq_len) # TODO is this going to prevent vectorization

    # Within a batch, consecutive (i, j) positions are 1 element apart.
    off_r = batch * stride_batch + seqs_off
    off_c = cols * stride_dim

    mask_row = row < R
    mask_col = cols < C

    # Gate is stored row-major [R, C]; no `other=` — masked lanes are never used
    # for stores, but NOTE(review): an explicit other=0.0 would be safer.
    out_gate = tl.load(
        OutGate + row[:, None] * C + cols[None, :],
        mask = mask_row[:, None] & mask_col[None, :],
    )

    x = tl.load(
        X + off_r[:, None] + off_c[None, :],
        mask=mask_row[:, None] & mask_col[None, :],
        other=0.0
    ).to(tl.float32)

    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)

    # Masked lanes loaded as 0, so plain sums give exact statistics.
    mean = tl.sum(x, axis=1) / C
    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
    var = tl.sum(diff * diff, axis=1) / C
    rstd = 1 / tl.sqrt(var + eps)

    y_hat = (x - mean[:, None]) * rstd[:, None]
    y = y_hat * weight[None, :] + bias[None, :]

    # Fused epilogue: multiply by the (already sigmoided) out gate and
    # write the result contiguously.
    tl.store(
        Y + row[:, None] * C + cols[None, :],
        y * out_gate,
        mask=mask_row[:, None] & mask_col[None, :]
    )
866
+
867
+
868
def triton_layernorm_eltwise(x, weight, bias, out_gate, eps=1e-5):
    """Fused LayerNorm + out-gate multiply for a channel-major [B, S, S, C] tensor.

    `x` must have its channel axis outermost in memory (stride(3) == S*S),
    which is the layout produced by the preceding bmm + permute. The gate is
    a contiguous tensor of the logical shape; the result is float32 with the
    gate's layout.
    """
    batch, rows, cols, dim = x.shape
    assert(rows == cols)
    total_rows = batch * rows * cols
    # The kernel hard-codes the channel-major addressing scheme.
    assert(x.stride(3) == rows*cols)
    assert(out_gate.is_contiguous())

    out = torch.empty_like(out_gate, dtype=torch.float32)

    block = triton.next_power_of_2(dim)
    # Only the hidden_dim == 128 case is supported/tuned.
    assert(block == 128)

    grid = lambda meta: (triton.cdiv(total_rows, meta["ROW_BLOCK_SIZE"]),)

    layernorm_kernel_eltwise[grid](
        x, out, weight, bias, out_gate,
        rows,
        x.stride(0), x.stride(3),
        total_rows, dim, eps,
        BLOCK_SIZE=block,
    )

    return out
893
+
894
+
895
def kernel_global(data: input_t) -> output_t:
    """TriMul forward pass assembled from fused Triton kernels.

    Pipeline: input LayerNorm -> fused projections/gates/mask (two_mm) ->
    pairwise einsum realized as a batched matmul -> fused output LayerNorm *
    out-gate -> final linear projection.

    Args:
        data: Tuple (input, mask, weights, config) where
            - input: [batch_size, seq_len, seq_len, dim]
            - mask:  [batch_size, seq_len, seq_len]
            - weights: dict of model parameter tensors
            - config: dict of model hyperparameters

    Returns:
        Output tensor of shape [batch_size, seq_len, seq_len, dim].
    """
    input_tensor, mask, weights, config = data

    # All projection/gate weights participate in fp16 matmuls.
    left_proj_weight = weights["left_proj.weight"].to(torch.float16)
    right_proj_weight = weights["right_proj.weight"].to(torch.float16)
    left_gate_weight = weights["left_gate.weight"].to(torch.float16)
    right_gate_weight = weights["right_gate.weight"].to(torch.float16)
    out_gate_weight = weights["out_gate.weight"].to(torch.float16)

    hidden_dim = config["hidden_dim"]

    batch_size, seq_len, _, dim = input_tensor.shape

    # Input LayerNorm (fp16 output) via the Triton kernel.
    normed = triton_layernorm_first(
        input_tensor, weights['norm.weight'], weights['norm.bias']
    )

    # Fused projections: sigmoid gating and masking happen inside the kernel,
    # so `left`/`right` come back already gated and masked.
    left, right, out_gate = two_mm(
        normed, left_proj_weight, right_proj_weight,
        left_gate_weight, right_gate_weight, out_gate_weight, mask,
    )

    # einsum('... i k d, ... j k d -> ... i j d', left, right) expressed as a
    # single batched matmul over (batch * hidden) slices.
    pair = torch.bmm(
        left.permute(0, 3, 1, 2).view(-1, left.shape[1], left.shape[2]),
        right.permute(0, 3, 2, 1).view(-1, right.shape[2], right.shape[1]),
    )
    pair = pair.view(batch_size, hidden_dim, seq_len, seq_len).permute(0, 2, 3, 1)

    # Output LayerNorm fused with the out-gate multiply.
    gated = triton_layernorm_eltwise(
        pair, weights['to_out_norm.weight'], weights['to_out_norm.bias'], out_gate
    )

    return torch.nn.functional.linear(gated, weights['to_out.weight'])
954
+
955
+ '''
956
+ # Fill in the given weights of the model
957
+ trimul.norm.weight = nn.Parameter(weights['norm.weight'])
958
+ trimul.norm.bias = nn.Parameter(weights['norm.bias'])
959
+ trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
960
+ trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
961
+ trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
962
+ trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
963
+ trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
964
+ trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
965
+ trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'])
966
+ trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
967
+
968
+ output = trimul(input_tensor, mask)
969
+
970
+ return output
971
+ '''
build/torch-cuda/trimul_gpumode/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Load and execute a Python source file as a uniquely-named module.

    The module name is derived from the hash of the absolute path so that
    registering it in ``sys.modules`` can never collide with (or shadow) a
    real package name used by other imports.

    Args:
        file_path: Path to the ``.py`` file to load.

    Returns:
        The fully executed module object.

    Raises:
        ImportError: If no import spec or module can be created for the file.
    """
    # `import importlib` alone does not guarantee the `util` submodule is
    # bound as an attribute of the package; import it explicitly so
    # `importlib.util.*` below cannot raise AttributeError.
    import importlib.util

    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    module = importlib.util.module_from_spec(spec)
    if module is None:
        raise ImportError(f"Cannot load module {module_name} from spec")
    # Register before executing so the module can import itself recursively.
    sys.modules[module_name] = module
    spec.loader.exec_module(module)  # type: ignore
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch-cuda/trimul_mi300.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ torch.backends.cuda.matmul.allow_tf32 = True
7
+ torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
# Mega-fused first stage: one kernel performs
#   1. LayerNorm statistics for a BLOCK_SIZE_M chunk of rows (two passes),
#   2. normed-X @ W_4way, where W_4way interleaves the left_proj / left_gate /
#      right_proj / right_gate weights 4-wide along N,
#   3. for tiles whose N-offset lies inside H, normed-X @ W_og as well,
#   4. sigmoid gating + mask, then scatter of the gated left result into a
#      (bs, H, s1, s2) layout and the gated right result into its transposed
#      (bs, H, s2, s1) layout, plus the sigmoided out-gate into (M, H).
# NOTE(review): autotune key lists 'N', but the kernel has no argument named
# N (only M, H, K, s1, s2) — confirm the installed Triton tolerates this.
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),

        # Configurations with larger block sizes for better data reuse
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),

        # Configurations with deeper K dimension
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),

        # More extreme configurations to test the limits
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=2),

        # Configurations with fewer warps
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=2),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
    ],
    key=['M', 'N', 'K'],
)
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (from decorator and kwargs)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    # --- PID Mapping: Based on the LARGER 4*H problem ---
    # Grouped ordering keeps GROUP_SIZE_M M-tiles together for L2 reuse.
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    # NOTE(review): these two statistics loops address X without multiplying
    # by stride_x_k, i.e. they assume the K axis is unit-stride — confirm
    # callers always pass a contiguous [M, K] view.
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # Pass 1: mean per row, accumulated K-chunk by K-chunk.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    # Pass 2: variance per row.
    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        # Normalize the X tile on the fly — the normed activations are never
        # materialized in global memory.
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Some programs additionally compute the out_gate projection: only the
        # tiles whose N-offset falls inside [0, H) cover the (M, H) gate output.
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Epilogue for the out-gate: sigmoid, then store into the (M, H) buffer.
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # The accumulator columns repeat (proj_l, gate_l, proj_r, gate_r) every 4
    # entries (W_4way is interleaved); split them out via masked reductions.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    # This tile covers H_CHUNK_SIZE hidden units (BLOCK_SIZE_N == 4 * H_CHUNK_SIZE).
    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Gating + masking fused into the epilogue.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose the flat row index into (batch, s1, s2) for the scattered
    # (bs, H, s1, s2) stores.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1) # s2 offset uses s2 stride, s1 offset uses s1 stride
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
163
+
164
# Batched matmul over (batch, head) slices: for each (b, h) it computes
# Left[b, h] @ Right[b, h]^T-layout, i.e. out[b, h, i, j] = sum_k
# Left[b,h,i,k] * Right[b,h,k,j] using the caller's transposed right buffer.
# Grid axis 0 tiles the (s1 x s1) output with grouped ordering; axis 1 is
# one program per (batch, head) pair.
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
    ],
    key=['s1', 's2', 'H'],
)
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Grid and program IDs — grouped M-ordering within the (s1 x s1) tiles.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Second grid axis enumerates (batch, head) pairs.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # fp32 accumulation even though inputs are fp16.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Reduce over the shared s2 axis in BLOCK_SIZE_K chunks.
    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # --- Coalesced Write ---
    # Write to a standard (bs, H, s1, s1) layout
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
236
+
237
# Final fused stage: per output row it (1) computes LayerNorm statistics over
# the H channels of the bmm result in a single sum/sum-of-squares pass,
# (2) normalizes + applies the affine params, multiplies by the precomputed
# out-gate, and (3) projects to the model dim D in one matmul — all without
# materializing intermediates.
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
    ],
    key=['H', 'D'],
)
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1, # M_gate = bs*s1*s2
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # --- Grid and PID Setup for Matmul ---
    # Grouped ordering over the (M x D) output tiles.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose M back to (b, r, c) for reordering lookups — the input is
    # stored channel-major (bs, H, s1, s1) while rows here are (b, r, c)-flat.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # Pass 1 (single sweep): accumulate sum and sum of squares per row.
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)

        # Accumulate sum and sum of squares in one pass
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    # Finalize statistics (E[x^2] - E[x]^2 formulation).
    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # --- Pass 2: Fused Gating and Matmul ---
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        # Re-load the input chunk, normalize + affine.
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]

        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]

        proj_ptrs = ProjW_ptr + \
                    offs_n[None, :] * stride_proj_d + \
                    offs_k[:, None] * stride_proj_h

        # Multiply by the out-gate before projecting.
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate

        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # --- Store Final Output ---
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d

    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
340
+
341
def compiledtrimul_fused_interleaved(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,  # Use the new weight matrices
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Three-kernel TriMul pipeline using the interleaved [K, 4*H] weights.

    Stage 1 (`fused_ln_dual_matmul_kernel`): LayerNorm + all five projections,
    gating and masking, writing left/right in bmm-friendly layouts plus the
    sigmoided out-gate.
    Stage 2 (`bmm_coalesced_kernel`): batched per-(batch, head) matmul.
    Stage 3 (`fused_final_kernel`): output LayerNorm, out-gate multiply and
    final projection in one pass.

    Args:
        x: Input of shape [bs, s1, s2, d].
        mask_mh: Mask laid out as [bs*s1*s2, H] rows — TODO confirm; only its
            two strides are passed through to the kernel.
        W_4way: Interleaved projection/gate weights, [K, 4*H] (see
            `pack_w_4way_efficient`).
        W_og: Out-gate weights, [K, H].
        h: Hidden dimension H.

    Returns:
        Output tensor of shape [bs, s1, s1, d] in float16.
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # Stage-1 outputs: left in (bs, H, s1, s2), right pre-transposed to
    # (bs, H, s2, s1) so stage 2 can read both coalesced.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # The grid is launched for the larger 4*H problem
    N_4way = 4 * H
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
    fused_ln_dual_matmul_kernel[grid](
        # Pointers (9)
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        # Metadata (5) - M, H, K, s1, s2
        M, H, K, s1, s2,
        # Strides (16)
        x_flat.stride(0), x_flat.stride(1),
        W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1),
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        # Constexpr (1)
        LN_EPS=1e-5
    )

    # Stage-2 output: standard (bs, H, s1, s1) layout for coalesced stores.
    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)

    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
    bmm_coalesced_kernel[grid_bmm](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
    )

    # --- Kernel 3: Fully Fused Final Stage ---
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)

    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
    fused_final_kernel[grid_final](
        # Pointers
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        # Metadata
        M, H, d, s1,
        # Strides
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        to_out_weight.stride(0), to_out_weight.stride(1), # Use strides of the corrected tensor
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        # Constants
        LN_EPS=1e-5,
    )

    return final_out
412
+
413
def pack_w_4way_efficient(weights):
    """ Packs L, LG, R, RG into a tight [K, 4*H] matrix. """
    role_keys = (
        'left_proj.weight',
        'left_gate.weight',
        'right_proj.weight',
        'right_gate.weight',
    )
    mats = [weights[key] for key in role_keys]
    hidden, in_dim = mats[0].shape
    # Interleave per hidden channel: row layout becomes
    # [L[0], LG[0], R[0], RG[0], L[1], ...] before the transpose.
    interleaved = torch.stack(mats, dim=1)          # [H, 4, K]
    packed = interleaved.reshape(4 * hidden, in_dim)
    return packed.t().to(torch.float16)
423
+
424
def get_w_og(weights):
    """ Gets the transposed [K, H] out_gate weight matrix. """
    out_gate = weights['out_gate.weight']  # stored as [H, K]
    return out_gate.to(torch.float16).t()
428
+
429
def compiledtrimul(
    x: torch.Tensor,
    mask: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    w_concat: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int
) -> torch.Tensor:
    """
    A barebones, compiled PyTorch function for the TriMul logic.

    Args:
        x: input of shape (bs, s1, s2, d); the pipeline assumes a square
           pair representation, i.e. s1 == s2.
        mask: (bs, s1, s2, 1) multiplicative mask.
        norm_weight/norm_bias: input LayerNorm affine params, shape (d,).
        w_concat: (d, 5*h) fp16 matrix holding [left, right, left_gate,
           right_gate, out_gate] projections column-concatenated.
        to_out_norm_weight/to_out_norm_bias: output LayerNorm params, (h,).
        to_out_weight: (d, h) fp16 final projection.
        h: hidden dimension of each projection.

    Returns:
        fp16 tensor of shape (bs, s1, s1, d).
    """
    bs, s1, s2, d = x.shape

    # Initial LayerNorm, then flatten rows for one large fp16 matmul.
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
    all_projections = torch.mm(x_norm, w_concat)

    # Split back into individual projections
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)

    # Apply mask and gates
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)

    # Reshape to (bs, h, s1, s2) so the pair contraction is a batched matmul
    # over the s2 axis: result is (bs, h, s1, s1).
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)

    # Apply layer norm and final gating
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate

    # Final projection
    final_out_flat = gated @ to_out_weight.t()
    # BUGFIX: the flat tensor has bs*s1*s1*d elements (pair matmul yields an
    # s1 x s1 grid), so the view must use s1 twice — previously this was
    # view(bs, s1, s2, d), which only worked by accident when s1 == s2 and
    # disagreed with the sibling A100 implementation.
    final_out = final_out_flat.view(bs, s1, s1, d)

    return final_out
474
+
475
def small_kernel_pt_path(data):
    """Plain-PyTorch fallback path for small problem sizes.

    Concatenates the five projection weights into one (d, 5h) fp16 matrix
    and dispatches to `compiledtrimul`.
    """
    input_tensor, mask, weights, config = data

    # Column order must match the chunk(5) split inside compiledtrimul.
    projection_order = (
        'left_proj.weight',
        'right_proj.weight',
        'left_gate.weight',
        'right_gate.weight',
        'out_gate.weight',
    )
    w_concat = torch.cat([weights[name] for name in projection_order], dim=0)
    w_concat = w_concat.t().contiguous().to(torch.float16)

    fp32, fp16 = torch.float32, torch.float16
    return compiledtrimul(
        x=input_tensor.to(fp32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(fp32),
        norm_bias=weights['norm.bias'].to(fp32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(fp16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(fp16),
        to_out_weight=weights['to_out.weight'].to(fp16),
        h=config["hidden_dim"],
    )
497
+
498
def kernel_mi300(data):
    """MI300 entry point: Triton pipeline for large inputs, PyTorch fallback
    for small ones."""
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # Small sequences are faster on the plain compiled-PyTorch path.
    if s1 < 100:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    rows = bs * s1 * s2
    # Pre-broadcast the mask to (rows, hidden) fp16 for the fused kernel
    # (could be moved into the kernel eventually).
    mask_mh = (mask.unsqueeze(-1)
                   .expand(-1, -1, -1, hidden)
                   .reshape(rows, hidden)
                   .to(torch.float16))

    return compiledtrimul_fused_interleaved(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_mh,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=pack_w_4way_efficient(weights),
        W_og=get_w_og(weights),
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-cuda/triton_a100.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ # Set PyTorch flags for performance
7
+ torch.backends.cuda.matmul.allow_tf32 = True
8
+ torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
9
+
10
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (now passed as arguments from the host)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    # Fuses: per-row LayerNorm over K, the [M,K]@[K,4H] projection matmul
    # (interleaved left/left-gate/right/right-gate columns), the optional
    # [M,K]@[K,H] out-gate matmul, sigmoid gating, masking, and the
    # scatter-store of left / transposed-right projections.
    # NOTE(review): correct deinterleaving requires BLOCK_SIZE_N == 4 * H_CHUNK_SIZE
    # (the reshape below assumes columns alternate roles every 4) — confirm
    # against the launch configs.

    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # Two-pass mean/variance over the K (feature) axis.
    # NOTE(review): these two loops index columns WITHOUT stride_x_k (the
    # matmul loop below does use it) — this assumes stride_x_k == 1, i.e. a
    # contiguous x_flat; TODO confirm.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        # Normalize the x tile in registers before feeding the tensor cores.
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Only the first H worth of column-blocks also compute the out-gate
        # matmul, reusing the already-normalized x tile.
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Store sigmoid(out_gate) for the blocks that computed it.
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # Columns are interleaved [L, LG, R, RG] per hidden channel; split them
    # with a masked-sum reduction over the role axis.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Gate and mask the projections.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose flat row index into (batch, s1, s2) coordinates for the
    # 4-D scatter stores; right is written transposed (s2, s1).
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
135
+
136
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Batched matmul over the s2 axis: for each (batch, hidden) pair on grid
    # axis 1, computes Out[b,h] = Left[b,h] (s1 x s2) @ Right[b,h]^T.
    # Right is supplied pre-transposed (s2 x s1) so both loads are along
    # their fastest dimension; output tile is (s1 x s1), fp32 accumulation.

    # Grouped-ordering PID mapping over the (s1 x s1) output tiles for
    # better L2 reuse.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Second grid axis enumerates (batch, hidden-channel) pairs.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Reduce along the shared s2 dimension in BLOCK_SIZE_K steps.
    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
        accumulator += tl.dot(a, b)

    # Store the (possibly partial) output tile; accumulator is fp32 and is
    # downcast implicitly to the output tensor's dtype by the store.
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
187
+
188
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Final fused stage: per-row LayerNorm over the H channels of the BMM
    # output, multiply by the precomputed sigmoid out-gate, then project
    # [M, H] @ [H, D] into the output tensor.
    # NOTE(review): rows are decomposed assuming an s1 x s1 grid, while M is
    # passed as bs*s1*s2 by the host — consistent only when s1 == s2.

    # Grouped-ordering PID mapping over the (M x D) output tiles.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose flat row index into (batch, row, col) of the s1 x s1 grid.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # Single-pass moments: E[x] and E[x^2] over the H axis.
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    mean = sum_x / H
    # Var = E[x^2] - E[x]^2 (single-pass; less numerically robust than the
    # two-pass form but cheap — inputs are LN-scale values).
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # Matmul loop: normalize + gate each H-chunk in registers, then dot with
    # the corresponding slice of the projection weight.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]
        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate
        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        # Cast the gated activations to the weight dtype (fp16) for tl.dot.
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # Scatter-store the (M x D) tile into the 4-D output.
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
261
+
262
def compiledtrimul_fused_interleaved_final(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Three-kernel fused TriMul pipeline with hand-tuned A100 launch configs.

    Stage 1: fused LayerNorm + interleaved 4-way projection matmul + out-gate
    matmul. Stage 2: batched (bs*H) pair matmul. Stage 3: fused output
    LayerNorm + gating + final projection.

    Block sizes, warps and stages below are hardcoded from A100 autotuning
    runs (see inline config comments).

    Returns a float16 tensor of shape (bs, s1, s1, d); assumes s1 == s2
    (square pair representation).
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # Right projection is written pre-transposed for coalesced stage-2 loads.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # --- Kernel 1: Fused LN + Dual Matmul ---
    N_4way = 4 * H
    # Hardcoded A100 best config: M128-N128-K32-GM8-HC32-W8-S2
    config_k1 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
    grid_k1 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)

    fused_ln_dual_matmul_kernel[grid_k1](
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        M, H, K, s1, s2,
        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        LN_EPS=1e-5, **config_k1, num_warps=8, num_stages=2
    )

    # --- Kernel 2: Batched Matrix Multiplication ---
    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
    # Hardcoded A100 best config: M128-N64-K32-GM8-W4-S3
    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_k2 = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)

    bmm_coalesced_kernel[grid_k2](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        **config_k2, num_warps=4, num_stages=3
    )

    # --- Kernel 3: Fully Fused Final Stage ---
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
    # Hardcoded A100 best config: M32-N128-K32-GM8-W4-S3
    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    # NOTE(review): grid uses M = bs*s1*s2 while the output has bs*s1*s1
    # rows — equal only under the s1 == s2 assumption above.
    grid_k3 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)

    fused_final_kernel[grid_k3](
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        M, H, d, s1,
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
    )
    return final_out
330
+
331
def pack_w_4way_efficient(weights):
    """ Packs L, LG, R, RG into a tight [K, 4*H] matrix. """
    mats = [
        weights['left_proj.weight'],
        weights['left_gate.weight'],
        weights['right_proj.weight'],
        weights['right_gate.weight'],
    ]
    hidden, in_dim = mats[0].shape
    # Stack along a new role axis per hidden channel -> [H, 4, K], then
    # flatten so rows interleave as L[0], LG[0], R[0], RG[0], L[1], ...
    packed_rows = torch.stack(mats, dim=1).reshape(4 * hidden, in_dim)
    return packed_rows.t().to(torch.float16)
337
+
338
def get_w_og(weights):
    """ Gets the transposed [K, H] out_gate weight matrix. """
    transposed = weights['out_gate.weight'].transpose(0, 1)
    return transposed.to(dtype=torch.float16)
341
+
342
@torch.compile()
def compiledtrimul(
    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor, h: int
) -> torch.Tensor:
    """torch.compile'd TriMul reference path used for small inputs.

    `w_concat` is (d, 5h) fp16 holding [left, right, left_gate, right_gate,
    out_gate] projections column-wise; `mask` is (bs, s1, s2, 1). Assumes
    s1 == s2. Returns an fp16 (bs, s1, s1, d) tensor.
    """
    bs, s1, s2, d = x.shape
    # Input LayerNorm in fp32, then drop to fp16 for the big projection mm.
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    # One [M, d] @ [d, 5h] matmul produces all five projections at once.
    all_projections = torch.mm(x_norm, w_concat)
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
    # Mask + sigmoid gates applied element-wise per row.
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)
    # (bs, h, s1, s2) layout so the pair contraction is a batched matmul.
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
    # Output LayerNorm, gate, then project back to d channels.
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate
    final_out_flat = gated @ to_out_weight.t()
    return final_out_flat.view(bs, s1, s1, d)
364
+
365
def small_kernel_pt_path(data):
    """Dispatch small problems to the compiled plain-PyTorch TriMul path."""
    input_tensor, mask, weights, config = data
    # Column order must match the chunk(5) split inside compiledtrimul.
    names = ('left_proj.weight', 'right_proj.weight', 'left_gate.weight',
             'right_gate.weight', 'out_gate.weight')
    stacked = torch.cat([weights[key] for key in names], dim=0)
    w_concat = stacked.t().contiguous().to(torch.float16)
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
380
+
381
def kernel_a100(data):
    """A100 entry point: Triton pipeline for large inputs, compiled-PyTorch
    fallback below the tuned threshold."""
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # Below this size the hardcoded BMM configs don't pay off.
    if s1 < 512:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    rows = bs * s1 * s2
    mask_mh = (mask.unsqueeze(-1)
                   .expand(-1, -1, -1, hidden)
                   .reshape(rows, hidden)
                   .to(torch.float16))

    return compiledtrimul_fused_interleaved_final(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_mh,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=pack_w_4way_efficient(weights),
        W_og=get_w_og(weights),
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-cuda/triton_b200.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ torch.backends.cuda.matmul.allow_tf32 = True
7
+ torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (now passed as arguments from the host)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    # B200 copy of the fused stage-1 kernel: per-row LayerNorm over K, the
    # interleaved [M,K]@[K,4H] projection matmul, the optional [M,K]@[K,H]
    # out-gate matmul, sigmoid gating, masking, and scatter-stores of the
    # left / transposed-right projections.
    # NOTE(review): correct deinterleaving requires
    # BLOCK_SIZE_N == 4 * H_CHUNK_SIZE — confirm against launch configs.

    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # Two-pass mean/variance over the K axis.
    # NOTE(review): these loops omit stride_x_k (the matmul loop uses it),
    # implicitly assuming a contiguous x_flat — TODO confirm.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        # Normalize the x tile in registers before feeding tl.dot.
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Only the first H worth of column-blocks also calculate the
        # out_gate matmul, reusing the normalized x tile.
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Store sigmoid(out_gate) for the blocks that computed it.
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # Columns interleave [L, LG, R, RG] per hidden channel; split them with
    # masked-sum reductions over the role axis.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Gate and mask the projections.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose flat row index into (batch, s1, s2) for the 4-D scatter
    # stores; right is written transposed (s2, s1).
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
135
+
136
# Batched (bs*H independent problems) tiled matmul: Out[b,h] = Left[b,h] @ Right[b,h],
# where Right is supplied pre-transposed (strides named s2-major) so both operands
# are read along their fast axis. Fixed-config variant (no autotune).
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Grid and program IDs.
    # Axis 0 tiles the s1 x s1 output; grouped ordering (GROUP_SIZE_M) improves
    # L2 reuse by keeping neighbouring M tiles together.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Axis 1 enumerates the (batch, head) pairs.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # Accumulate in fp32 for accuracy even though inputs are fp16.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Reduce over the shared s2 dimension in BLOCK_SIZE_K chunks.
    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # Store tile; tl.store casts the fp32 accumulator to the output dtype.
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
192
+
193
# Final TriMul stage, fully fused: per-row LayerNorm over H, sigmoid-gate
# multiply, and projection H -> D, all in one kernel. Fixed-config variant.
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Grouped 1D grid over (M rows) x (D output columns).
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose the flat row index into (batch, row, col) of the s1 x s1 grid.
    # NOTE(review): this assumes M == bs * s1 * s1, i.e. the square (s1 == s2)
    # case — confirm against the caller.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # Pass 1: accumulate sum and sum-of-squares over H for LayerNorm stats.
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    # Biased variance via E[x^2] - E[x]^2 (single-pass; can lose precision for
    # large |mean| relative to spread, matching the original implementation).
    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # Pass 2: re-load input, normalize + affine, multiply by the sigmoid gate
    # (Gate_ptr already holds sigmoid outputs), then project to D via tl.dot.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]
        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
        # ProjW is indexed [D, H] (stride_proj_d on the output dim), i.e. the
        # un-transposed nn.Linear weight layout.
        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate
        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # Store the (row, D-chunk) tile back into the 4D output layout.
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
266
+
267
def compiledtrimul_fused_interleaved_final(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
) -> torch.Tensor:
    """Run the full TriMul forward pass as three Triton kernel launches.

    Pipeline:
      1. fused LayerNorm + interleaved 4-way projection (+ out-gate) kernel,
      2. batched per-head matmul of the gated left/right projections,
      3. fused final LayerNorm + gate + output projection.

    Args:
        x: input of shape [bs, s1, s2, d], read in float32 by kernel 1.
        mask_mh: mask pre-expanded to [bs*s1*s2, h] in fp16.
        W_4way: packed [K, 4*h] interleaved projection weights (see
            pack_w_4way_efficient).
        W_og: out-gate weight transposed to [K, h].
        h: hidden dimension H.

    Returns:
        fp16 tensor of shape [bs, s1, s1, d].

    NOTE(review): kernel 3 is launched with M = bs*s1*s2 but internally
    decomposes rows over an s1 x s1 grid, so this path assumes s1 == s2 —
    confirm against the dispatcher.
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # Intermediate buffers: right is materialized pre-transposed so kernel 2
    # reads both operands along their fast axis.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # --- Kernel 1: Fused LN + Dual Matmul ---
    # The grid is launched for the larger 4*H problem
    N_4way = 4 * H
    # Hardcoded best config from logs: M64-N128-K64-GM8-HC32-W4-S2
    config_k1 = {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)

    fused_ln_dual_matmul_kernel[grid](
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        M, H, K, s1, s2,
        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        LN_EPS=1e-5, **config_k1, num_warps=4, num_stages=2
    )

    # --- Kernel 2: Batched Matrix Multiplication ---
    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
    # Hardcoded best config from logs: M128-N128-K32-GM8-W8-S3
    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)

    bmm_coalesced_kernel[grid_bmm](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        **config_k2, num_warps=8, num_stages=3
    )

    # --- Kernel 3: Fully Fused Final Stage ---
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
    # Hardcoded best config from logs: M32-N128-K32-GM8-W4-S3
    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)

    fused_final_kernel[grid_final](
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        M, H, d, s1,
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
    )
    return final_out
336
+
337
def pack_w_4way_efficient(weights):
    """Interleave the four projection weights into one [K, 4*H] fp16 matrix.

    Column layout of the result: for hidden unit h, columns 4*h .. 4*h+3
    hold left_proj, left_gate, right_proj and right_gate respectively, so a
    single matmul yields all four projections interleaved by role.
    """
    w_left = weights['left_proj.weight']
    w_left_gate = weights['left_gate.weight']
    w_right = weights['right_proj.weight']
    w_right_gate = weights['right_gate.weight']
    hidden, in_dim = w_left.shape
    # Stack along a new "role" axis so rows come out as (h0:L,LG,R,RG, h1:...).
    interleaved = torch.stack([w_left, w_left_gate, w_right, w_right_gate], dim=1)
    packed = interleaved.reshape(4 * hidden, in_dim)
    return packed.t().to(torch.float16)
343
+
344
def get_w_og(weights):
    """Fetch the out-gate projection weight as a [K, H] float16 matrix.

    The stored nn.Linear weight is [H, K]; it is transposed so the fused
    kernel can index it input-dimension first.
    """
    out_gate = weights['out_gate.weight']
    transposed = out_gate.t()
    return transposed.to(torch.float16)
347
+
348
@torch.compile()
def compiledtrimul(
    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor, h: int
) -> torch.Tensor:
    """TriMul forward pass in plain PyTorch, fused by torch.compile.

    w_concat packs all five projections (left, right, left_gate, right_gate,
    out_gate) as a single [d, 5*h] matrix so one mm covers them all; the
    chunk order below must match the caller's concatenation order.
    Returns an fp16 tensor of shape [bs, s1, s1, d].
    """
    bs, s1, s2, d = x.shape
    # Input LayerNorm, then flatten to rows and drop to fp16 for the big mm.
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    all_projections = torch.mm(x_norm, w_concat)
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
    # mask is expected as [bs, s1, s2, 1]; broadcast it across the hidden dim.
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)
    # Move the hidden dim in front of the sequence dims for the batched matmul.
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    # Triangular-multiplication core: contract over s2.
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
    # Final LayerNorm + out-gate + projection back to d.
    # NOTE(review): gating by out_gate ([bs*s1*s2, h]) against rows of shape
    # [bs*s1*s1, h] lines up only when s1 == s2 — confirm with the dispatcher.
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate
    final_out_flat = gated @ to_out_weight.t()
    return final_out_flat.view(bs, s1, s1, d)
370
+
371
def small_kernel_pt_path(data):
    """PyTorch fallback path used for small problem sizes.

    Concatenates the five projection weights into one [d, 5*h] matrix and
    dispatches to the torch.compile'd TriMul implementation. The chunk order
    inside compiledtrimul must match the concatenation order here.
    """
    input_tensor, mask, weights, config = data
    projection_names = (
        'left_proj.weight', 'right_proj.weight', 'left_gate.weight',
        'right_gate.weight', 'out_gate.weight',
    )
    stacked = torch.cat([weights[name] for name in projection_names], dim=0)
    w_concat = stacked.t().contiguous().to(torch.float16)
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
386
+
387
def kernel_b200(data):
    """Entry point: route between the PyTorch fallback and the fused Triton path.

    Small sequence lengths (s1 < 800) go through the torch.compile fallback;
    larger ones use the three-kernel fused Triton pipeline.
    """
    input_tensor, mask, weights, config = data
    batch, seq1, seq2, dim = input_tensor.shape

    if seq1 < 800:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    packed_4way = pack_w_4way_efficient(weights)
    packed_og = get_w_og(weights)
    rows = batch * seq1 * seq2
    # Pre-expand the mask to one fp16 row per (b, s1, s2) position.
    mask_rows = (
        mask.unsqueeze(-1)
        .expand(-1, -1, -1, hidden)
        .reshape(rows, hidden)
        .to(torch.float16)
    )

    return compiledtrimul_fused_interleaved_final(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_rows,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=packed_4way,
        W_og=packed_og,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-cuda/triton_h100.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn.functional as F
import triton
import triton.language as tl

# Opt in to faster matmul paths: TF32 tensor-core math and reduced-precision
# fp16 accumulation. Both trade a small amount of accuracy for throughput.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
# First TriMul stage, fused into one kernel: row LayerNorm over K, matmul
# against the interleaved [K, 4*H] weight (left/left_gate/right/right_gate
# packed 4-per-hidden-unit), optional out-gate matmul, sigmoid gating, mask
# multiply, and transposed scatter of the right projection.
# Every autotune config keeps BLOCK_SIZE_N == 4 * H_CHUNK_SIZE, which the
# de-interleave reshape below relies on.
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),

        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=5),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=5),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=4),
    ],
    key=['M', 'N', 'K'],
)
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (from decorator and kwargs)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm statistics (computed only ONCE per row block) ---
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # Pass 1: mean over K.
    # NOTE(review): these two stat loops offset columns without stride_x_k,
    # unlike the matmul loop below — they implicitly assume stride_x_k == 1
    # (rows contiguous). Confirm callers always pass a contiguous x_flat.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    # Pass 2: biased variance via sum of squared deviations.
    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Out-gate column offsets coincide with the 4-way offsets by construction.
    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        # Normalize + affine on the fly, then drop to fp16 for the tensor cores.
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Programs whose column range also lies inside [0, H) additionally
        # accumulate the out-gate projection, reusing the normalized tile.
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Store sigmoid(out_gate) for the final stage.
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # De-interleave: columns come in (left, left_gate, right, right_gate)
    # groups of 4 per hidden unit; requires BLOCK_SIZE_N == 4 * H_CHUNK_SIZE.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Gate and mask both projections.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose flat row index into (batch, s1, s2) for the 4D scatter.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    # The right projection is written transposed (s2-major) so the following
    # batched matmul can read it contiguously.
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)  # s2 offset uses s2 stride, s1 offset uses s1 stride
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
155
+
156
# Batched (bs*H independent problems) tiled matmul: Out[b,h] = Left[b,h] @ Right[b,h],
# where Right was stored pre-transposed by the previous kernel. Autotuned variant.
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
    ],
    key=['s1', 's2', 'H'],
)
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Grid and program IDs: axis 0 tiles the s1 x s1 output with grouped
    # ordering for L2 reuse; axis 1 enumerates (batch, head) pairs.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # fp32 accumulation over fp16 inputs.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Reduce over the shared s2 dimension.
    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # --- Coalesced Write ---
    # Write to a standard (bs, H, s1, s1) layout; tl.store casts to the
    # output tensor's dtype.
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
225
+
226
@torch.compile
def torch_pt2(left_final, right_final_t, bs, s1, s2, d, h, to_out_norm_weight, to_out_norm_bias, og_mh, to_out_weight):
    """Second half of the TriMul pipeline in plain PyTorch (torch.compile'd).

    Consumes the gated left [bs, h, s1, s2] and pre-transposed right
    [bs, h, s2, s1] projections from the Triton stage, performs the batched
    matmul, final LayerNorm, out-gate multiply, and output projection.
    """
    # (bs, h, s1, s2) @ (bs, h, s2, s1) -> (bs, h, s1, s1)
    bmm_out = torch.matmul(left_final, right_final_t)
    out_einsum_flat = bmm_out.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)
    # Apply layer norm and final gating
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * og_mh

    # Final projection
    final_out_flat = gated @ to_out_weight.t()
    # NOTE(review): final_out_flat has bs*s1*s1 rows, so this view is only
    # valid when s2 == s1 (it would raise otherwise) — confirm the dispatcher
    # guarantees square inputs, or change s2 to s1 here.
    final_out = final_out_flat.view(bs, s1, s2, d)
    return final_out
238
+
239
# Final TriMul stage, fully fused: per-row LayerNorm over H, sigmoid-gate
# multiply (gate precomputed), and projection H -> D. Autotuned variant.
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
    ],
    key=['H', 'D'],
)
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,  # M_gate = bs*s1*s2
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # --- Grid and PID Setup for Matmul ---
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose M back to (b, r, c) for reordering lookups.
    # NOTE(review): decomposition is over s1*s1 rows, i.e. assumes the square
    # (s1 == s2) case — confirm against the caller.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    # Pass 1: LayerNorm statistics over H in one sweep.
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)

        # Accumulate sum and sum of squares in one pass
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    # Finalize statistics (biased variance via E[x^2] - E[x]^2)
    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # --- Pass 3: Fused Gating and Matmul ---
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]

        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]

        # ProjW is indexed [D, H] — the un-transposed nn.Linear layout.
        proj_ptrs = ProjW_ptr + \
                    offs_n[None, :] * stride_proj_d + \
                    offs_k[:, None] * stride_proj_h

        # Gate already holds sigmoid(out_gate) from the first kernel.
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate

        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # --- Store Final Output ---
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d

    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
339
+
340
def compiledtrimul_fused_interleaved(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,  # Use the new weight matrices
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
) -> torch.Tensor:
    """Hybrid TriMul path: Triton for stage 1, torch.compile for the rest.

    Launches the autotuned fused LayerNorm + interleaved projection kernel,
    then hands the gated left/right projections and the precomputed out-gate
    to torch_pt2 for the batched matmul, final LayerNorm, gating and output
    projection.

    Args:
        x: input of shape [bs, s1, s2, d], read in float32.
        mask_mh: mask pre-expanded to [bs*s1*s2, h] in fp16.
        W_4way: packed [K, 4*h] interleaved projection weights.
        W_og: out-gate weight transposed to [K, h].
        h: hidden dimension H.

    Returns:
        fp16 tensor of shape [bs, s1, s2, d] (square inputs in practice).
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # Stage-1 outputs; right is materialized pre-transposed for the matmul.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # The grid is launched for the larger 4*H problem
    N_4way = 4 * H
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
    fused_ln_dual_matmul_kernel[grid](
        # Pointers (9)
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        # Metadata (5) - M, H, K, s1, s2
        M, H, K, s1, s2,
        # Strides (16)
        x_flat.stride(0), x_flat.stride(1),
        W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1),
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        # Constexpr (1)
        LN_EPS=1e-5
    )
    return torch_pt2(
        left_final, right_final_t,
        bs=bs,
        s1=s1,
        s2=s2,
        d=d,
        h=h,
        to_out_norm_weight=to_out_norm_weight,
        to_out_norm_bias=to_out_norm_bias,
        og_mh=og_mh,
        to_out_weight=to_out_weight
    )
392
+
393
def pack_w_4way_efficient(weights):
    """
    Pack the left/left-gate/right/right-gate projection weights into one tight
    [K, 4*H] fp16 matrix, interleaved per hidden unit (L0, LG0, R0, RG0, L1, ...).
    """
    names = ('left_proj.weight', 'left_gate.weight', 'right_proj.weight', 'right_gate.weight')
    stacked = torch.stack([weights[n] for n in names], dim=0)  # [4, H, K]
    hidden, in_dim = stacked.shape[1], stacked.shape[2]
    # [4, H, K] -> [H, 4, K] -> [4H, K]: rows interleave the four projections.
    interleaved = stacked.permute(1, 0, 2).contiguous().view(4 * hidden, in_dim)
    return interleaved.t().to(torch.float16)
403
+
404
def get_w_og(weights):
    """Return the out_gate weight transposed to [K, H] and cast to fp16."""
    return weights['out_gate.weight'].t().to(torch.float16)
408
+
409
+
410
# Opt in to reduced-precision matmul fast paths: TF32 tensor cores for fp32
# matmuls, and reduced-precision accumulation for fp16 matmuls. Both trade a
# small amount of accuracy for throughput.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
412
+
413
@torch.compile
def compiledtrimul(
    x: torch.Tensor,
    mask: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    w_concat: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int
) -> torch.Tensor:
    """
    A barebones, compiled PyTorch function for the TriMul logic.

    Args:
        x: [bs, s1, s2, d] fp32 input.
        mask: [bs, s1, s2, 1] mask (already unsqueezed by the caller).
        w_concat: [d, 5*h] fp16 concatenation of the five projection weights in
            the order left, right, left_gate, right_gate, out_gate.
        h: hidden dimension.

    Returns:
        [bs, s1, s2, d] output tensor (fp16 intermediate precision).
    """
    bs, s1, s2, d = x.shape

    # Initial LayerNorm
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
    all_projections = torch.mm(x_norm, w_concat)

    # Split back into individual projections
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)

    # Apply mask and gates
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)

    # Reshape for einsum
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    # Pairwise contraction over the second sequence axis: [bs, h, s1, s1].
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    # NOTE(review): bs * s1 * s1 relies on s1 == s2 (inputs here are square
    # [bs, seq_len, seq_len, d]); bs * s1 * s2 would be the general form — confirm.
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)

    # Apply layer norm and final gating
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate

    # Final projection
    final_out_flat = gated @ to_out_weight.t()
    final_out = final_out_flat.view(bs, s1, s2, d)

    return final_out
459
+
460
def small_kernel_pt_path(data):
    """Run the torch.compile'd TriMul path; used for small problem sizes."""
    input_tensor, mask, weights, config = data
    # Fuse the five projection matrices into one [d, 5h] fp16 operand so the
    # compiled path needs only a single large matmul. Order must match the
    # chunk(5) split inside compiledtrimul.
    proj_keys = (
        'left_proj.weight',
        'right_proj.weight',
        'left_gate.weight',
        'right_gate.weight',
        'out_gate.weight',
    )
    w_concat = torch.cat([weights[k] for k in proj_keys], dim=0)
    w_concat = w_concat.t().contiguous().to(torch.float16)
    # Call the compiled function with prepared weights
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float32),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float32),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
482
+
483
def kernel_h100(data):
    """
    H100 entry point: route small problems to the compiled-PyTorch path and
    large ones to the fused Triton + matmul pipeline.

    Args:
        data: (input_tensor [bs, s1, s2, d], mask [bs, s1, s2],
               weights dict, config dict with "hidden_dim").

    Returns:
        [bs, s1, s2, d] output tensor.
    """
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # Small sequence lengths go through the torch.compile path.
    if s1 <= 512:
        return small_kernel_pt_path(data)

    H = config["hidden_dim"]

    # Pre-pack weights for the fused kernel: interleaved 4-way projection and
    # transposed out_gate.
    W_4way = pack_w_4way_efficient(weights)
    W_og = get_w_og(weights)

    M = bs * s1 * s2
    # Pre-broadcast the [bs, s1, s2] mask to a flat [M, H] fp16 tensor.
    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, H).reshape(M, H).to(torch.float16) #move into kernel possibly

    return compiledtrimul_fused_interleaved(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_mh,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=W_4way, # Pass the new 4-way matrix
        W_og=W_og, # Pass the new out_gate matrix
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=H,
    )
build/torch-rocm/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .triton_a100 import kernel_a100
2
+ from .triton_h100 import kernel_h100
3
+ from .triton_b200 import kernel_b200
4
+ from .trimul_mi300 import kernel_mi300
5
+ from .trimul_global import kernel_global
6
+
7
+ __all__ = ["kernel_a100", "kernel_h100", "kernel_b200", "kernel_mi300", "kernel_global"]
build/torch-rocm/_ops.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import torch

# Handle to the ops registered under this build's unique namespace.
ops = torch.ops._trimul_gpumode_176b4e4
3
+
4
def add_op_namespace_prefix(op_name: str) -> str:
    """
    Prefix op by namespace.

    Example: "mm" -> "_trimul_gpumode_176b4e4::mm".
    """
    prefixed = f"_trimul_gpumode_176b4e4::{op_name}"
    return prefixed
build/torch-rocm/metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"python-depends":[]}
build/torch-rocm/task.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Type definitions for TriMul task.
3
+
4
+ Input: Tuple of (input_tensor, mask, weights, config)
5
+ - input_tensor: Input tensor of shape [batch_size, seq_len, seq_len, dim]
6
+ - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
7
+ - weights: Dictionary containing model weights
8
+ - config: Dictionary containing model configuration parameters
9
+
10
+ Output: Output tensor of shape [batch_size, seq_len, seq_len, dim]
11
+ """
12
+
13
+ import torch
14
+ from typing import Tuple, Dict, Any
15
+
16
# Input type: (input_tensor, mask, weights, config)
# - input_tensor: [batch_size, seq_len, seq_len, dim]
# - mask:         [batch_size, seq_len, seq_len]
# - weights:      name -> tensor mapping of model weights
# - config:       model configuration parameters (e.g. "hidden_dim")
input_t = Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], Dict[str, Any]]

# Output type: output tensor of shape [batch_size, seq_len, seq_len, dim]
output_t = torch.Tensor
build/torch-rocm/trimul_global.py ADDED
@@ -0,0 +1,971 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from utils import make_match_reference, DisableCuDNNTF32
2
+ from .task import input_t, output_t
3
+
4
+ import torch
5
+ from torch import nn, einsum
6
+ import math
7
+ import os
8
+ import requests
9
+
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ # The flag below controls whether to allow TF32 on matmul. This flag defaults to False
14
+ # in PyTorch 1.12 and later.
15
+ torch.backends.cuda.matmul.allow_tf32 = True
16
+
17
+ # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
18
+ torch.backends.cudnn.allow_tf32 = True
19
+
20
+ # Set allocator for TMA descriptors (required for on-device TMA)
21
def alloc_fn(size: int, alignment: int, stream=None):
    """Return a raw byte workspace buffer for Triton's on-device TMA descriptors."""
    workspace = torch.empty(size, device="cuda", dtype=torch.int8)
    return workspace
23
+
24
+ triton.set_allocator(alloc_fn)
25
+
26
+ # os.environ['TRITON_PRINT_AUTOTUNING'] = '1'
27
+ # os.environ['MLIR_ENABLE_DIAGNOSTICS'] = 'warnings,remarks'
28
+
29
+ # Reference code in PyTorch
30
class TriMul(nn.Module):
    """
    Reference (unoptimized) triangle-multiplicative update: LayerNorm, gated
    left/right projections, a pairwise contraction over the second sequence
    axis, output norm + gate, and a final projection back to `dim`.
    """
    # Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
    def __init__(
        self,
        dim: int,
        hidden_dim: int,
    ):
        super().__init__()

        self.norm = nn.LayerNorm(dim)

        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)

        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)

        self.to_out_norm = nn.LayerNorm(hidden_dim)
        self.to_out = nn.Linear(hidden_dim, dim, bias=False)

    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """
        x: [bs, seq_len, seq_len, dim]
        mask: [bs, seq_len, seq_len]

        Returns:
            output: [bs, seq_len, seq_len, dim]
        """
        batch_size, seq_len, _, dim = x.shape

        x = self.norm(x)

        left = self.left_proj(x)
        right = self.right_proj(x)

        # Zero out masked positions before gating.
        mask = mask.unsqueeze(-1)
        left = left * mask
        right = right * mask

        left_gate = self.left_gate(x).sigmoid()
        right_gate = self.right_gate(x).sigmoid()
        out_gate = self.out_gate(x).sigmoid()

        left = left * left_gate
        right = right * right_gate

        # Contract over the shared k axis for every (i, j) pair per channel.
        out = einsum('... i k d, ... j k d -> ... i j d', left, right)
        # This einsum is the same as the following:
        # out = torch.zeros(batch_size, seq_len, seq_len, dim, device=x.device)

        # # Compute using nested loops
        # for b in range(batch_size):
        #     for i in range(seq_len):
        #         for j in range(seq_len):
        #             # Compute each output element
        #             for k in range(seq_len):
        #                 out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]

        out = self.to_out_norm(out)
        out = out * out_gate
        return self.to_out(out)
+ return self.to_out(out)
92
+
93
@triton.jit
def triton_sigmoid(x):
    """
    Compute sigmoid function: 1 / (1 + exp(-x))

    Written out explicitly so it stays inside the Triton DSL.
    """
    return 1.0 / (1.0 + tl.exp(-x))
99
+
100
def two_mm_kernel_configs_wrapper():
    """
    Select a config factory for two_mm_kernel based on the GPU generation.

    The compute capability is probed when this wrapper is called, and the
    returned value is a zero-argument callable that produces a list of
    triton.Config candidates for triton.autotune on this device.

    Returns:
        Callable[[], list[triton.Config]]
    """
    if torch.cuda.get_device_capability() == (12, 0):
        # sm_120: sweep small tiles.
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [16, 32]:
                for BLOCK_N in [16, 32, 64]:
                    for BLOCK_K in [16, 32, 64]:
                        for num_stages in [2, 3]:
                            configs.append(triton.Config({
                                'BLOCK_M': BLOCK_M,
                                'BLOCK_N': BLOCK_N,
                                'BLOCK_K': BLOCK_K,
                                'GROUP_SIZE_M': 8
                            }, num_stages=num_stages, num_warps=8))
            return configs

    elif torch.cuda.get_device_capability()[0] == 9:
        # H100: short hand-picked candidate list.
        # NOTE(review): a nested per-(B, seq_len, dim) tuning table
        # (get_optimal_two_mm_config_h100) was defined here but never called
        # or returned; it has been removed as dead code. Recover it from
        # version history if shape-keyed config selection is ever wired up.
        def two_mm_kernel_configs():
            return [
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
            ]

    # NOTE(review): an `elif torch.cuda.get_device_capability()[0] == 10 and False:`
    # branch was removed — the `and False` made it unreachable and it contained
    # only unused helpers. sm_100 (B200) therefore intentionally falls through
    # to the generic sweep below, exactly as before.
    elif torch.cuda.get_device_capability()[0] == 8:
        # A100: narrow BLOCK_K, vary stages and warps.
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [64]:
                for BLOCK_N in [64, 128]:
                    for BLOCK_K in [16]:
                        for num_stages in [3, 4]:
                            for num_warps in [4, 8]:
                                configs.append(triton.Config({
                                    'BLOCK_M': BLOCK_M,
                                    'BLOCK_N': BLOCK_N,
                                    'BLOCK_K': BLOCK_K,
                                    'GROUP_SIZE_M': 8
                                }, num_stages=num_stages, num_warps=num_warps))
            return configs
    else:
        # Generic fallback sweep for any other architecture.
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [64, 128]:
                for BLOCK_N in [64, 128]:
                    for BLOCK_K in [64, 128]:
                        for num_stages in [2, 3]:
                            configs.append(triton.Config({
                                'BLOCK_M': BLOCK_M,
                                'BLOCK_N': BLOCK_N,
                                'BLOCK_K': BLOCK_K,
                                'GROUP_SIZE_M': 8
                            }, num_stages=num_stages, num_warps=8))
            return configs

    return two_mm_kernel_configs
271
+
272
def two_mm_kernel_wrapper():
    """
    Build the fused five-matmul Triton kernel for the current GPU and return it.

    Two variants are compiled: a plain tl.load persistent kernel for sm_8x
    (A100), and a TMA-descriptor persistent kernel for everything else. Both
    compute A @ B{1..5}.T in one pass and fuse masking, sigmoid gating, and
    stores of the gated left/right projections (C1, C2) and out_gate (D).
    """
    if torch.cuda.get_device_capability()[0] == 8:
        @triton.jit
        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
            # Persistent kernel using standard tl.load operations
            start_pid = tl.program_id(axis=0)
            num_pid_m = tl.cdiv(M, BLOCK_M)
            num_pid_n = tl.cdiv(N, BLOCK_N)
            k_tiles = tl.cdiv(K, BLOCK_K)
            num_tiles = num_pid_m * num_pid_n

            # tile_id_c is used in the epilogue to break the dependency between
            # the prologue and the epilogue
            tile_id_c = start_pid - NUM_SMS
            num_pid_in_group = GROUP_SIZE_M * num_pid_n

            # Persistent loop over tiles
            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
                # Calculate PID for this tile using improved swizzling
                group_id = tile_id // num_pid_in_group
                first_pid_m = group_id * GROUP_SIZE_M
                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
                pid_m = first_pid_m + (tile_id % group_size_m)
                pid_n = (tile_id % num_pid_in_group) // group_size_m

                # Calculate block offsets
                offs_am = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
                offs_bn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
                offs_k = tl.arange(0, BLOCK_K)

                # Initialize accumulators for all outputs
                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

                # Main computation loop over K dimension
                for ki in range(k_tiles):
                    k_start = ki * BLOCK_K
                    k_offsets = k_start + offs_k

                    # Create pointers for A matrix (2D flattened view)
                    a_ptrs = a_ptr + offs_am[:, None] * stride_a2 + k_offsets[None, :] * stride_a3
                    a_mask = (offs_am[:, None] < M) & (k_offsets[None, :] < K)

                    # Create pointers for B matrices [N, K] layout
                    b1_ptrs = b1_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b2_ptrs = b2_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b3_ptrs = b3_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b4_ptrs = b4_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b5_ptrs = b5_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
                    b_mask = (offs_bn[:, None] < N) & (k_offsets[None, :] < K)

                    # Load blocks from A and all weight matrices using standard tl.load
                    a = tl.load(a_ptrs, mask=a_mask, other=0.0)
                    b1 = tl.load(b1_ptrs, mask=b_mask, other=0.0)
                    b2 = tl.load(b2_ptrs, mask=b_mask, other=0.0)
                    b3 = tl.load(b3_ptrs, mask=b_mask, other=0.0)
                    b4 = tl.load(b4_ptrs, mask=b_mask, other=0.0)
                    b5 = tl.load(b5_ptrs, mask=b_mask, other=0.0)

                    # Perform matrix multiplications using TF32
                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True) # A @ B1.T
                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True) # A @ B2.T
                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True) # A @ B3.T
                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True) # A @ B4.T
                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True) # A @ B5.T

                # Store results using separate tile_id_c for epilogue
                tile_id_c += NUM_SMS
                group_id = tile_id_c // num_pid_in_group
                first_pid_m = group_id * GROUP_SIZE_M
                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
                pid_m = first_pid_m + (tile_id_c % group_size_m)
                pid_n = (tile_id_c % num_pid_in_group) // group_size_m

                # Calculate output offsets and pointers
                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

                # Create masks for bounds checking
                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)

                # Calculate pointer addresses using 4D strides
                stride_cm = stride_c2  # Stride to next element in flattened M dimension
                stride_cn = stride_c3  # N is the innermost dimension

                # For D tensor: use separate D strides
                stride_dm = stride_d2  # Stride to next element in flattened M dimension
                stride_dn = stride_d3  # N is the innermost dimension

                # Decompose the flat row index into (batch, row, col) of the 4D output.
                off_c_batch = offs_cm // (seq_len * seq_len)
                off_c_sl1 = (offs_cm // seq_len) % seq_len
                off_c_sl2 = offs_cm % seq_len
                off_c_dim = offs_cn

                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
                c_mask = d_mask

                c1_ptrs = c1_ptr + c_offsets
                c2_ptrs = c2_ptr + c_offsets
                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]

                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))

                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
                # Apply masking only to left_proj and right_proj results (C1, C2)
                accumulator1 = tl.where(mask_2d, accumulator1, 0)
                accumulator2 = tl.where(mask_2d, accumulator2, 0)

                # Apply sigmoid to gate values
                left_gate_sigmoid = triton_sigmoid(accumulator3)
                right_gate_sigmoid = triton_sigmoid(accumulator4)
                accumulator_d = triton_sigmoid(accumulator_d)

                # Apply elementwise multiplication with gated values
                # C1 = left * left_gate, C2 = right * right_gate
                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate

                # Convert to appropriate output dtype and store with normal tl.store
                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
                d = accumulator_d.to(d_ptr.dtype.element_ty)

                tl.store(c1_ptrs, c1, mask=c_mask)
                tl.store(c2_ptrs, c2, mask=c_mask)
                tl.store(d_ptrs, d, mask=d_mask)
    else:
        @triton.jit
        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
            # Persistent kernel using on-device TMA descriptors
            start_pid = tl.program_id(axis=0)
            num_pid_m = tl.cdiv(M, BLOCK_M)
            num_pid_n = tl.cdiv(N, BLOCK_N)
            k_tiles = tl.cdiv(K, BLOCK_K)
            num_tiles = num_pid_m * num_pid_n

            # Create on-device TMA descriptors
            a_desc = tl._experimental_make_tensor_descriptor(
                a_ptr,
                shape=[M, K],
                strides=[stride_a2, stride_a3],
                block_shape=[BLOCK_M, BLOCK_K],
            )
            b1_desc = tl._experimental_make_tensor_descriptor(
                b1_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )
            b2_desc = tl._experimental_make_tensor_descriptor(
                b2_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )
            b3_desc = tl._experimental_make_tensor_descriptor(
                b3_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )
            b4_desc = tl._experimental_make_tensor_descriptor(
                b4_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )
            b5_desc = tl._experimental_make_tensor_descriptor(
                b5_ptr,
                shape=[N, K],
                strides=[stride_bn, stride_bk],
                block_shape=[BLOCK_N, BLOCK_K],
            )

            # tile_id_c is used in the epilogue to break the dependency between
            # the prologue and the epilogue
            tile_id_c = start_pid - NUM_SMS
            num_pid_in_group = GROUP_SIZE_M * num_pid_n

            # Persistent loop over tiles
            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
                # Calculate PID for this tile using improved swizzling
                group_id = tile_id // num_pid_in_group
                first_pid_m = group_id * GROUP_SIZE_M
                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
                pid_m = first_pid_m + (tile_id % group_size_m)
                pid_n = (tile_id % num_pid_in_group) // group_size_m

                # Calculate block offsets
                offs_am = pid_m * BLOCK_M
                offs_bn = pid_n * BLOCK_N

                # Initialize accumulators for all outputs
                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

                # Main computation loop over K dimension
                for ki in range(k_tiles):
                    offs_k = ki * BLOCK_K
                    # Load blocks from A and all weight matrices using on-device TMA
                    a = a_desc.load([offs_am, offs_k])
                    b1 = b1_desc.load([offs_bn, offs_k])
                    b2 = b2_desc.load([offs_bn, offs_k])
                    b3 = b3_desc.load([offs_bn, offs_k])
                    b4 = b4_desc.load([offs_bn, offs_k])
                    b5 = b5_desc.load([offs_bn, offs_k])

                    # Perform matrix multiplications using TF32
                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True) # A @ B1.T
                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True) # A @ B2.T
                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True) # A @ B3.T
                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True) # A @ B4.T
                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True) # A @ B5.T

                # Store results using separate tile_id_c for epilogue
                tile_id_c += NUM_SMS
                group_id = tile_id_c // num_pid_in_group
                first_pid_m = group_id * GROUP_SIZE_M
                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
                pid_m = first_pid_m + (tile_id_c % group_size_m)
                pid_n = (tile_id_c % num_pid_in_group) // group_size_m

                # Calculate output offsets and pointers
                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

                # Create masks for bounds checking
                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)

                # Calculate pointer addresses using 4D strides
                # For C tensors: compute effective 2D strides from 4D strides
                # Output tensor is [B, I, J, N], flattened to [M, N] where M = B*I*J
                stride_cm = stride_c2  # Stride to next element in flattened M dimension
                stride_cn = stride_c3  # N is the innermost dimension

                # For D tensor: use separate D strides
                stride_dm = stride_d2  # Stride to next element in flattened M dimension
                stride_dn = stride_d3  # N is the innermost dimension

                off_c_batch = offs_cm // (seq_len * seq_len)
                off_c_sl1 = (offs_cm // seq_len) % seq_len
                off_c_sl2 = offs_cm % seq_len
                off_c_dim = offs_cn

                # TODO update the mask_c so we don't IMA
                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
                # c_offsets = offs_cm[:, None] * stride_c2 + offs_cn[None, :] * stride_c3
                c_mask = d_mask

                c1_ptrs = c1_ptr + c_offsets
                c2_ptrs = c2_ptr + c_offsets
                # c1_ptrs = c1_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
                # c2_ptrs = c2_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]

                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))

                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
                # Apply masking only to left_proj and right_proj results (C1, C2)
                accumulator1 = tl.where(mask_2d, accumulator1, 0)
                accumulator2 = tl.where(mask_2d, accumulator2, 0)

                # Apply sigmoid to gate values
                left_gate_sigmoid = triton_sigmoid(accumulator3)
                right_gate_sigmoid = triton_sigmoid(accumulator4)
                accumulator_d = triton_sigmoid(accumulator_d)

                # Apply elementwise multiplication with gated values
                # C1 = left * left_gate, C2 = right * right_gate
                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate

                # Convert to appropriate output dtype and store with normal tl.store
                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
                d = accumulator_d.to(d_ptr.dtype.element_ty)

                tl.store(c1_ptrs, c1, mask=c_mask)
                tl.store(c2_ptrs, c2, mask=c_mask)
                tl.store(d_ptrs, d, mask=d_mask)


    # NOTE(review): get_device_capability()[0] is an int major version, so the
    # float 10.2 can never match — effectively only capability 9 (H100) skips
    # autotuning here. Confirm whether [9, 10] was intended.
    if torch.cuda.get_device_capability()[0] not in [9, 10.2]:
        two_mm_kernel = triton.autotune(
            (two_mm_kernel_configs_wrapper())(), key=["M", "N", "K"]
        )(two_mm_kernel)

    return two_mm_kernel
568
+
569
+
570
+ def two_mm(A, left_proj, right_proj, left_gate, right_gate, out_gate, mask):
571
+ """
572
+ Persistent matrix multiplication for all weight matrices using on-device TMA descriptors.
573
+
574
+ Args:
575
+ A: [..., K] tensor (arbitrary leading dimensions)
576
+ left_proj: [N, K] matrix (will be transposed)
577
+ right_proj: [N, K] matrix (will be transposed)
578
+ left_gate: [N, K] left gate weight matrix
579
+ right_gate: [N, K] right gate weight matrix
580
+ out_gate: [N, K] output gate weight matrix
581
+ mask: mask tensor
582
+
583
+ Returns:
584
+ (C1, C2, D): Tuple of result tensors [..., N] with same leading dims as A
585
+ C1 = (A @ left_proj.T) * sigmoid(A @ left_gate.T) (masked)
586
+ C2 = (A @ right_proj.T) * sigmoid(A @ right_gate.T) (masked)
587
+ D = sigmoid(A @ out_gate.T) (unmasked)
588
+ """
589
+ # Check constraints
590
+ assert A.shape[-1] == left_proj.shape[1] == right_proj.shape[1], "Incompatible K dimensions"
591
+ assert A.dtype == left_proj.dtype == right_proj.dtype, "Incompatible dtypes"
592
+
593
+ # Assert that all weight matrices have the same strides (same [N, K] shape)
594
+ assert left_proj.stride() == right_proj.stride() == left_gate.stride() == right_gate.stride() == out_gate.stride(), \
595
+ "All weight matrices must have identical strides"
596
+
597
+ # Get dimensions
598
+ original_shape = A.shape[:-1] # All dimensions except the last
599
+ K = A.shape[-1]
600
+ N = left_proj.shape[0]
601
+ B, seq_len, _, _ = A.shape
602
+ dtype = A.dtype
603
+
604
+ # Flatten A to 2D for kernel processing
605
+ A_2d = A.view(-1, K) # [M, K] where M is product of all leading dims
606
+ M = A_2d.shape[0]
607
+
608
+ # Get number of streaming multiprocessors
609
+ NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
610
+
611
+ # Launch persistent kernel with limited number of blocks
612
+ grid = lambda META: (min(NUM_SMS, triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"])),)
613
+
614
+ # Get original 4D strides for A and output tensors
615
+ A_strides = A.stride() # (stride_0, stride_1, stride_2, stride_3)
616
+
617
+ # Create output tensors with proper 4D shape to get correct strides
618
+ output_shape = original_shape + (N,)
619
+ # C1 = torch.empty(output_shape, device=A.device, dtype=dtype)
620
+ # C2 = torch.empty(output_shape, device=A.device, dtype=dtype)
621
+ C1 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
622
+ C2 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
623
+ D = torch.empty(output_shape, device=A.device, dtype=torch.float16)
624
+
625
+ C_strides = C1.stride() # (stride_0, stride_1, stride_2, stride_3)
626
+ D_strides = D.stride() # (stride_0, stride_1, stride_2, stride_3)
627
+
628
+ # Use optimal configuration for B200/H100 or fallback to autotuning for other GPUs
629
+ if torch.cuda.get_device_capability()[0] == 10:
630
+ # Get optimal configuration for B200
631
+ BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
632
+ grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))
633
+
634
+ two_mm_kernel_wrapper()[(grid_size,)](
635
+ A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
636
+ C1, C2, D, mask,
637
+ M, N, K,
638
+ *A_strides, # 4D strides for A
639
+ left_proj.stride(1), left_proj.stride(0), # B matrices [N, K] shape strides
640
+ *C_strides, # 4D strides for C
641
+ seq_len,
642
+ *D_strides, # 4D strides for D
643
+ BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
644
+ num_stages=num_stages, num_warps=num_warps
645
+ )
646
+ elif torch.cuda.get_device_capability()[0] == 9:
647
+ # Get optimal configuration for H100
648
+ BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
649
+ grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))
650
+
651
+ two_mm_kernel_wrapper()[(grid_size,)](
652
+ A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
653
+ C1, C2, D, mask,
654
+ M, N, K,
655
+ *A_strides, # 4D strides for A
656
+ left_proj.stride(1), left_proj.stride(0), # B matrices [N, K] shape strides
657
+ *C_strides, # 4D strides for C
658
+ seq_len,
659
+ *D_strides, # 4D strides for D
660
+ BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
661
+ num_stages=num_stages, num_warps=num_warps
662
+ )
663
+ else:
664
+ # Use autotuning for other GPUs
665
+ two_mm_kernel_wrapper()[grid](
666
+ A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
667
+ C1, C2, D, mask,
668
+ M, N, K,
669
+ *A_strides, # 4D strides for A
670
+ left_proj.stride(1), left_proj.stride(0), # B matrices [N, K] shape strides
671
+ *C_strides, # 4D strides for C
672
+ seq_len,
673
+ *D_strides, # 4D strides for D
674
+ NUM_SMS=NUM_SMS
675
+ )
676
+
677
+ return C1, C2, D
678
+
679
+
680
def second_layernorm_mul(inp, hidden_dim, weight, bias, mul_operand):
    """Layer-normalize `inp` over its last `hidden_dim` channels, then scale elementwise by `mul_operand`."""
    # Cast the affine parameters to the input dtype so layer_norm does not
    # promote/demote the activation dtype.
    w = weight.to(inp.dtype)
    b = bias.to(inp.dtype)
    normed = torch.nn.functional.layer_norm(inp, (hidden_dim,), weight=w, bias=b, eps=1e-5)
    return normed * mul_operand
684
+
685
+ '''
686
+ @triton.autotune(
687
+ [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=4, num_stages=3)],
688
+ key=["R", "C"]
689
+ )
690
+ '''
691
@triton.jit
def layernorm_kernel_first(
    X,
    Y,
    Weight,
    Bias,
    R,
    C,  # aka "dim"
    eps,
    ROW_BLOCK_SIZE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    """Row-wise LayerNorm over the last dimension of a contiguous [R, C] view.

    Each program normalizes ROW_BLOCK_SIZE rows at once; BLOCK_SIZE is the
    next power of two >= C, so one tile covers a whole row. Assumes X and Y
    are contiguous with row stride C (the launcher flattens [B, s, s, C]).
    """
    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
    cols = tl.arange(0, BLOCK_SIZE)

    mask_row = row < R
    mask_col = cols < C

    # Simple indexing for contiguous data; stats are computed in fp32.
    x = tl.load(
        X + row[:, None] * C + cols[None, :],
        mask=mask_row[:, None] & mask_col[None, :],
        other=0.0
    ).to(tl.float32)

    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)

    mean = tl.sum(x, axis=1) / C
    # Zero out padded lanes so they do not contribute to the variance.
    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
    var = tl.sum(diff * diff, axis=1) / C
    rstd = 1 / tl.sqrt(var + eps)

    y_hat = (x - mean[:, None]) * rstd[:, None]
    y = y_hat * weight[None, :] + bias[None, :]

    tl.store(
        Y + row[:, None] * C + cols[None, :],
        y,
        mask=mask_row[:, None] & mask_col[None, :]
    )
732
+
733
+
734
def get_optimal_config_ln(dim):
    """Pick (ROW_BLOCK_SIZE, num_warps) for the first layernorm kernel.

    Hopper (compute capability 9.x) gets a warp count scaled with `dim`;
    every other architecture (and dims above 1024 on Hopper) falls back
    to the default (16, 4).
    """
    row_block = 16
    if torch.cuda.get_device_capability()[0] == 9:
        for upper_bound, warps in ((256, 1), (512, 2), (1024, 4)):
            if dim <= upper_bound:
                return (row_block, warps)
    return (row_block, 4)
747
+
748
+
749
def triton_layernorm_first(x, weight, bias, eps=1e-5, num_warps=None, ROW_BLOCK_SIZE=None):
    """Launch layernorm_kernel_first over x viewed as [B*s*s, dim] rows.

    Returns a new float16 tensor of the same shape as x. If num_warps or
    ROW_BLOCK_SIZE is not given, both are taken from get_optimal_config_ln.
    """
    B, seq_len, seq_len2, dim = x.shape
    assert(seq_len == seq_len2)

    R = B * seq_len * seq_len
    C = dim

    # Output is forced to fp16 regardless of the input dtype.
    out = torch.empty_like(x, dtype=torch.float16)

    if not num_warps or not ROW_BLOCK_SIZE:
        ROW_BLOCK_SIZE, num_warps = get_optimal_config_ln(dim)

    # One tile must span a full row: BLOCK_SIZE is the power-of-2 ceiling of C.
    BLOCK_SIZE = triton.next_power_of_2(C)
    assert(BLOCK_SIZE <= 1024)

    def grid(meta):
        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)

    layernorm_kernel_first[grid](
        x, out, weight, bias,
        R, C, eps,
        ROW_BLOCK_SIZE=ROW_BLOCK_SIZE,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        num_stages=3
    )

    return out
777
+
778
+ '''
779
+ def triton_layernorm_first(x, weight, bias, eps=1e-5):
780
+ B, seq_len, seq_len2, dim = x.shape
781
+ assert(seq_len == seq_len2)
782
+
783
+ R = B * seq_len * seq_len
784
+ C = dim
785
+
786
+ out = torch.empty_like(x)
787
+
788
+ BLOCK_SIZE = triton.next_power_of_2(C)
789
+ assert(BLOCK_SIZE <= 1024)
790
+
791
+ def grid(meta):
792
+ return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
793
+
794
+ layernorm_kernel_first[grid](
795
+ x, out, weight, bias,
796
+ R, C, eps,
797
+ BLOCK_SIZE=BLOCK_SIZE
798
+ )
799
+
800
+ return out
801
+ '''
802
+
803
+
804
@triton.autotune(
    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=1, num_stages=3)],
    key=[]
)
@triton.jit
def layernorm_kernel_eltwise(
    X,
    Y,
    Weight,
    Bias,
    OutGate,
    seq_len,
    stride_batch,
    stride_dim,
    R,
    C,  # aka "dim"
    eps,
    ROW_BLOCK_SIZE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
):
    """Fused LayerNorm(x) * out_gate over the last dimension.

    X is read with explicit strides: the launcher passes a tensor whose
    channel axis is NOT innermost (stride_dim == seq_len*seq_len, i.e. a
    permuted channels-first layout), while Y and OutGate are addressed as
    contiguous [R, C] row-major.
    """
    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
    cols = tl.arange(0, BLOCK_SIZE)

    # Calculate base pointer for this batch of rows. A row block must never
    # straddle a batch boundary, hence the divisibility check.
    tl.device_assert(seq_len*seq_len % ROW_BLOCK_SIZE == 0)
    # batch_offset = (row // (stride_seq1 // stride_dim)) * stride_batch
    batch = tl.program_id(0) * ROW_BLOCK_SIZE // (seq_len * seq_len)
    seqs_off = row % (seq_len * seq_len)  # TODO is this going to prevent vectorization

    off_r = batch * stride_batch + seqs_off
    off_c = cols * stride_dim

    mask_row = row < R
    mask_col = cols < C

    # OutGate is contiguous [R, C]; load the gate tile up front.
    out_gate = tl.load(
        OutGate + row[:, None] * C + cols[None, :],
        mask = mask_row[:, None] & mask_col[None, :],
    )

    x = tl.load(
        X + off_r[:, None] + off_c[None, :],
        mask=mask_row[:, None] & mask_col[None, :],
        other=0.0
    ).to(tl.float32)

    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)

    # fp32 mean/variance with padded lanes masked out of the variance sum.
    mean = tl.sum(x, axis=1) / C
    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
    var = tl.sum(diff * diff, axis=1) / C
    rstd = 1 / tl.sqrt(var + eps)

    y_hat = (x - mean[:, None]) * rstd[:, None]
    y = y_hat * weight[None, :] + bias[None, :]

    # Fused epilogue: multiply by the (already sigmoided) output gate.
    tl.store(
        Y + row[:, None] * C + cols[None, :],
        y * out_gate,
        mask=mask_row[:, None] & mask_col[None, :]
    )
866
+
867
+
868
def triton_layernorm_eltwise(x, weight, bias, out_gate, eps=1e-5):
    """Launch layernorm_kernel_eltwise: LayerNorm(x) * out_gate, fp32 output.

    x must come from the channels-first bmm output permuted to [B, s, s, C]
    (so its channel stride is s*s); out_gate must be contiguous [B, s, s, C].
    """
    B, seq_len, seq_len2, dim = x.shape
    assert(seq_len == seq_len2)
    R = B * seq_len * seq_len
    # Channel axis stride of s*s confirms the permuted channels-first layout
    # that the kernel's addressing scheme relies on.
    assert(x.stride(3) == seq_len*seq_len)
    assert(out_gate.is_contiguous())
    C = dim

    # Output inherits out_gate's (contiguous) layout, promoted to fp32.
    out = torch.empty_like(out_gate, dtype=torch.float32)

    BLOCK_SIZE = triton.next_power_of_2(C)
    # Specialized for hidden_dim == 128 — confirm before reusing elsewhere.
    assert(BLOCK_SIZE == 128)

    def grid(meta):
        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)

    layernorm_kernel_eltwise[grid](
        x, out, weight, bias, out_gate,
        seq_len,
        x.stride(0), x.stride(3),
        R, C, eps,
        BLOCK_SIZE=BLOCK_SIZE
    )

    return out
893
+
894
+
895
def kernel_global(data: input_t) -> output_t:
    """
    TriMul forward pass (triangular multiplicative update), fused pipeline:
      1) LayerNorm on the input (Triton kernel, fp16 out)
      2) one fused persistent matmul producing gated left/right projections
         and the sigmoided output gate (two_mm)
      3) batched matmul realizing einsum('...ikd,...jkd->...ijd', left, right)
      4) fused LayerNorm * out_gate (Triton kernel)
      5) final output projection

    Args:
        data: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
        - input: Input tensor of shape [batch_size, seq_len, seq_len, dim]
        - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
        - weights: Dictionary containing model weights
        - config: Dictionary containing model configuration parameters
    """
    input_tensor, mask, weights, config = data

    # Weights are cast to fp16 once up front; the fused matmul runs in fp16.
    left_proj_weight = weights["left_proj.weight"].to(torch.float16)
    right_proj_weight = weights["right_proj.weight"].to(torch.float16)
    left_gate_weight = weights["left_gate.weight"].to(torch.float16)
    right_gate_weight = weights["right_gate.weight"].to(torch.float16)
    out_gate_weight = weights["out_gate.weight"].to(torch.float16)

    hidden_dim = config["hidden_dim"]
    # trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)

    x = input_tensor

    batch_size, seq_len, _, dim = x.shape

    x = triton_layernorm_first(x, weights['norm.weight'], weights['norm.bias'])
    # x = torch.nn.functional.layer_norm(x, (dim,), eps=1e-5, weight=weights['norm.weight'], bias=weights['norm.bias'])

    # left/right come back already masked and gated; out_gate already sigmoided.
    left, right, out_gate = two_mm(x, left_proj_weight, right_proj_weight, left_gate_weight, right_gate_weight, out_gate_weight, mask)
    # left = torch.nn.functional.linear(x, weights['left_proj.weight'].to(torch.float16))
    # right = torch.nn.functional.linear(x, weights['right_proj.weight'].to(torch.float16))

    # left = left * mask.unsqueeze(-1)
    # right = right * mask.unsqueeze(-1)

    '''
    left = left.to(torch.float32)
    right = right.to(torch.float32)
    x = x.to(torch.float32)

    left_gate = left_gate.sigmoid()
    right_gate = right_gate.sigmoid()
    out_gate = out_gate.sigmoid()
    '''

    # Elementwise multiplication now handled in kernel
    # left = left * left_gate
    # right = right * right_gate

    # out = einsum('... i k d, ... j k d -> ... i j d', left, right)
    # Implemented as bmm over B*hidden_dim channel-slices of size [s, s];
    # relies on the channels-first storage of left/right created in two_mm.
    out = torch.bmm(left.permute(0, 3, 1, 2).view(-1, left.shape[1], left.shape[2]), right.permute(0, 3, 2, 1).view(-1, right.shape[2], right.shape[1]))
    out = out.view(batch_size, hidden_dim, seq_len, seq_len).permute(0, 2, 3, 1)

    # out = torch.compile(second_layernorm_mul, dynamic=False)(out, hidden_dim, weights['to_out_norm.weight'], weights['to_out_norm.bias'], out_gate)
    out = triton_layernorm_eltwise(out, weights['to_out_norm.weight'], weights['to_out_norm.bias'], out_gate)
    # out = torch.nn.functional.layer_norm(out, (hidden_dim,), eps=1e-5, weight=weights['to_out_norm.weight'].to(out.dtype), bias=weights['to_out_norm.bias'].to(out.dtype))
    # out = out * out_gate
    return torch.nn.functional.linear(out, weights['to_out.weight'])
954
+
955
+ '''
956
+ # Fill in the given weights of the model
957
+ trimul.norm.weight = nn.Parameter(weights['norm.weight'])
958
+ trimul.norm.bias = nn.Parameter(weights['norm.bias'])
959
+ trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
960
+ trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
961
+ trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
962
+ trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
963
+ trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
964
+ trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
965
+ trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'])
966
+ trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
967
+
968
+ output = trimul(input_tensor, mask)
969
+
970
+ return output
971
+ '''
build/torch-rocm/trimul_gpumode/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch-rocm/trimul_mi300.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ torch.backends.cuda.matmul.allow_tf32 = True
7
+ torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),

        # Configurations with larger block sizes for better data reuse
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),

        # Configurations with deeper K dimension
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),

        # More extreme configurations to test the limits
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=2),

        # Configurations with fewer warps
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=2),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
    ],
    key=['M', 'N', 'K'],
)
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (from decorator and kwargs)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    """Fused LayerNorm + projection matmuls for TriMul (MI300 path).

    Computes, per row of the flattened [M, K] input:
      - LayerNorm stats (mean/rstd) over K,
      - one [M, 4H] matmul against W_4way whose output columns interleave
        (left_proj, left_gate, right_proj, right_gate) per hidden unit —
        inferred from the role_idx de-interleave below; confirm the weight
        packing on the host side,
      - for the first H columns, a second matmul against W_og producing the
        sigmoided output gate,
    then stores masked/gated left and right projections into channels-first
    layouts (right in transposed (s2, s1) order, per the stride names).
    NOTE(review): this assumes BLOCK_SIZE_N == 4 * H_CHUNK_SIZE so the
    reshape to (BLOCK_SIZE_M, H_CHUNK_SIZE, 4) is exact — verify launcher.
    """
    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # Two-pass mean/variance in fp32, streamed over K in BLOCK_SIZE_K chunks.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        # Apply the affine LayerNorm on the fly; fp16 tiles feed the dot.
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Some threads should calculate out_gate (only tiles covering the
        # first H output columns do this extra dot).
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    if pid_n * BLOCK_SIZE_N < H:
        # Store the already-sigmoided output gate as a flat [M, H] tensor.
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # De-interleave the 4H accumulator into the four per-hidden-unit roles.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose flattened row index into (batch, s1, s2) for the 4D stores.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)  # s2 offset uses s2 stride, s1 offset uses s1 stride
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
163
+
164
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
    ],
    key=['s1', 's2', 'H'],
)
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Batched [s1, s2] x [s2, s1] matmul, one (batch, hidden) slice per
    program on axis 1, contracting over s2.

    Left/Right are addressed purely through their strides, so Right can be
    stored transposed (its s2 stride is named first) without a copy. The
    result is written to a standard (bs, H, s1, s1) layout for coalescing.
    """
    # Grid and program IDs (grouped ordering over the s1 x s1 output tiles
    # for L2 reuse)
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Axis 1 enumerates all (batch, hidden-channel) pairs.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # K-loop over the shared s2 dimension.
    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # --- Coalesced Write ---
    # Write to a standard (bs, H, s1, s1) layout
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
236
+
237
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
    ],
    key=['H', 'D'],
)
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,  # M_gate = bs*s1*s2
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Fused final TriMul stage.

    For each flattened output row m: LayerNorm over the H channels of the
    (bs, H, s1, s1) pairwise product, scale/shift by NormW/NormB, multiply
    elementwise by the precomputed out-gate (Gate_ptr, already sigmoided by
    the first kernel), then project H -> D via ProjW into the
    (bs, s1, s1, D) output.

    NOTE(review): row indices are decomposed by s1*s1 but the launcher
    passes M = bs*s1*s2 — this presumes s1 == s2; confirm against callers.
    """
    # --- Grid and PID Setup for Matmul ---
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    # Grouped ("super-grouped") tile ordering to improve L2 reuse.
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose M back to (b, r, c) for reordering lookups
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # Pass over H to collect LayerNorm statistics.
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)

        # Accumulate sum and sum of squares in one pass
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    # Finalize statistics
    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # --- Pass 3: Fused Gating and Matmul ---
    # Re-load each H chunk, normalize, gate, and feed it straight into the
    # H -> D projection so the normalized activations never hit global memory.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]

        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]

        proj_ptrs = ProjW_ptr + \
            offs_n[None, :] * stride_proj_d + \
            offs_k[:, None] * stride_proj_h

        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate

        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # --- Store Final Output ---
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d

    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
340
+
341
def compiledtrimul_fused_interleaved(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,  # Use the new weight matrices
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Run the three-kernel Triton TriMul pipeline.

    Kernel 1 (fused_ln_dual_matmul_kernel): LayerNorm over the last dim of
    ``x`` fused with the interleaved 4-way projection (left/right proj+gate,
    from ``W_4way``) and the out-gate projection (``W_og``); writes the
    masked, gated left/right operands plus the sigmoided out-gate.
    Kernel 2 (bmm_coalesced_kernel): batched pairwise product per
    (batch, hidden) pair, reducing over s2.
    Kernel 3 (fused_final_kernel): LayerNorm over H, out-gate multiply and
    the final H -> d projection.

    NOTE(review): the bmm output is (bs, H, s1, s1) while kernel 3 is
    launched with M = bs*s1*s2 rows — this presumes s1 == s2; confirm.
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # fp16 intermediates so the subsequent matmuls run on tensor cores.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # The grid is launched for the larger 4*H problem
    N_4way = 4 * H
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
    fused_ln_dual_matmul_kernel[grid](
        # Pointers (9)
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        # Metadata (5) - M, H, K, s1, s2
        M, H, K, s1, s2,
        # Strides (16)
        x_flat.stride(0), x_flat.stride(1),
        W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1),
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        # Constexpr (1)
        LN_EPS=1e-5
    )

    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)

    # One program per output tile on axis 0; axis 1 enumerates bs*H problems.
    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
    bmm_coalesced_kernel[grid_bmm](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
    )

    # --- Kernel 3: Fully Fused Final Stage ---
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)

    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
    fused_final_kernel[grid_final](
        # Pointers
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        # Metadata
        M, H, d, s1,
        # Strides
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        to_out_weight.stride(0), to_out_weight.stride(1),  # Use strides of the corrected tensor
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        # Constants
        LN_EPS=1e-5,
    )

    return final_out
412
+
413
def pack_w_4way_efficient(weights):
    """Pack the left/right projection and gate weights into one interleaved
    [K, 4*H] fp16 matrix (column order per hidden channel h: L, LG, R, RG)."""
    role_keys = ('left_proj.weight', 'left_gate.weight',
                 'right_proj.weight', 'right_gate.weight')
    stacked = torch.stack([weights[key] for key in role_keys], dim=0)
    h_dim, in_dim = stacked.shape[1], stacked.shape[2]
    # (4, H, K) -> (H, 4, K): the four roles become adjacent per channel,
    # matching the de-interleave step in the fused kernel.
    interleaved = stacked.permute(1, 0, 2).contiguous().view(4 * h_dim, in_dim)
    return interleaved.t().to(torch.float16)
423
+
424
def get_w_og(weights):
    """Return the out-gate weight transposed to [K, H] in fp16."""
    return weights['out_gate.weight'].to(torch.float16).t()
428
+
429
+ def compiledtrimul(
430
+ x: torch.Tensor,
431
+ mask: torch.Tensor,
432
+ norm_weight: torch.Tensor,
433
+ norm_bias: torch.Tensor,
434
+ w_concat: torch.Tensor,
435
+ to_out_norm_weight: torch.Tensor,
436
+ to_out_norm_bias: torch.Tensor,
437
+ to_out_weight: torch.Tensor,
438
+ h: int
439
+ ) -> torch.Tensor:
440
+ """
441
+ A barebones, compiled PyTorch function for the TriMul logic.
442
+ """
443
+ bs, s1, s2, d = x.shape
444
+
445
+ # Initial LayerNorm
446
+ x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
447
+ # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
448
+ all_projections = torch.mm(x_norm, w_concat)
449
+
450
+ # Split back into individual projections
451
+ left, right, lg, rg, og = all_projections.chunk(5, dim=1)
452
+
453
+ # Apply mask and gates
454
+ mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
455
+ left = left * mask_expanded * torch.sigmoid(lg)
456
+ right = right * mask_expanded * torch.sigmoid(rg)
457
+ out_gate = torch.sigmoid(og)
458
+
459
+ # Reshape for einsum
460
+ left = left.view(bs, s1, s2, h).permute(0,3,1,2)
461
+ right = right.view(bs, s1, s2, h).permute(0,3,1,2)
462
+ out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
463
+ out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
464
+
465
+ # Apply layer norm and final gating
466
+ normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
467
+ gated = normed * out_gate
468
+
469
+ # Final projection
470
+ final_out_flat = gated @ to_out_weight.t()
471
+ final_out = final_out_flat.view(bs, s1, s2, d)
472
+
473
+ return final_out
474
+
475
def small_kernel_pt_path(data):
    """Prepare fused weights and dispatch to the pure-PyTorch TriMul path."""
    input_tensor, mask, weights, config = data
    # Fuse the five projection matrices into one [d, 5h] operand so the
    # forward pass needs a single matmul.
    projection_order = (
        'left_proj.weight',
        'right_proj.weight',
        'left_gate.weight',
        'right_gate.weight',
        'out_gate.weight',
    )
    w_concat = (
        torch.cat([weights[name] for name in projection_order], dim=0)
        .t()
        .contiguous()
        .to(torch.float16)
    )
    # Call the compiled function with prepared weights
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
497
+
498
def kernel_mi300(data):
    """TriMul entry point: PyTorch path for short sequences, the fused
    three-kernel Triton pipeline otherwise."""
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # For short sequences the Triton launch overhead is not worth it.
    if s1 < 100:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    rows = bs * s1 * s2

    # Broadcast the (bs, s1, s2) mask across the hidden dim for the kernel.
    # TODO: move this expansion into the kernel itself.
    mask_mh = (
        mask.unsqueeze(-1)
        .expand(-1, -1, -1, hidden)
        .reshape(rows, hidden)
        .to(torch.float16)
    )

    return compiledtrimul_fused_interleaved(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_mh,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=pack_w_4way_efficient(weights),
        W_og=get_w_og(weights),
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-rocm/triton_a100.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
# Set PyTorch flags for performance:
# allow TF32 tensor-core math for fp32 matmuls and reduced-precision
# accumulation for fp16 matmuls (trades a little accuracy for speed).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
9
+
10
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (now passed as arguments from the host)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    """First TriMul stage, fully fused: LayerNorm over K, a [M, K] @ [K, 4H]
    matmul against the interleaved 4-way weights (L, LG, R, RG per hidden
    channel), the [M, K] @ [K, H] out-gate matmul, sigmoid gating and
    masking — writing the left operand as (bs, H, s1, s2), the right
    operand pre-transposed as (bs, H, s2, s1), and the sigmoided out-gate
    as (M, H).
    """
    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # NOTE(review): these two statistics passes index columns WITHOUT
    # multiplying by stride_x_k, i.e. they assume x is contiguous in K
    # (stride_x_k == 1) — holds for the x_flat the launcher passes; confirm
    # for any new caller.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        # Normalize the x tile once and reuse it for BOTH matmuls below.
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Only the first H columns of the 4H grid also compute the out-gate
        # matmul, so the normalized tile is read from registers, not reloaded.
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    if pid_n * BLOCK_SIZE_N < H:
        # Store the out-gate already sigmoided; the final kernel consumes it as-is.
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # The 4H columns are interleaved per hidden channel (L, LG, R, RG);
    # reshape and select each role from the accumulator.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Sigmoid-gate and mask the projections in registers.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose the flat row index into (batch, s1, s2) to scatter into the
    # (bs, H, s1, s2) left / (bs, H, s2, s1) transposed-right layouts.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
135
+
136
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Batched pairwise product: for each (batch, hidden) pair, compute one
    (s1, s1) output tile of out[m, n] = sum_k left[m, k] * right_t[k, n],
    reducing over the shared s2 dimension. Grid axis 0 tiles the output;
    grid axis 1 enumerates the bs*H independent problems.
    """
    # Grouped tile ordering on axis 0 to improve L2 reuse across programs.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Axis 1 packs (batch, hidden); unpack it.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Reduce over s2 in BLOCK_SIZE_K chunks; masks handle the ragged tail.
    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
        accumulator += tl.dot(a, b)

    # Coalesced write to the standard (bs, H, s1, s1) output layout.
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
187
+
188
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Fused final TriMul stage: LayerNorm over the H channels of the
    (bs, H, s1, s1) pairwise product, out-gate multiply (Gate_ptr is
    already sigmoided), then the H -> D projection into (bs, s1, s1, D).

    NOTE(review): row indices are decomposed by s1*s1 while the launcher
    passes M = bs*s1*s2 — presumes s1 == s2; confirm against callers.
    """
    # Grouped tile ordering over the (M, D) output for L2 reuse.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose the flat row index into (batch, row, col) of the s1 x s1 grid.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # Single pass over H: accumulate sum and sum-of-squares for LayerNorm.
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # Second pass: normalize, gate, and project H -> D without materializing
    # the normalized activations in global memory.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]
        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate
        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # Scatter the tile into the (bs, s1, s1, D) output.
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
261
+
262
def compiledtrimul_fused_interleaved_final(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Run the three-kernel Triton TriMul pipeline with hardcoded A100
    launch configurations (no autotuning overhead).

    Kernel 1: fused LayerNorm + interleaved 4-way projection + out-gate.
    Kernel 2: batched (s1, s2) x (s2, s1) pairwise product per (batch, H).
    Kernel 3: fused output LayerNorm, out-gate multiply and H -> d projection.

    NOTE(review): kernel 3 is launched with M = bs*s1*s2 rows over a
    (bs, H, s1, s1) intermediate — presumes s1 == s2; confirm.
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # fp16 intermediates so the subsequent matmuls run on tensor cores.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # --- Kernel 1: Fused LN + Dual Matmul ---
    N_4way = 4 * H
    # Hardcoded A100 best config: M128-N128-K32-GM8-HC32-W8-S2
    config_k1 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
    grid_k1 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)

    fused_ln_dual_matmul_kernel[grid_k1](
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        M, H, K, s1, s2,
        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        LN_EPS=1e-5, **config_k1, num_warps=8, num_stages=2
    )

    # --- Kernel 2: Batched Matrix Multiplication ---
    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
    # Hardcoded A100 best config: M128-N64-K32-GM8-W4-S3
    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_k2 = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)

    bmm_coalesced_kernel[grid_k2](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        **config_k2, num_warps=4, num_stages=3
    )

    # --- Kernel 3: Fully Fused Final Stage ---
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
    # Hardcoded A100 best config: M32-N128-K32-GM8-W4-S3
    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_k3 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)

    fused_final_kernel[grid_k3](
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        M, H, d, s1,
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
    )
    return final_out
330
+
331
def pack_w_4way_efficient(weights):
    """Interleave the L, LG, R, RG weight matrices per hidden channel into
    one tight [K, 4*H] fp16 operand for the fused dual-matmul kernel."""
    wl = weights['left_proj.weight']
    wlg = weights['left_gate.weight']
    wr = weights['right_proj.weight']
    wrg = weights['right_gate.weight']
    h_dim, k_dim = wl.shape
    # Stacking on dim=1 yields (H, 4, K): for each hidden channel the four
    # roles sit side by side, matching the kernel's de-interleave step.
    interleaved = torch.stack((wl, wlg, wr, wrg), dim=1).reshape(4 * h_dim, k_dim)
    return interleaved.t().to(torch.float16)
337
+
338
def get_w_og(weights):
    """Return the out-gate weight matrix transposed to [K, H] in fp16."""
    out_gate_weight = weights['out_gate.weight']
    return out_gate_weight.to(torch.float16).t()
341
+
342
@torch.compile()
def compiledtrimul(
    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor, h: int
) -> torch.Tensor:
    """Pure-PyTorch TriMul forward (torch.compile'd), used for small inputs.

    All five projections are fused into the single matmul against
    ``w_concat`` ([d, 5h]; column blocks: left, right, left-gate,
    right-gate, out-gate — the order built by small_kernel_pt_path).

    NOTE(review): the pairwise product yields bs*s1*s1 rows while the
    out-gate has bs*s1*s2 rows, so this path presumes s1 == s2 — confirm.
    """
    bs, s1, s2, d = x.shape
    # Input LayerNorm in fp32, then fp16 for the big matmuls.
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    all_projections = torch.mm(x_norm, w_concat)
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)
    # (bs, s1, s2, h) -> (bs, h, s1, s2) so the pairwise product is a bmm.
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
    # Output LayerNorm, out-gate, and the final h -> d projection.
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate
    final_out_flat = gated @ to_out_weight.t()
    return final_out_flat.view(bs, s1, s1, d)
364
+
365
def small_kernel_pt_path(data):
    """Build the fused [d, 5h] weight and run the pure-PyTorch TriMul path."""
    input_tensor, mask, weights, config = data
    # Concatenate the five projections so the forward pass needs one matmul;
    # order must match the chunk() in compiledtrimul.
    projection_order = (
        'left_proj.weight',
        'right_proj.weight',
        'left_gate.weight',
        'right_gate.weight',
        'out_gate.weight',
    )
    w_concat = (
        torch.cat([weights[name] for name in projection_order], dim=0)
        .t()
        .contiguous()
        .to(torch.float16)
    )
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
380
+
381
def kernel_a100(data):
    """TriMul entry point with A100-tuned configs: PyTorch path for short
    sequences, the fused three-kernel Triton pipeline otherwise."""
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # Adjusted threshold based on observed BMM configs: below this, launch
    # overhead outweighs the Triton pipeline's benefit.
    if s1 < 512:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    rows = bs * s1 * s2
    # Broadcast the (bs, s1, s2) mask across the hidden dim for the kernel.
    mask_mh = (
        mask.unsqueeze(-1)
        .expand(-1, -1, -1, hidden)
        .reshape(rows, hidden)
        .to(torch.float16)
    )

    return compiledtrimul_fused_interleaved_final(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_mh,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=pack_w_4way_efficient(weights),
        W_og=get_w_og(weights),
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-rocm/triton_b200.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
# Enable TF32 tensor-core matmuls and reduced-precision fp16 accumulation:
# trades a little numerical precision for substantially faster GEMMs.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (now passed as arguments from the host)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    """Fused input LayerNorm + interleaved 4-way projection (+ out-gate).

    Per M-row tile this kernel:
      1. computes LayerNorm statistics (mean/rstd) over the K features,
      2. multiplies the normalized rows by W_4way ([K, 4*H] with
         left_proj/left_gate/right_proj/right_gate interleaved per hidden
         channel — see pack_w_4way_efficient),
      3. for tiles whose N offset falls inside the first H columns, also
         multiplies by W_og and stores sigmoid(out_gate) into OutOG,
      4. de-interleaves the 4-way result, applies sigmoid gating and the
         pairwise mask, and scatters left/right activations into the
         layouts the BMM stage reads (right is written pre-transposed).
    """
    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    # Grouped launch order: improves L2 reuse of X tiles across programs.
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    # NOTE(review): the two statistics loops below address X without
    # multiplying by stride_x_k (unlike the matmul loop), i.e. they assume
    # the K dimension is contiguous — confirm callers always pass a
    # contiguous x_flat.
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # First pass over K: row means.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    # Second pass over K: row variances of the centered values.
    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        # Normalize the X tile on the fly, then cast to fp16 for tensor cores.
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        #Some threads should calclate out_gate
        # (only programs whose N tile starts inside the first H columns
        # reuse the normalized X tile for the out-gate GEMM).
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Store sigmoid(out_gate) for the final stage.
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # Columns are interleaved (lp, lg, rp, rg) per hidden channel; reshape
    # to (M, H_chunk, 4) and select each role with a masked reduction.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Sigmoid gating + pairwise mask, as in the PyTorch reference path.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose the flat row index m into (batch, s1, s2) coordinates.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    # Left goes out in its natural layout; right is scattered transposed
    # (s2 offset paired with the s2 stride of the transposed buffer).
    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
135
+
136
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Batched matmul Out[b, h] = Left[b, h] @ Right[b, h], contracting s2.

    Grid axis 0 tiles the (s1 x s1) output; axis 1 enumerates the bs * H
    (batch, hidden-channel) pairs. Right is addressed through its own
    (s2, s1) strides, so the transpose is folded into the pointer math
    rather than materialized.
    """
    # Grid and program IDs
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    # Grouped tile ordering for L2 locality.
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Second grid axis selects the (batch, hidden-channel) pair.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # Accumulate in fp32 even though operands are fp16.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # Write the output tile to the standard (bs, H, s1, s1) layout.
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
192
+
193
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Fused final stage: LayerNorm over H, out-gate multiply, project to D.

    Each flat row index m maps to a (b, r, c) cell of the BMM output; the
    kernel normalizes that cell's H-vector with single-pass moment
    statistics, multiplies by the precomputed sigmoid out-gate, and
    projects down to D output channels in one GEMM loop.
    """
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    # Grouped tile ordering, same scheme as the other kernels.
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose the flat row index into (batch, row, col) of the s1 x s1 grid.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # Single-pass LayerNorm statistics: accumulate E[x] and E[x^2].
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    mean = sum_x / H
    # var = E[x^2] - E[x]^2 (numerically less stable than two-pass, but
    # matches the existing kernel's behavior).
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # Normalize, gate, and project in a single K loop.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]
        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate
        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # Scatter the D-channel result back to the (bs, s1, s1, D) layout.
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
266
+
267
def compiledtrimul_fused_interleaved_final(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Three-kernel Triton pipeline for large problem sizes.

    Kernel 1 fuses the input LayerNorm with the interleaved 4-way
    projection (plus out-gate), kernel 2 runs the per-(batch, channel)
    (s1 x s2) @ (s2 x s1) batched matmul, and kernel 3 fuses the output
    LayerNorm, gating, and final projection. Block configurations are
    hard-coded to the best autotuned settings for this architecture.
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # Intermediates; the right operand is produced already transposed so
    # the BMM kernel reads both operands efficiently.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # --- Kernel 1: Fused LN + Dual Matmul ---
    # The grid is launched for the larger 4*H problem
    N_4way = 4 * H
    # Hardcoded best config from logs: M64-N128-K64-GM8-HC32-W4-S2
    config_k1 = {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)

    fused_ln_dual_matmul_kernel[grid](
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        M, H, K, s1, s2,
        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        LN_EPS=1e-5, **config_k1, num_warps=4, num_stages=2
    )

    # --- Kernel 2: Batched Matrix Multiplication ---
    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
    # Hardcoded best config from logs: M128-N128-K32-GM8-W8-S3
    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)

    bmm_coalesced_kernel[grid_bmm](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        **config_k2, num_warps=8, num_stages=3
    )

    # --- Kernel 3: Fully Fused Final Stage ---
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
    # Hardcoded best config from logs: M32-N128-K32-GM8-W4-S3
    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)

    fused_final_kernel[grid_final](
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        M, H, d, s1,
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
    )
    return final_out
336
+
337
def pack_w_4way_efficient(weights):
    """Pack left/right projection and gate weights into one tight [K, 4*H]
    fp16 matrix, with the four roles interleaved per hidden channel."""
    wl = weights['left_proj.weight']
    wlg = weights['left_gate.weight']
    wr = weights['right_proj.weight']
    wrg = weights['right_gate.weight']
    hidden, in_dim = wl.shape
    # stack(dim=1) yields (H, 4, K) directly, so the reshape interleaves
    # the four roles fastest along the output-column axis.
    interleaved = torch.stack((wl, wlg, wr, wrg), dim=1).reshape(4 * hidden, in_dim)
    return interleaved.t().to(torch.float16)
343
+
344
def get_w_og(weights):
    """Return the out-gate weight transposed to [K, H] in fp16."""
    out_gate_w = weights['out_gate.weight']
    return out_gate_w.transpose(0, 1).to(torch.float16)
347
+
348
@torch.compile()
def compiledtrimul(
    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor, h: int
) -> torch.Tensor:
    """Reference TriMul implementation in plain PyTorch ops (torch.compile'd).

    w_concat is the transposed concatenation of the five projection
    weights in the order left_proj / right_proj / left_gate / right_gate /
    out_gate, so a single GEMM computes all projections; chunk(5, dim=1)
    splits them back in the same order.
    """
    bs, s1, s2, d = x.shape
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    all_projections = torch.mm(x_norm, w_concat)
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
    # mask arrives as (bs, s1, s2, 1); broadcast it across hidden channels.
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)
    # Move hidden forward so the pair interaction is a batched matmul:
    # (bs, h, s1, s2) @ (bs, h, s2, s1) -> (bs, h, s1, s1).
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate
    final_out_flat = gated @ to_out_weight.t()
    return final_out_flat.view(bs, s1, s1, d)
370
+
371
def small_kernel_pt_path(data):
    """Fallback path for small problem sizes.

    Builds the concatenated fp16 projection matrix once and hands off to
    the torch.compile'd reference implementation.
    """
    input_tensor, mask, weights, config = data
    # Key order must match the chunk(5, dim=1) split in compiledtrimul.
    projection_keys = (
        'left_proj.weight', 'right_proj.weight', 'left_gate.weight',
        'right_gate.weight', 'out_gate.weight',
    )
    packed = torch.cat([weights[name] for name in projection_keys], dim=0)
    w_concat = packed.t().contiguous().to(torch.float16)
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
386
+
387
def kernel_b200(data):
    """B200 entry point.

    Sequence lengths below the tuned threshold use the compiled PyTorch
    path; larger ones use the fused three-kernel Triton pipeline.
    """
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    if s1 < 800:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    rows = bs * s1 * s2
    # Pre-broadcast the pairwise mask to one fp16 row per (b, i, j) position.
    mask_rows = (
        mask.unsqueeze(-1)
        .expand(-1, -1, -1, hidden)
        .reshape(rows, hidden)
        .to(torch.float16)
    )

    return compiledtrimul_fused_interleaved_final(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_rows,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=pack_w_4way_efficient(weights),
        W_og=get_w_og(weights),
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-rocm/triton_h100.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
# Enable TF32 tensor-core matmuls and reduced-precision fp16 accumulation:
# trades a little numerical precision for substantially faster GEMMs.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),

        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=5),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=5),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=4),
    ],
    key=['M', 'N', 'K'],
)
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (from decorator and kwargs)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    """Autotuned fused input LayerNorm + interleaved 4-way projection.

    Identical algorithm to the hard-coded variant: per M-row tile,
    compute LayerNorm statistics, multiply the normalized rows by the
    interleaved [K, 4*H] weight (plus the out-gate weight for tiles in
    the first H columns), then de-interleave, gate, mask and scatter the
    left/right activations (right is written pre-transposed).
    """
    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    # Grouped launch order for L2 reuse of X tiles.
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    # NOTE(review): the statistics loops below address X without
    # multiplying by stride_x_k (unlike the matmul loop) — they assume a
    # contiguous K dimension; confirm callers always pass contiguous x_flat.
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # First pass over K: row means.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    # Second pass over K: row variances of centered values.
    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        # Normalize the X tile on the fly, then cast to fp16 for tensor cores.
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        #Some threads should calclate out_gate
        # (programs whose N tile starts inside the first H columns reuse
        # the normalized X tile for the out-gate GEMM).
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Store sigmoid(out_gate) for the final stage.
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # Columns interleave (lp, lg, rp, rg) per hidden channel; reshape to
    # (M, H_chunk, 4) and select each role with a masked reduction.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Sigmoid gating + pairwise mask, as in the PyTorch reference path.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose the flat row index m into (batch, s1, s2) coordinates.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)  # s2 offset uses s2 stride, s1 offset uses s1 stride
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
155
+
156
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
    ],
    key=['s1', 's2', 'H'],
)
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Autotuned batched matmul Out[b, h] = Left[b, h] @ Right[b, h].

    Grid axis 0 tiles the (s1 x s1) output; axis 1 enumerates the bs * H
    (batch, hidden-channel) pairs. Right is addressed through its own
    (s2, s1) strides so the transpose is folded into the pointer math.
    """
    # Grid and program IDs
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    # Grouped tile ordering for L2 locality.
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Second grid axis selects the (batch, hidden-channel) pair.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # Accumulate in fp32 even though operands are fp16.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # --- Coalesced Write ---
    # Write to a standard (bs, H, s1, s1) layout
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
225
+
226
@torch.compile
def torch_pt2(left_final, right_final_t, bs, s1, s2, d, h, to_out_norm_weight, to_out_norm_bias, og_mh, to_out_weight):
    """Second half of the TriMul pipeline, fused by torch.compile.

    Batch-multiplies the per-head left/right activations, layer-norms the
    result over the hidden dim, gates it with the precomputed sigmoid
    out-gate ``og_mh``, and projects back to the model dimension ``d``.

    NOTE(review): relies on ``F`` (presumably torch.nn.functional) being
    imported earlier in this module — confirm against the file header.
    """
    # (bs, h, s1, s2) @ (bs, h, s2, s1) -> (bs, h, s1, s1)
    bmm_out = torch.matmul(left_final, right_final_t)
    # Move the head dim innermost and flatten to rows of size h.
    out_einsum_flat = bmm_out.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)
    # Apply layer norm and final gating
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * og_mh

    # Final projection
    final_out_flat = gated @ to_out_weight.t()
    final_out = final_out_flat.view(bs, s1, s2, d)
    return final_out
238
+
239
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
    ],
    key=['H', 'D'],
)
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,  # M_gate = bs*s1*s2
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Fused epilogue: LayerNorm over H, elementwise gate, then project H -> D.

    Each program computes a (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of the output,
    where a "row" m is one (batch, r, c) position decoded from a flat index.
    Pass 1 accumulates layer-norm statistics over H; pass 2 re-reads the
    input, normalizes, applies the gate, and matmuls against ProjW.

    NOTE(review): this kernel is not referenced by the call paths visible in
    this chunk — verify it is still wired up before relying on it.
    """
    # --- Grid and PID Setup for Matmul (grouped/swizzled tile order) ---
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose the flat row index back to (b, r, c) for strided addressing.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # --- Pass 1: accumulate sum and sum-of-squares for LayerNorm stats ---
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)

        # Accumulate sum and sum of squares in one pass
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    # Finalize statistics (biased variance, as in F.layer_norm).
    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # --- Pass 2: fused normalize + gate + matmul against ProjW ---
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        # Re-read the input chunk (second pass over global memory).
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]

        # Affine LayerNorm parameters for this H-chunk.
        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]

        # Projection weight tile, addressed as (K, N) via its two strides.
        proj_ptrs = ProjW_ptr + \
            offs_n[None, :] * stride_proj_d + \
            offs_k[:, None] * stride_proj_h

        # Per-row, per-channel gate (precomputed sigmoid values).
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate

        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        # Cast the gated activations to the weight dtype so tl.dot can use
        # the fast mixed-precision path.
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # --- Store Final Output ---
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d

    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
339
+
340
def compiledtrimul_fused_interleaved(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,  # Use the new weight matrices
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Large-problem TriMul path: one fused Triton launch plus a compiled epilogue.

    Launches ``fused_ln_dual_matmul_kernel`` (defined elsewhere in this
    module) to produce the gated/masked left and right activations and the
    sigmoid out-gate in fp16, then hands off to :func:`torch_pt2` for the
    batched matmul, output LayerNorm, gating, and final projection.

    NOTE(review): the positional launch arguments below must match the
    kernel's signature exactly — verify against its definition when editing.
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # Kernel outputs, pre-laid-out as (bs, H, s1, s2) / transposed so the
    # following batched matmul reads them without extra copies.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # The grid is launched for the larger 4*H problem
    N_4way = 4 * H
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
    fused_ln_dual_matmul_kernel[grid](
        # Pointers (9)
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        # Metadata (5) - M, H, K, s1, s2
        M, H, K, s1, s2,
        # Strides (16)
        x_flat.stride(0), x_flat.stride(1),
        W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1),
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        # Constexpr (1)
        LN_EPS=1e-5
    )
    # Epilogue: bmm + LayerNorm + out-gate + final projection.
    return torch_pt2(
        left_final, right_final_t,
        bs=bs,
        s1=s1,
        s2=s2,
        d=d,
        h=h,
        to_out_norm_weight=to_out_norm_weight,
        to_out_norm_bias=to_out_norm_bias,
        og_mh=og_mh,
        to_out_weight=to_out_weight
    )
392
+
393
def pack_w_4way_efficient(weights):
    """Interleave the four projection/gate weights into one [K, 4*H] fp16 matrix.

    Rows are interleaved per hidden unit: for each h, the rows
    (left_proj[h], left_gate[h], right_proj[h], right_gate[h]) are adjacent
    before the final transpose, so a single matmul yields all four outputs.
    """
    mats = [
        weights['left_proj.weight'],
        weights['left_gate.weight'],
        weights['right_proj.weight'],
        weights['right_gate.weight'],
    ]
    hidden, in_dim = mats[0].shape
    packed = torch.stack(mats, dim=1)             # (H, 4, K)
    packed = packed.reshape(4 * hidden, in_dim)   # interleaved rows
    return packed.t().to(torch.float16)
403
+
404
def get_w_og(weights):
    """Return the out-gate weight transposed to [K, H], cast to fp16."""
    return weights['out_gate.weight'].t().to(torch.float16)
408
+
409
+
410
# Allow TF32 tensor-core math and reduced-precision fp16 accumulation in
# cuBLAS matmuls: trades a little accuracy for substantially faster GEMMs.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
412
+
413
@torch.compile
def compiledtrimul(
    x: torch.Tensor,
    mask: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    w_concat: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int
) -> torch.Tensor:
    """
    A barebones, compiled PyTorch function for the TriMul logic.

    All five input projections (left/right proj, left/right/out gates) are
    fused into a single matmul against ``w_concat`` of shape [d, 5*h];
    columns are split back with chunk(5). Mask shape is [bs, s1, s2, 1].

    NOTE(review): relies on ``F`` (presumably torch.nn.functional) being
    imported earlier in this module — confirm against the file header.
    """
    bs, s1, s2, d = x.shape

    # Initial LayerNorm
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
    all_projections = torch.mm(x_norm, w_concat)

    # Split back into individual projections
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)

    # Apply mask and gates
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)

    # Reshape for einsum: move the hidden dim to axis 1 so the pairwise
    # interaction is a plain batched matmul over (s1, s2) x (s2, s1).
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)

    # Apply layer norm and final gating
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate

    # Final projection
    final_out_flat = gated @ to_out_weight.t()
    final_out = final_out_flat.view(bs, s1, s2, d)

    return final_out
459
+
460
def small_kernel_pt_path(data):
    """Small-problem path: run the fully fused torch.compile implementation.

    Concatenates the five projection weights into a single [d, 5*h] fp16
    matrix (column order: left_proj, right_proj, left_gate, right_gate,
    out_gate — this order must match compiledtrimul's chunk(5) split).
    """
    input_tensor, mask, weights, config = data

    projection_names = [
        'left_proj.weight',
        'right_proj.weight',
        'left_gate.weight',
        'right_gate.weight',
        'out_gate.weight',
    ]
    stacked = torch.cat([weights[name] for name in projection_names], dim=0)
    w_concat = stacked.t().contiguous().to(torch.float16)

    # Call the compiled function with prepared weights.
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float32),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float32),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
482
+
483
def kernel_h100(data):
    """H100 entry point: fused Triton path for long sequences, compiled PyTorch otherwise."""
    input_tensor, mask, weights, config = data
    batch, seq1, seq2, model_dim = input_tensor.shape

    # Short sequences: the fully compiled PyTorch path is faster.
    if seq1 <= 512:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]

    packed_w = pack_w_4way_efficient(weights)
    out_gate_w = get_w_og(weights)

    rows = batch * seq1 * seq2
    # Pre-broadcast the mask to one fp16 value per (row, hidden) entry
    # (could be folded into the kernel instead).
    mask_rows = mask.unsqueeze(-1).expand(-1, -1, -1, hidden).reshape(rows, hidden).to(torch.float16)

    return compiledtrimul_fused_interleaved(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_rows,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=packed_w,
        W_og=out_gate_w,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-xpu/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .triton_a100 import kernel_a100
2
+ from .triton_h100 import kernel_h100
3
+ from .triton_b200 import kernel_b200
4
+ from .trimul_mi300 import kernel_mi300
5
+ from .trimul_global import kernel_global
6
+
7
+ __all__ = ["kernel_a100", "kernel_h100", "kernel_b200", "kernel_mi300", "kernel_global"]
build/torch-xpu/_ops.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import torch

# Lazily-resolved namespace holding this extension's registered custom ops.
ops = torch.ops._trimul_gpumode_176b4e4


def add_op_namespace_prefix(op_name: str):
    """Qualify *op_name* with this extension's op namespace."""
    return "::".join(("_trimul_gpumode_176b4e4", op_name))
build/torch-xpu/metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"python-depends":[]}
build/torch-xpu/task.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Type definitions for TriMul task.
3
+
4
+ Input: Tuple of (input_tensor, mask, weights, config)
5
+ - input_tensor: Input tensor of shape [batch_size, seq_len, seq_len, dim]
6
+ - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
7
+ - weights: Dictionary containing model weights
8
+ - config: Dictionary containing model configuration parameters
9
+
10
+ Output: Output tensor of shape [batch_size, seq_len, seq_len, dim]
11
+ """
12
+
13
+ import torch
14
+ from typing import Tuple, Dict, Any
15
+
16
+ # Input type: (input_tensor, mask, weights, config)
17
+ input_t = Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], Dict[str, Any]]
18
+
19
+ # Output type: output tensor
20
+ output_t = torch.Tensor
build/torch-xpu/trimul_global.py ADDED
@@ -0,0 +1,971 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from utils import make_match_reference, DisableCuDNNTF32
2
+ from .task import input_t, output_t
3
+
4
+ import torch
5
+ from torch import nn, einsum
6
+ import math
7
+ import os
8
+ import requests
9
+
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ # The flag below controls whether to allow TF32 on matmul. This flag defaults to False
14
+ # in PyTorch 1.12 and later.
15
+ torch.backends.cuda.matmul.allow_tf32 = True
16
+
17
+ # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
18
+ torch.backends.cudnn.allow_tf32 = True
19
+
20
+ # Set allocator for TMA descriptors (required for on-device TMA)
21
+ def alloc_fn(size: int, alignment: int, stream=None):
22
+ return torch.empty(size, device="cuda", dtype=torch.int8)
23
+
24
+ triton.set_allocator(alloc_fn)
25
+
26
+ # os.environ['TRITON_PRINT_AUTOTUNING'] = '1'
27
+ # os.environ['MLIR_ENABLE_DIAGNOSTICS'] = 'warnings,remarks'
28
+
29
+ # Reference code in PyTorch
30
+ class TriMul(nn.Module):
31
+ # Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
32
+ def __init__(
33
+ self,
34
+ dim: int,
35
+ hidden_dim: int,
36
+ ):
37
+ super().__init__()
38
+
39
+ self.norm = nn.LayerNorm(dim)
40
+
41
+ self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
42
+ self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
43
+
44
+ self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
45
+ self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
46
+ self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
47
+
48
+ self.to_out_norm = nn.LayerNorm(hidden_dim)
49
+ self.to_out = nn.Linear(hidden_dim, dim, bias=False)
50
+
51
+ def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
52
+ """
53
+ x: [bs, seq_len, seq_len, dim]
54
+ mask: [bs, seq_len, seq_len]
55
+
56
+ Returns:
57
+ output: [bs, seq_len, seq_len, dim]
58
+ """
59
+ batch_size, seq_len, _, dim = x.shape
60
+
61
+ x = self.norm(x)
62
+
63
+ left = self.left_proj(x)
64
+ right = self.right_proj(x)
65
+
66
+ mask = mask.unsqueeze(-1)
67
+ left = left * mask
68
+ right = right * mask
69
+
70
+ left_gate = self.left_gate(x).sigmoid()
71
+ right_gate = self.right_gate(x).sigmoid()
72
+ out_gate = self.out_gate(x).sigmoid()
73
+
74
+ left = left * left_gate
75
+ right = right * right_gate
76
+
77
+ out = einsum('... i k d, ... j k d -> ... i j d', left, right)
78
+ # This einsum is the same as the following:
79
+ # out = torch.zeros(batch_size, seq_len, seq_len, dim, device=x.device)
80
+
81
+ # # Compute using nested loops
82
+ # for b in range(batch_size):
83
+ # for i in range(seq_len):
84
+ # for j in range(seq_len):
85
+ # # Compute each output element
86
+ # for k in range(seq_len):
87
+ # out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]
88
+
89
+ out = self.to_out_norm(out)
90
+ out = out * out_gate
91
+ return self.to_out(out)
92
+
93
@triton.jit
def triton_sigmoid(x):
    """
    Compute sigmoid function: 1 / (1 + exp(-x))

    Stable for large positive x (exp(-x) underflows to 0, result -> 1);
    for very negative x, exp(-x) overflows to inf and the result is 0.
    """
    return 1.0 / (1.0 + tl.exp(-x))
99
+
100
def two_mm_kernel_configs_wrapper():
    """Select a per-GPU-architecture autotune config generator for two_mm_kernel.

    Branches on torch.cuda.get_device_capability() at import time and returns
    a zero-argument function producing a list of triton.Config candidates.
    """
    if torch.cuda.get_device_capability() == (12, 0):
        # Blackwell consumer (sm_120): small-tile sweep.
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [16, 32]:
                for BLOCK_N in [16, 32, 64]:
                    for BLOCK_K in [16, 32, 64]:
                        for num_stages in [2, 3]:
                            configs.append(triton.Config({
                                'BLOCK_M': BLOCK_M,
                                'BLOCK_N': BLOCK_N,
                                'BLOCK_K': BLOCK_K,
                                'GROUP_SIZE_M': 8
                            }, num_stages=num_stages, num_warps=8))
            return configs

    elif torch.cuda.get_device_capability()[0] == 9:
        # Hopper (H100): hand-measured (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps)
        # per (batch, seq_len, dim).
        # NOTE(review): this lookup helper is defined but never returned or
        # called in this module — it only documents the measurements.
        def get_optimal_two_mm_config_h100(B, seq_len, dim):
            configs = {
                (1, 128, 128): (128, 64, 128, 2, 8),
                (1, 128, 256): (128, 64, 128, 2, 8),
                (1, 128, 384): (128, 64, 64, 3, 8),
                (1, 128, 512): (128, 64, 64, 3, 8),
                (1, 128, 768): (128, 64, 64, 3, 8),
                (1, 128, 1024): (128, 64, 64, 3, 8),
                (1, 256, 128): (128, 64, 128, 2, 8),
                (1, 256, 256): (128, 64, 128, 2, 8),
                (1, 256, 384): (128, 64, 64, 3, 8),
                (1, 256, 512): (128, 64, 64, 3, 8),
                (1, 256, 768): (128, 64, 64, 3, 8),
                (1, 256, 1024): (128, 64, 64, 3, 8),
                (1, 512, 128): (128, 64, 128, 2, 8),
                (1, 512, 256): (128, 64, 128, 2, 8),
                (1, 512, 384): (128, 64, 128, 2, 8),
                (1, 512, 512): (128, 64, 128, 2, 8),
                (1, 512, 768): (128, 64, 64, 3, 8),
                (1, 512, 1024): (128, 64, 64, 3, 8),
                (1, 1024, 128): (128, 64, 128, 2, 8),
                (1, 1024, 256): (128, 64, 64, 2, 8),
                (1, 1024, 384): (128, 64, 128, 2, 8),
                (1, 1024, 512): (128, 64, 128, 2, 8),
                (1, 1024, 768): (128, 64, 128, 2, 8),
                (1, 1024, 1024): (128, 64, 128, 2, 8),
                (2, 128, 128): (128, 64, 128, 2, 8),
                (2, 128, 256): (128, 64, 128, 2, 8),
                (2, 128, 384): (128, 64, 64, 3, 8),
                (2, 128, 512): (128, 64, 64, 3, 8),
                (2, 128, 768): (128, 64, 64, 3, 8),
                (2, 128, 1024): (128, 64, 64, 3, 8),
                (2, 256, 128): (128, 64, 128, 2, 8),
                (2, 256, 256): (128, 64, 128, 2, 8),
                (2, 256, 384): (128, 64, 128, 2, 8),
                (2, 256, 512): (128, 64, 128, 2, 8),
                (2, 256, 768): (128, 64, 64, 3, 8),
                (2, 256, 1024): (128, 64, 64, 3, 8),
                (2, 512, 128): (128, 64, 128, 2, 8),
                (2, 512, 256): (128, 64, 128, 2, 8),
                (2, 512, 384): (128, 64, 128, 2, 8),
                (2, 512, 512): (128, 64, 128, 2, 8),
                (2, 512, 768): (128, 64, 128, 2, 8),
                (2, 512, 1024): (128, 64, 128, 2, 8),
                (2, 1024, 128): (128, 64, 128, 2, 8),
                (2, 1024, 256): (128, 64, 128, 2, 8),
                (2, 1024, 384): (128, 64, 128, 2, 8),
                (2, 1024, 512): (128, 64, 128, 2, 8),
                (2, 1024, 768): (128, 64, 128, 2, 8),
                (2, 1024, 1024): (128, 64, 128, 2, 8),
            }
            return configs.get((B, seq_len, dim), (64, 64, 32, 2, 8))  # default fallback

        def two_mm_kernel_configs():
            # This function is kept for compatibility but will be overridden for H100
            return [
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
            ]

    # NOTE(review): the `and False` makes this Blackwell (sm_100) branch
    # deliberately unreachable; sm_100 currently falls through to `else`.
    elif torch.cuda.get_device_capability()[0] == 10 and False:
        def get_optimal_two_mm_config(B, seq_len, dim):
            configs = {
                (1, 128, 128): (64, 128, 64, 2, 8),
                (1, 128, 256): (128, 64, 128, 2, 8),
                (1, 128, 384): (128, 64, 128, 2, 8),
                (1, 128, 512): (128, 64, 128, 2, 8),
                (1, 128, 768): (128, 64, 64, 3, 8),
                (1, 128, 1024): (128, 64, 64, 3, 8),
                (1, 256, 128): (128, 64, 128, 2, 8),
                (1, 256, 256): (128, 64, 128, 2, 8),
                (1, 256, 384): (128, 64, 128, 2, 8),
                (1, 256, 512): (128, 64, 64, 3, 8),
                (1, 256, 768): (128, 64, 64, 3, 8),
                (1, 256, 1024): (128, 64, 64, 3, 8),
                (1, 512, 128): (128, 64, 128, 2, 8),
                (1, 512, 256): (128, 64, 128, 2, 8),
                (1, 512, 384): (128, 64, 128, 2, 8),
                (1, 512, 512): (128, 64, 128, 2, 8),
                (1, 512, 768): (128, 64, 64, 3, 8),
                (1, 512, 1024): (128, 64, 64, 3, 8),
                (1, 1024, 128): (128, 64, 128, 2, 8),
                (1, 1024, 256): (128, 64, 128, 2, 8),
                (1, 1024, 384): (128, 64, 128, 2, 8),
                (1, 1024, 512): (128, 64, 128, 2, 8),
                (1, 1024, 768): (128, 64, 64, 3, 8),
                (1, 1024, 1024): (128, 64, 64, 3, 8),
                (2, 128, 128): (128, 64, 128, 2, 8),
                (2, 128, 256): (128, 64, 128, 2, 8),
                (2, 128, 384): (128, 64, 128, 2, 8),
                (2, 128, 512): (128, 64, 64, 3, 8),
                (2, 128, 768): (128, 64, 64, 3, 8),
                (2, 128, 1024): (128, 64, 64, 3, 8),
                (2, 256, 128): (128, 64, 128, 2, 8),
                (2, 256, 256): (128, 64, 128, 2, 8),
                (2, 256, 384): (128, 64, 128, 2, 8),
                (2, 256, 512): (128, 64, 64, 3, 8),
                (2, 256, 768): (128, 64, 64, 3, 8),
                (2, 256, 1024): (128, 64, 64, 3, 8),
                (2, 512, 128): (128, 64, 128, 2, 8),
                (2, 512, 256): (128, 64, 128, 2, 8),
                (2, 512, 384): (128, 64, 128, 2, 8),
                (2, 512, 512): (128, 64, 128, 2, 8),
                (2, 512, 768): (128, 64, 64, 3, 8),
                (2, 512, 1024): (128, 64, 64, 3, 8),
                (2, 1024, 128): (128, 64, 128, 2, 8),
                (2, 1024, 256): (128, 64, 128, 2, 8),
                (2, 1024, 384): (128, 64, 128, 2, 8),
                (2, 1024, 512): (128, 64, 128, 2, 8),
                (2, 1024, 768): (128, 64, 64, 3, 8),
                (2, 1024, 1024): (128, 64, 64, 3, 8),
            }
            return configs.get((B, seq_len, dim), (64, 64, 32, 2, 8))  # default fallback

        def two_mm_kernel_configs():
            # This function is kept for compatibility but will be overridden
            return [
                triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
            ]
    elif torch.cuda.get_device_capability()[0] == 8:
        # A100
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [64]:
                for BLOCK_N in [64, 128]:
                    for BLOCK_K in [16]:
                        for num_stages in [3, 4]:
                            for num_warps in [4, 8]:
                                configs.append(triton.Config({
                                    'BLOCK_M': BLOCK_M,
                                    'BLOCK_N': BLOCK_N,
                                    'BLOCK_K': BLOCK_K,
                                    'GROUP_SIZE_M': 8
                                }, num_stages=num_stages, num_warps=num_warps))
            return configs
    else:
        # Generic fallback sweep for any other architecture.
        def two_mm_kernel_configs():
            configs = []
            for BLOCK_M in [64, 128]:
                for BLOCK_N in [64, 128]:
                    for BLOCK_K in [64, 128]:
                        for num_stages in [2, 3]:
                            configs.append(triton.Config({
                                'BLOCK_M': BLOCK_M,
                                'BLOCK_N': BLOCK_N,
                                'BLOCK_K': BLOCK_K,
                                'GROUP_SIZE_M': 8
                            }, num_stages=num_stages, num_warps=8))
            return configs

    return two_mm_kernel_configs
271
+
272
+ def two_mm_kernel_wrapper():
273
+ if torch.cuda.get_device_capability()[0] == 8:
274
+ @triton.jit
275
+ def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
276
+ # Persistent kernel using standard tl.load operations
277
+ start_pid = tl.program_id(axis=0)
278
+ num_pid_m = tl.cdiv(M, BLOCK_M)
279
+ num_pid_n = tl.cdiv(N, BLOCK_N)
280
+ k_tiles = tl.cdiv(K, BLOCK_K)
281
+ num_tiles = num_pid_m * num_pid_n
282
+
283
+ # tile_id_c is used in the epilogue to break the dependency between
284
+ # the prologue and the epilogue
285
+ tile_id_c = start_pid - NUM_SMS
286
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
287
+
288
+ # Persistent loop over tiles
289
+ for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
290
+ # Calculate PID for this tile using improved swizzling
291
+ group_id = tile_id // num_pid_in_group
292
+ first_pid_m = group_id * GROUP_SIZE_M
293
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
294
+ pid_m = first_pid_m + (tile_id % group_size_m)
295
+ pid_n = (tile_id % num_pid_in_group) // group_size_m
296
+
297
+ # Calculate block offsets
298
+ offs_am = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
299
+ offs_bn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
300
+ offs_k = tl.arange(0, BLOCK_K)
301
+
302
+ # Initialize accumulators for all outputs
303
+ accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
304
+ accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
305
+ accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
306
+ accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
307
+ accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
308
+
309
+ # Main computation loop over K dimension
310
+ for ki in range(k_tiles):
311
+ k_start = ki * BLOCK_K
312
+ k_offsets = k_start + offs_k
313
+
314
+ # Create pointers for A matrix (2D flattened view)
315
+ a_ptrs = a_ptr + offs_am[:, None] * stride_a2 + k_offsets[None, :] * stride_a3
316
+ a_mask = (offs_am[:, None] < M) & (k_offsets[None, :] < K)
317
+
318
+ # Create pointers for B matrices [N, K] layout
319
+ b1_ptrs = b1_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
320
+ b2_ptrs = b2_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
321
+ b3_ptrs = b3_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
322
+ b4_ptrs = b4_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
323
+ b5_ptrs = b5_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
324
+ b_mask = (offs_bn[:, None] < N) & (k_offsets[None, :] < K)
325
+
326
+ # Load blocks from A and all weight matrices using standard tl.load
327
+ a = tl.load(a_ptrs, mask=a_mask, other=0.0)
328
+ b1 = tl.load(b1_ptrs, mask=b_mask, other=0.0)
329
+ b2 = tl.load(b2_ptrs, mask=b_mask, other=0.0)
330
+ b3 = tl.load(b3_ptrs, mask=b_mask, other=0.0)
331
+ b4 = tl.load(b4_ptrs, mask=b_mask, other=0.0)
332
+ b5 = tl.load(b5_ptrs, mask=b_mask, other=0.0)
333
+
334
+ # Perform matrix multiplications using TF32
335
+ accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True) # A @ B1.T
336
+ accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True) # A @ B2.T
337
+ accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True) # A @ B3.T
338
+ accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True) # A @ B4.T
339
+ accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True) # A @ B5.T
340
+
341
+ # Store results using separate tile_id_c for epilogue
342
+ tile_id_c += NUM_SMS
343
+ group_id = tile_id_c // num_pid_in_group
344
+ first_pid_m = group_id * GROUP_SIZE_M
345
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
346
+ pid_m = first_pid_m + (tile_id_c % group_size_m)
347
+ pid_n = (tile_id_c % num_pid_in_group) // group_size_m
348
+
349
+ # Calculate output offsets and pointers
350
+ offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
351
+ offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
352
+
353
+ # Create masks for bounds checking
354
+ d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
355
+
356
+ # Calculate pointer addresses using 4D strides
357
+ stride_cm = stride_c2 # Stride to next element in flattened M dimension
358
+ stride_cn = stride_c3 # N is the innermost dimension
359
+
360
+ # For D tensor: use separate D strides
361
+ stride_dm = stride_d2 # Stride to next element in flattened M dimension
362
+ stride_dn = stride_d3 # N is the innermost dimension
363
+
364
+ off_c_batch = offs_cm // (seq_len * seq_len)
365
+ off_c_sl1 = (offs_cm // seq_len) % seq_len
366
+ off_c_sl2 = offs_cm % seq_len
367
+ off_c_dim = offs_cn
368
+
369
+ c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
370
+ c_mask = d_mask
371
+
372
+ c1_ptrs = c1_ptr + c_offsets
373
+ c2_ptrs = c2_ptr + c_offsets
374
+ d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]
375
+
376
+ mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))
377
+
378
+ # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
379
+ mask_2d = mask[:, None] # Convert to [BLOCK_M, 1] then broadcast
380
+ # Apply masking only to left_proj and right_proj results (C1, C2)
381
+ accumulator1 = tl.where(mask_2d, accumulator1, 0)
382
+ accumulator2 = tl.where(mask_2d, accumulator2, 0)
383
+
384
+ # Apply sigmoid to gate values
385
+ left_gate_sigmoid = triton_sigmoid(accumulator3)
386
+ right_gate_sigmoid = triton_sigmoid(accumulator4)
387
+ accumulator_d = triton_sigmoid(accumulator_d)
388
+
389
+ # Apply elementwise multiplication with gated values
390
+ # C1 = left * left_gate, C2 = right * right_gate
391
+ accumulator1 = accumulator1 * left_gate_sigmoid # left * left_gate
392
+ accumulator2 = accumulator2 * right_gate_sigmoid # right * right_gate
393
+
394
+ # Convert to appropriate output dtype and store with normal tl.store
395
+ c1 = accumulator1.to(c1_ptr.dtype.element_ty)
396
+ c2 = accumulator2.to(c2_ptr.dtype.element_ty)
397
+ d = accumulator_d.to(d_ptr.dtype.element_ty)
398
+
399
+ tl.store(c1_ptrs, c1, mask=c_mask)
400
+ tl.store(c2_ptrs, c2, mask=c_mask)
401
+ tl.store(d_ptrs, d, mask=d_mask)
402
+ else:
403
@triton.jit
def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
    """Persistent fused five-way matmul with gated epilogue.

    For each [BLOCK_M, K] row-block of A this kernel computes five GEMMs
    against the transposed [N, K] weights b1..b5 in a single K-loop, then
    fuses masking and sigmoid gating in the epilogue:

        c1 = where(mask, A @ b1.T, 0) * sigmoid(A @ b3.T)
        c2 = where(mask, A @ b2.T, 0) * sigmoid(A @ b4.T)
        d  = sigmoid(A @ b5.T)

    Each program walks the tile space with stride NUM_SMS (persistent-kernel
    style); A and the weights are read through on-device TMA tensor
    descriptors. c1/c2 are scattered through 4D (stride_c0..stride_c3)
    addressing after decomposing the flat row index into (batch, i, j);
    d is stored through plain 2D (stride_d2/stride_d3) addressing.
    NOTE(review): `mask` is loaded one value per output row and used directly
    as a tl.where predicate — presumably a 0/1 (boolean) mask; confirm against
    the host-side mask tensor.
    """
    # Persistent kernel using on-device TMA descriptors
    start_pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    num_pid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = num_pid_m * num_pid_n

    # Create on-device TMA descriptors (A is [M, K]; all weights are [N, K]
    # and share one stride pair, so every b-descriptor is built identically).
    a_desc = tl._experimental_make_tensor_descriptor(
        a_ptr,
        shape=[M, K],
        strides=[stride_a2, stride_a3],
        block_shape=[BLOCK_M, BLOCK_K],
    )
    b1_desc = tl._experimental_make_tensor_descriptor(
        b1_ptr,
        shape=[N, K],
        strides=[stride_bn, stride_bk],
        block_shape=[BLOCK_N, BLOCK_K],
    )
    b2_desc = tl._experimental_make_tensor_descriptor(
        b2_ptr,
        shape=[N, K],
        strides=[stride_bn, stride_bk],
        block_shape=[BLOCK_N, BLOCK_K],
    )
    b3_desc = tl._experimental_make_tensor_descriptor(
        b3_ptr,
        shape=[N, K],
        strides=[stride_bn, stride_bk],
        block_shape=[BLOCK_N, BLOCK_K],
    )
    b4_desc = tl._experimental_make_tensor_descriptor(
        b4_ptr,
        shape=[N, K],
        strides=[stride_bn, stride_bk],
        block_shape=[BLOCK_N, BLOCK_K],
    )
    b5_desc = tl._experimental_make_tensor_descriptor(
        b5_ptr,
        shape=[N, K],
        strides=[stride_bn, stride_bk],
        block_shape=[BLOCK_N, BLOCK_K],
    )

    # tile_id_c is used in the epilogue to break the dependency between
    # the prologue and the epilogue
    tile_id_c = start_pid - NUM_SMS
    num_pid_in_group = GROUP_SIZE_M * num_pid_n

    # Persistent loop over tiles
    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
        # Calculate PID for this tile using improved swizzling
        group_id = tile_id // num_pid_in_group
        first_pid_m = group_id * GROUP_SIZE_M
        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
        pid_m = first_pid_m + (tile_id % group_size_m)
        pid_n = (tile_id % num_pid_in_group) // group_size_m

        # Calculate block offsets
        offs_am = pid_m * BLOCK_M
        offs_bn = pid_n * BLOCK_N

        # Initialize accumulators for all outputs (fp32 accumulation)
        accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
        accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
        accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
        accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
        accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

        # Main computation loop over K dimension
        for ki in range(k_tiles):
            offs_k = ki * BLOCK_K
            # Load blocks from A and all weight matrices using on-device TMA
            a = a_desc.load([offs_am, offs_k])
            b1 = b1_desc.load([offs_bn, offs_k])
            b2 = b2_desc.load([offs_bn, offs_k])
            b3 = b3_desc.load([offs_bn, offs_k])
            b4 = b4_desc.load([offs_bn, offs_k])
            b5 = b5_desc.load([offs_bn, offs_k])

            # Perform matrix multiplications using TF32
            accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
            accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
            accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
            accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
            accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T

        # Store results using separate tile_id_c for epilogue
        tile_id_c += NUM_SMS
        group_id = tile_id_c // num_pid_in_group
        first_pid_m = group_id * GROUP_SIZE_M
        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
        pid_m = first_pid_m + (tile_id_c % group_size_m)
        pid_n = (tile_id_c % num_pid_in_group) // group_size_m

        # Calculate output offsets and pointers
        offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
        offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)

        # Create masks for bounds checking
        d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)

        # Calculate pointer addresses using 4D strides
        # For C tensors: compute effective 2D strides from 4D strides
        # Output tensor is [B, I, J, N], flattened to [M, N] where M = B*I*J
        stride_cm = stride_c2  # Stride to next element in flattened M dimension
        stride_cn = stride_c3  # N is the innermost dimension

        # For D tensor: use separate D strides
        stride_dm = stride_d2  # Stride to next element in flattened M dimension
        stride_dn = stride_d3  # N is the innermost dimension

        # Decompose the flat row index (b * seq_len^2 + i * seq_len + j)
        # back into (batch, i, j) for 4D scatter of c1/c2.
        off_c_batch = offs_cm // (seq_len * seq_len)
        off_c_sl1 = (offs_cm // seq_len) % seq_len
        off_c_sl2 = offs_cm % seq_len
        off_c_dim = offs_cn

        # TODO update the mask_c so we don't IMA
        c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
        c_mask = d_mask

        c1_ptrs = c1_ptr + c_offsets
        c2_ptrs = c2_ptr + c_offsets
        d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]

        # One mask value per output row, i.e. per (batch, i, j) position.
        mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))

        # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
        mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
        # Apply masking only to left_proj and right_proj results (C1, C2)
        accumulator1 = tl.where(mask_2d, accumulator1, 0)
        accumulator2 = tl.where(mask_2d, accumulator2, 0)

        # Apply sigmoid to gate values
        left_gate_sigmoid = triton_sigmoid(accumulator3)
        right_gate_sigmoid = triton_sigmoid(accumulator4)
        accumulator_d = triton_sigmoid(accumulator_d)

        # Apply elementwise multiplication with gated values
        # C1 = left * left_gate, C2 = right * right_gate
        accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
        accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate

        # Convert to appropriate output dtype and store with normal tl.store
        c1 = accumulator1.to(c1_ptr.dtype.element_ty)
        c2 = accumulator2.to(c2_ptr.dtype.element_ty)
        d = accumulator_d.to(d_ptr.dtype.element_ty)

        tl.store(c1_ptrs, c1, mask=c_mask)
        tl.store(c2_ptrs, c2, mask=c_mask)
        tl.store(d_ptrs, d, mask=d_mask)
560
+
561
+
562
+ if torch.cuda.get_device_capability()[0] not in [9, 10.2]:
563
+ two_mm_kernel = triton.autotune(
564
+ (two_mm_kernel_configs_wrapper())(), key=["M", "N", "K"]
565
+ )(two_mm_kernel)
566
+
567
+ return two_mm_kernel
568
+
569
+
570
def two_mm(A, left_proj, right_proj, left_gate, right_gate, out_gate, mask):
    """
    Launch the fused persistent projection/gating kernel.

    Computes, in one kernel launch:
        C1 = (A @ left_proj.T)  * sigmoid(A @ left_gate.T)   (masked)
        C2 = (A @ right_proj.T) * sigmoid(A @ right_gate.T)  (masked)
        D  = sigmoid(A @ out_gate.T)                         (unmasked)

    Args:
        A: [B, seq_len, seq_len, K] input tensor.
        left_proj, right_proj, left_gate, right_gate, out_gate: [N, K]
            weight matrices (applied transposed, nn.Linear style). All five
            must share one stride layout — the kernel receives a single
            stride pair for them.
        mask: mask tensor, read as one value per flattened (b, i, j) row.

    Returns:
        (C1, C2, D): fp16 tensors of shape [B, seq_len, seq_len, N].
        C1/C2 are channel-major views (allocated as [B, N, seq_len, seq_len]
        and permuted) so the downstream bmm can consume them without a copy;
        D is a plain row-major tensor.
    """
    # Check constraints.
    assert A.shape[-1] == left_proj.shape[1] == right_proj.shape[1], "Incompatible K dimensions"
    assert A.dtype == left_proj.dtype == right_proj.dtype, "Incompatible dtypes"
    assert left_proj.stride() == right_proj.stride() == left_gate.stride() == right_gate.stride() == out_gate.stride(), \
        "All weight matrices must have identical strides"

    original_shape = A.shape[:-1]  # leading dims: (B, seq_len, seq_len)
    K = A.shape[-1]
    N = left_proj.shape[0]
    B, seq_len, _, _ = A.shape

    # The kernel treats A as a flat [M, K] matrix with M = B * seq_len^2.
    A_2d = A.view(-1, K)
    M = A_2d.shape[0]

    # One persistent program per SM, capped by the tile count.
    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count

    # C1/C2 are allocated channel-major ([B, N, s, s]) and exposed as
    # [B, s, s, N] views; D is a contiguous [B, s, s, N] tensor.
    output_shape = original_shape + (N,)
    C1 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
    C2 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
    D = torch.empty(output_shape, device=A.device, dtype=torch.float16)

    A_strides = A.stride()
    C_strides = C1.stride()
    D_strides = D.stride()

    kernel = two_mm_kernel_wrapper()
    # Positional arguments shared by both launch paths below.
    common_args = (
        A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
        C1, C2, D, mask,
        M, N, K,
        *A_strides,                                # 4D strides for A
        left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] strides
        *C_strides,                                # 4D strides for C
        seq_len,
        *D_strides,                                # 4D strides for D
    )

    # NOTE: the former capability-10 and capability-9 branches were
    # byte-identical; they are merged into one hand-tuned launch path.
    if torch.cuda.get_device_capability()[0] in (9, 10):
        # H100 / B200: explicit hand-tuned configuration and grid size.
        BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
        grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))
        kernel[(grid_size,)](
            *common_args,
            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K,
            GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
            num_stages=num_stages, num_warps=num_warps,
        )
    else:
        # Other GPUs: the kernel object is autotuned — let it pick the blocks.
        grid = lambda META: (min(NUM_SMS, triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"])),)
        kernel[grid](*common_args, NUM_SMS=NUM_SMS)

    return C1, C2, D
678
+
679
+
680
def second_layernorm_mul(inp, hidden_dim, weight, bias, mul_operand):
    """LayerNorm `inp` over its last `hidden_dim` channels, then scale the
    normalized result elementwise by `mul_operand`."""
    # Cast the affine parameters to the input dtype so layer_norm does not
    # have to promote anything.
    w = weight.to(inp.dtype)
    b = bias.to(inp.dtype)
    normalized = torch.nn.functional.layer_norm(
        inp, (hidden_dim,), weight=w, bias=b, eps=1e-5
    )
    return normalized * mul_operand
684
+
685
+ '''
686
+ @triton.autotune(
687
+ [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=4, num_stages=3)],
688
+ key=["R", "C"]
689
+ )
690
+ '''
691
@triton.jit
def layernorm_kernel_first(
    X,          # input, read as a contiguous [R, C] matrix
    Y,          # output, same [R, C] layout
    Weight,     # [C] LayerNorm scale
    Bias,       # [C] LayerNorm shift
    R,          # number of rows (B * seq_len * seq_len on the host side)
    C,          # aka "dim"
    eps,        # variance epsilon
    ROW_BLOCK_SIZE: tl.constexpr,   # rows normalized per program
    BLOCK_SIZE: tl.constexpr,       # power-of-two cover of C (single pass)
):
    """Row-wise LayerNorm over a contiguous [R, C] matrix.

    Each program handles ROW_BLOCK_SIZE rows; BLOCK_SIZE >= C so the whole
    feature dimension is loaded in one shot and mean/variance are computed
    in a single pass. Loads are upcast to fp32 for the statistics.
    """
    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
    cols = tl.arange(0, BLOCK_SIZE)

    mask_row = row < R
    mask_col = cols < C

    # Simple indexing for contiguous data
    x = tl.load(
        X + row[:, None] * C + cols[None, :],
        mask=mask_row[:, None] & mask_col[None, :],
        other=0.0
    ).to(tl.float32)

    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)

    mean = tl.sum(x, axis=1) / C
    # Zero the padded lanes so they do not contribute to the variance.
    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
    var = tl.sum(diff * diff, axis=1) / C
    rstd = 1 / tl.sqrt(var + eps)

    y_hat = (x - mean[:, None]) * rstd[:, None]
    y = y_hat * weight[None, :] + bias[None, :]

    # Store in the output tensor's element dtype (fp16 on the host side).
    tl.store(
        Y + row[:, None] * C + cols[None, :],
        y,
        mask=mask_row[:, None] & mask_col[None, :]
    )
732
+
733
+
734
def get_optimal_config_ln(dim):
    """Pick a (ROW_BLOCK_SIZE, num_warps) launch config for the first
    layernorm kernel.

    Tuned values exist only for compute capability 9 (Hopper); every other
    device — and any Hopper dim above 1024 — falls back to (16, 4).
    """
    if torch.cuda.get_device_capability()[0] == 9:
        if dim <= 256:
            return (16, 1)
        if dim <= 512:
            return (16, 2)
        if dim <= 1024:
            return (16, 4)
    # Default for non-Hopper GPUs and for dim > 1024 on Hopper.
    return (16, 4)
747
+
748
+
749
def triton_layernorm_first(x, weight, bias, eps=1e-5, num_warps=None, ROW_BLOCK_SIZE=None):
    """LayerNorm the last dimension of a [B, S, S, dim] tensor; returns fp16.

    If no launch configuration is supplied, one is chosen per device by
    get_optimal_config_ln().
    """
    batch, s1, s2, dim = x.shape
    assert s1 == s2

    n_rows = batch * s1 * s2
    n_cols = dim

    out = torch.empty_like(x, dtype=torch.float16)

    # Fill in any missing launch parameter from the tuned defaults.
    if not num_warps or not ROW_BLOCK_SIZE:
        ROW_BLOCK_SIZE, num_warps = get_optimal_config_ln(dim)

    # The kernel covers the whole feature dim with one power-of-two block.
    block = triton.next_power_of_2(n_cols)
    assert block <= 1024

    grid = lambda meta: (triton.cdiv(n_rows, meta["ROW_BLOCK_SIZE"]),)

    layernorm_kernel_first[grid](
        x, out, weight, bias,
        n_rows, n_cols, eps,
        ROW_BLOCK_SIZE=ROW_BLOCK_SIZE,
        BLOCK_SIZE=block,
        num_warps=num_warps,
        num_stages=3,
    )

    return out
777
+
778
+ '''
779
+ def triton_layernorm_first(x, weight, bias, eps=1e-5):
780
+ B, seq_len, seq_len2, dim = x.shape
781
+ assert(seq_len == seq_len2)
782
+
783
+ R = B * seq_len * seq_len
784
+ C = dim
785
+
786
+ out = torch.empty_like(x)
787
+
788
+ BLOCK_SIZE = triton.next_power_of_2(C)
789
+ assert(BLOCK_SIZE <= 1024)
790
+
791
+ def grid(meta):
792
+ return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
793
+
794
+ layernorm_kernel_first[grid](
795
+ x, out, weight, bias,
796
+ R, C, eps,
797
+ BLOCK_SIZE=BLOCK_SIZE
798
+ )
799
+
800
+ return out
801
+ '''
802
+
803
+
804
@triton.autotune(
    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=1, num_stages=3)],
    key=[]
)
@triton.jit
def layernorm_kernel_eltwise(
    X,             # channel-major input: row offsets built from batch stride + flat (i, j) offset
    Y,             # contiguous [R, C] output
    Weight,        # [C] LayerNorm scale
    Bias,          # [C] LayerNorm shift
    OutGate,       # contiguous [R, C] gate, multiplied into the result
    seq_len,
    stride_batch,  # X batch stride
    stride_dim,    # X channel stride
    R,
    C,  # aka "dim"
    eps,
    ROW_BLOCK_SIZE: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,   # power-of-two cover of C
):
    """Row-wise LayerNorm of a strided (channel-major) input, fused with an
    elementwise multiply by OutGate; the product is stored contiguously.

    Requires seq_len^2 to be divisible by ROW_BLOCK_SIZE (device-asserted)
    so one program never straddles a batch boundary.
    """
    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
    cols = tl.arange(0, BLOCK_SIZE)

    # Calculate base pointer for this batch of rows
    tl.device_assert(seq_len*seq_len % ROW_BLOCK_SIZE == 0)
    # batch_offset = (row // (stride_seq1 // stride_dim)) * stride_batch
    batch = tl.program_id(0) * ROW_BLOCK_SIZE // (seq_len * seq_len)
    seqs_off = row % (seq_len * seq_len) # TODO is this going to prevent vectorization

    off_r = batch * stride_batch + seqs_off
    off_c = cols * stride_dim

    mask_row = row < R
    mask_col = cols < C

    # Gate is contiguous [R, C]; loaded in its storage dtype.
    out_gate = tl.load(
        OutGate + row[:, None] * C + cols[None, :],
        mask = mask_row[:, None] & mask_col[None, :],
    )

    # Strided gather from the channel-major input; upcast for statistics.
    x = tl.load(
        X + off_r[:, None] + off_c[None, :],
        mask=mask_row[:, None] & mask_col[None, :],
        other=0.0
    ).to(tl.float32)

    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)

    mean = tl.sum(x, axis=1) / C
    # Zero padded lanes so they do not contribute to the variance.
    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
    var = tl.sum(diff * diff, axis=1) / C
    rstd = 1 / tl.sqrt(var + eps)

    y_hat = (x - mean[:, None]) * rstd[:, None]
    y = y_hat * weight[None, :] + bias[None, :]

    # Fused epilogue: normalized value times gate, stored contiguously.
    tl.store(
        Y + row[:, None] * C + cols[None, :],
        y * out_gate,
        mask=mask_row[:, None] & mask_col[None, :]
    )
866
+
867
+
868
def triton_layernorm_eltwise(x, weight, bias, out_gate, eps=1e-5):
    """LayerNorm the channel-major tensor `x` over its last dimension, fused
    with an elementwise multiply by `out_gate`; returns a contiguous fp32
    tensor shaped like `out_gate`."""
    batch, s1, s2, dim = x.shape
    assert s1 == s2
    n_rows = batch * s1 * s2
    # The kernel assumes a channel-major input (last-dim stride == s1*s2)
    # and a contiguous gate tensor.
    assert x.stride(3) == s1 * s2
    assert out_gate.is_contiguous()
    n_cols = dim

    out = torch.empty_like(out_gate, dtype=torch.float32)

    block = triton.next_power_of_2(n_cols)
    assert block == 128

    grid = lambda meta: (triton.cdiv(n_rows, meta["ROW_BLOCK_SIZE"]),)

    layernorm_kernel_eltwise[grid](
        x, out, weight, bias, out_gate,
        s1,
        x.stride(0), x.stride(3),
        n_rows, n_cols, eps,
        BLOCK_SIZE=block,
    )

    return out
893
+
894
+
895
def kernel_global(data: input_t) -> output_t:
    """
    Fused TriMul forward pass: Triton layernorm, one fused five-way
    projection/gating kernel, a cuBLAS bmm for the triangular einsum, and a
    fused output layernorm + gate multiply.

    Args:
        data: Tuple of (input, mask, weights, config)
            - input: [batch_size, seq_len, seq_len, dim] input tensor
            - mask: [batch_size, seq_len, seq_len] mask tensor
            - weights: dict of model weights (nn.Linear-style [out, in])
            - config: dict with at least "hidden_dim"

    Returns:
        Output tensor of shape [batch_size, seq_len, seq_len, dim].
    """
    input_tensor, mask, weights, config = data

    # The fused matmul kernel runs in fp16 (fp32 accumulation).
    left_proj_weight = weights["left_proj.weight"].to(torch.float16)
    right_proj_weight = weights["right_proj.weight"].to(torch.float16)
    left_gate_weight = weights["left_gate.weight"].to(torch.float16)
    right_gate_weight = weights["right_gate.weight"].to(torch.float16)
    out_gate_weight = weights["out_gate.weight"].to(torch.float16)

    hidden_dim = config["hidden_dim"]

    x = input_tensor
    batch_size, seq_len, _, dim = x.shape

    # Input layernorm (fp16 output).
    x = triton_layernorm_first(x, weights['norm.weight'], weights['norm.bias'])

    # One fused kernel produces the masked+gated left/right projections and
    # the already-sigmoided output gate. left/right come back as channel-major
    # views so the bmm below needs no copy.
    left, right, out_gate = two_mm(
        x, left_proj_weight, right_proj_weight,
        left_gate_weight, right_gate_weight, out_gate_weight, mask,
    )

    # einsum('... i k d, ... j k d -> ... i j d', left, right) expressed as a
    # batched matmul over (batch * hidden) slices.
    out = torch.bmm(
        left.permute(0, 3, 1, 2).view(-1, left.shape[1], left.shape[2]),
        right.permute(0, 3, 2, 1).view(-1, right.shape[2], right.shape[1]),
    )
    out = out.view(batch_size, hidden_dim, seq_len, seq_len).permute(0, 2, 3, 1)

    # Output layernorm fused with the out_gate elementwise multiply.
    out = triton_layernorm_eltwise(
        out, weights['to_out_norm.weight'], weights['to_out_norm.bias'], out_gate
    )
    return torch.nn.functional.linear(out, weights['to_out.weight'])
build/torch-xpu/trimul_gpumode/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import sys
3
+
4
+ import importlib
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+
8
+ def _import_from_path(file_path: Path) -> ModuleType:
9
+ # We cannot use the module name as-is, after adding it to `sys.modules`,
10
+ # it would also be used for other imports. So, we make a module name that
11
+ # depends on the path for it to be unique using the hex-encoded hash of
12
+ # the path.
13
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
14
+ module_name = path_hash
15
+ spec = importlib.util.spec_from_file_location(module_name, file_path)
16
+ if spec is None:
17
+ raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
18
+ module = importlib.util.module_from_spec(spec)
19
+ if module is None:
20
+ raise ImportError(f"Cannot load module {module_name} from spec")
21
+ sys.modules[module_name] = module
22
+ spec.loader.exec_module(module) # type: ignore
23
+ return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch-xpu/trimul_mi300.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ torch.backends.cuda.matmul.allow_tf32 = True
7
+ torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),

        # Configurations with larger block sizes for better data reuse
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),

        # Configurations with deeper K dimension
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),

        # More extreme configurations to test the limits
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=2),

        # Configurations with fewer warps
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=2),

        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
    ],
    key=['M', 'N', 'K'],
)
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (from decorator and kwargs)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    """Fused LayerNorm + dual-projection matmul with gated epilogue.

    Each program LayerNorms a BLOCK_SIZE_M row-block of X (mean/variance in
    two chunked passes over K), then multiplies the normalized rows with a
    column-block of the interleaved 4-way weight matrix W_4way and — only for
    programs whose column block falls inside the first H columns — with the
    out-gate weights W_og. The reshape to (BLOCK_SIZE_M, H_CHUNK_SIZE, 4)
    requires BLOCK_SIZE_N == 4 * H_CHUNK_SIZE (true for every config above).
    The epilogue applies sigmoid gating and the row mask and scatters:
      - left_out  into OutLeft  via (b, h, s1, s2) strides,
      - right_out into OutRight via transposed (b, h, s2, s1) strides,
      - sigmoid(out_gate) into OutOG via (m, h) strides.
    NOTE(review): the per-column role order (0=left_proj, 1=left_gate,
    2=right_proj, 3=right_gate) is inferred from the role_idx selection —
    confirm against the host-side weight packing.
    """
    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # Pass 1: chunked mean over the K dimension (fp32 accumulator).
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    # Pass 2: chunked variance around the mean.
    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        # Normalize the tile on the fly, then cast to fp16 for the dot.
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Some programs should also calculate out_gate (only those whose
        # column block overlaps the first H columns).
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Store the sigmoided out-gate for the programs that computed it.
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # De-interleave the 4-role columns: role r lives at column 4*h + r.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Gated, masked projections.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose the flat row index into (batch, s1, s2) for 4D scatter.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)  # s2 offset uses s2 stride, s1 offset uses s1 stride
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
163
+
164
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
    ],
    key=['s1', 's2', 'H'],
)
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Batched per-(batch, channel) matmul over the pair grid:
    #   Out[b, h] = Left[b, h] @ Right[b, h], reducing over s2.
    # Right is addressed through (s2, s1)-named strides, i.e. the producer
    # stored it pre-transposed so both operands stream along the reduction
    # dim and the (s1 x s1) result is written coalesced.
    #
    # Grid and program IDs: axis 0 tiles the output using grouped
    # ("swizzled") pid ordering for L2 reuse; axis 1 enumerates the bs*H
    # (batch, channel) pairs.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    # Base pointers for this (batch, channel) slice.
    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # Accumulate in fp32 even though the inputs are fp16.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        # Mask ragged edges along both spatial dims and the reduction dim.
        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # --- Coalesced Write ---
    # Write to a standard (bs, H, s1, s1) layout; the fp32 accumulator is
    # implicitly cast to Out's element type on store.
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
236
+
237
@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
    ],
    key=['H', 'D'],
)
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,  # M = number of output rows; host passes bs*s1*s2
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Fused epilogue: LayerNorm over H, elementwise multiply by the gate
    # (Gate holds already-sigmoided out-gate values from the first kernel),
    # then projection [M, H] @ [H, D] -> Out.
    # NOTE(review): rows are decomposed over an s1 x s1 grid below, while the
    # host passes M = bs*s1*s2 — consistent only when s1 == s2; confirm that
    # callers only reach this kernel with square inputs.

    # --- Grid and PID Setup for Matmul (grouped pid ordering for L2 reuse) ---
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose M back to (b, r, c) for reordering lookups into the
    # (bs, H, s1, s1) input produced by the bmm kernel.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)

        # Accumulate sum and sum of squares in one pass over H.
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    # Finalize statistics: E[x] and Var[x] = E[x^2] - E[x]^2.
    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # --- Pass 2: Fused normalize, gate, and matmul ---
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        # Re-load the input chunk and apply the affine LayerNorm.
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]

        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]

        # Projection weight tile, indexed [K, N] via (h, d) strides.
        proj_ptrs = ProjW_ptr + \
            offs_n[None, :] * stride_proj_d + \
            offs_k[:, None] * stride_proj_h

        # Gate values are pre-sigmoided; just multiply.
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate

        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # --- Store Final Output (fp32 acc implicitly cast to Out's dtype) ---
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d

    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
340
+
341
def compiledtrimul_fused_interleaved(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,  # packed [K, 4*H] interleaved left/lg/right/rg weights
    W_og: torch.Tensor,    # transposed [K, H] out-gate weights
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Run the full TriMul pipeline as three Triton kernel launches.

    Stage 1 fuses the input LayerNorm with the projection/gating matmuls and
    scatters masked left/right factors into bmm-friendly layouts; stage 2 is
    a batched matmul over the H channels; stage 3 fuses the output LayerNorm,
    out-gating, and final projection.

    NOTE(review): stages 2/3 allocate (s1, s1) spatial grids while M uses
    s1*s2 — this pipeline presumably assumes s1 == s2; confirm with callers.
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # Intermediate buffers: right factor is stored pre-transposed so the bmm
    # kernel can stream both operands along the reduction dim.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # The grid is launched for the larger 4*H problem; a subset of programs
    # additionally computes the H out-gate columns.
    N_4way = 4 * H
    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
    fused_ln_dual_matmul_kernel[grid](
        # Pointers (9)
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        # Metadata (5) - M, H, K, s1, s2
        M, H, K, s1, s2,
        # Strides (16) — order must match the kernel signature exactly.
        x_flat.stride(0), x_flat.stride(1),
        W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1),
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        # Constexpr (1)
        LN_EPS=1e-5
    )

    # --- Kernel 2: Batched matmul, one (s1 x s1) tile per (batch, channel) ---
    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)

    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
    bmm_coalesced_kernel[grid_bmm](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
    )

    # --- Kernel 3: Fully Fused Final Stage (LN + gate + projection) ---
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)

    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
    fused_final_kernel[grid_final](
        # Pointers
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        # Metadata
        M, H, d, s1,
        # Strides
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        to_out_weight.stride(0), to_out_weight.stride(1),  # (d, h) weight, indexed via its own strides
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        # Constants
        LN_EPS=1e-5,
    )

    return final_out
412
+
413
def pack_w_4way_efficient(weights):
    """Interleave the four projection/gate weights into one [K, 4*H] fp16 matrix.

    For each hidden unit h the packed rows appear in the fixed role order
    (left_proj, left_gate, right_proj, right_gate); the result is transposed
    so the reduction dimension K comes first for the fused matmul kernel.
    """
    mats = [
        weights['left_proj.weight'],
        weights['left_gate.weight'],
        weights['right_proj.weight'],
        weights['right_gate.weight'],
    ]
    hidden, in_dim = mats[0].shape
    # Stack on dim 1 -> [H, 4, K]: the role index varies fastest after
    # flattening the first two dims, giving the interleaved row order.
    interleaved = torch.stack(mats, dim=1)
    packed = interleaved.reshape(4 * hidden, in_dim)
    return packed.t().to(torch.float16)
423
+
424
def get_w_og(weights):
    """Return the out-gate weight transposed to [K, H] and cast to float16."""
    out_gate_weight = weights['out_gate.weight']
    return out_gate_weight.t().to(torch.float16)
428
+
429
+ def compiledtrimul(
430
+ x: torch.Tensor,
431
+ mask: torch.Tensor,
432
+ norm_weight: torch.Tensor,
433
+ norm_bias: torch.Tensor,
434
+ w_concat: torch.Tensor,
435
+ to_out_norm_weight: torch.Tensor,
436
+ to_out_norm_bias: torch.Tensor,
437
+ to_out_weight: torch.Tensor,
438
+ h: int
439
+ ) -> torch.Tensor:
440
+ """
441
+ A barebones, compiled PyTorch function for the TriMul logic.
442
+ """
443
+ bs, s1, s2, d = x.shape
444
+
445
+ # Initial LayerNorm
446
+ x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
447
+ # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
448
+ all_projections = torch.mm(x_norm, w_concat)
449
+
450
+ # Split back into individual projections
451
+ left, right, lg, rg, og = all_projections.chunk(5, dim=1)
452
+
453
+ # Apply mask and gates
454
+ mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
455
+ left = left * mask_expanded * torch.sigmoid(lg)
456
+ right = right * mask_expanded * torch.sigmoid(rg)
457
+ out_gate = torch.sigmoid(og)
458
+
459
+ # Reshape for einsum
460
+ left = left.view(bs, s1, s2, h).permute(0,3,1,2)
461
+ right = right.view(bs, s1, s2, h).permute(0,3,1,2)
462
+ out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
463
+ out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
464
+
465
+ # Apply layer norm and final gating
466
+ normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
467
+ gated = normed * out_gate
468
+
469
+ # Final projection
470
+ final_out_flat = gated @ to_out_weight.t()
471
+ final_out = final_out_flat.view(bs, s1, s2, d)
472
+
473
+ return final_out
474
+
475
def small_kernel_pt_path(data):
    """Pure-PyTorch fallback used for small problem sizes.

    Concatenates the five projection weights into a single [d, 5*h] fp16
    matrix so the whole projection stage runs as one matmul, then delegates
    to `compiledtrimul`.
    """
    input_tensor, mask, weights, config = data

    # Order here defines the chunk order that `compiledtrimul` unpacks.
    proj_names = (
        'left_proj.weight',
        'right_proj.weight',
        'left_gate.weight',
        'right_gate.weight',
        'out_gate.weight',
    )
    stacked = torch.cat([weights[name] for name in proj_names], dim=0)
    w_concat = stacked.t().contiguous().to(torch.float16)

    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
497
+
498
def kernel_mi300(data):
    """Entry point: dispatch between the Triton pipeline and the PyTorch fallback."""
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # Small sequence lengths run faster on the plain PyTorch path.
    if s1 < 100:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]

    rows = bs * s1 * s2
    # Broadcast the pair mask across the hidden dim once, up front.
    # NOTE: could be folded into the first kernel instead.
    mask_mh = (
        mask.unsqueeze(-1)
        .expand(-1, -1, -1, hidden)
        .reshape(rows, hidden)
        .to(torch.float16)
    )

    return compiledtrimul_fused_interleaved(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_mh,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=pack_w_4way_efficient(weights),  # interleaved 4-way projection matrix
        W_og=get_w_og(weights),                 # transposed out-gate matrix
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-xpu/triton_a100.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import torch.nn.functional as F
import triton
import triton.language as tl

# Set PyTorch flags for performance: allow TF32 tensor-core math for fp32
# matmuls and reduced-precision accumulation for fp16 matmuls. Both trade a
# small amount of accuracy for throughput.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
9
+
10
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (now passed as arguments from the host)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    # Fused stage 1 of TriMul: LayerNorm over K, then two matmuls that share
    # the normalized tile — x_norm @ W_4way (columns interleaved per hidden
    # unit as left/left_gate/right/right_gate) and, for programs whose column
    # block starts below H, x_norm @ W_og for the out-gate. Masked, gated
    # left/right factors are scattered into (bs, H, s1, s2) and transposed
    # (bs, H, s2, s1) layouts for the following batched matmul.
    # Requires BLOCK_SIZE_N == 4 * H_CHUNK_SIZE (see the reshape below).

    # --- PID Mapping: Based on the LARGER 4*H problem ---
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE per row block) ---
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    # NOTE(review): these two statistics loops index X WITHOUT stride_x_k
    # (the matmul loop below does use it) — correct only while the feature
    # dim is contiguous, which holds for the x.view(M, K) the host passes.
    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    # Second pass for the variance (masked lanes load 0.0 and contribute
    # (0 - mean)^2 only through masked rows, which are never stored).
    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Out-gate columns coincide with the first H columns of the 4-way grid.
    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        # Apply the affine LayerNorm on the fly, then cast to fp16 for tl.dot.
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Programs covering the first H columns also accumulate the out-gate
        # matmul, reusing the normalized tile computed above.
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Store the sigmoided out-gate (fp32 values implicitly cast to fp16).
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # Columns are interleaved per hidden unit; split the accumulator into its
    # four roles. The reshape requires BLOCK_SIZE_N == 4 * H_CHUNK_SIZE.
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    # sum-over-where is a select along the role axis (roles 0..3).
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    # Load the broadcast pair mask for this row/hidden-chunk tile.
    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Gate and mask the two factors.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose flat row index into (batch, s1, s2) for the scatter stores.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    # Left goes to (bs, H, s1, s2); right is stored transposed, swapping the
    # roles of the s1/s2 offsets against the output's (s2, s1) strides.
    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
135
+
136
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Batched per-(batch, channel) matmul: Out[b, h] = Left[b, h] @ Right[b, h],
    # reducing over s2. Right is addressed through (s2, s1)-named strides —
    # the producer stored it pre-transposed. Axis 0 tiles the (s1 x s1)
    # output with grouped pid ordering for L2 reuse; axis 1 enumerates the
    # bs*H (batch, channel) pairs.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    # Base pointers for this (batch, channel) slice.
    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # Accumulate in fp32 even though the inputs are fp16.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
        # Mask ragged edges along the spatial and reduction dims.
        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
        accumulator += tl.dot(a, b)

    # Coalesced write into a standard (bs, H, s1, s1) layout; the fp32
    # accumulator is implicitly cast to Out's element type on store.
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
187
+
188
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,  # M = number of output rows; host passes bs*s1*s2
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Fused epilogue: LayerNorm over H, multiply by the gate (Gate holds
    # already-sigmoided out-gate values), then project [M, H] @ [H, D].
    # NOTE(review): rows are decomposed over an s1 x s1 grid below while the
    # host passes M = bs*s1*s2 — consistent only when s1 == s2; confirm that
    # callers only reach this kernel with square inputs.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    # Grouped pid ordering for L2 reuse.
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose flat row index into (batch, row, col) of the (s1, s1) grid.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    # Pass 1: accumulate sum and sum of squares over H in one sweep.
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    # Finalize statistics: Var[x] = E[x^2] - E[x]^2.
    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # Pass 2: re-load, normalize, gate, and accumulate the projection.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]
        # Affine LayerNorm parameters.
        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
        # Projection weight tile, indexed [K, N] via its (d, h) strides.
        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
        # Gate values are pre-sigmoided; just multiply.
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate
        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # Store (fp32 acc implicitly cast to Out's dtype).
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
261
+
262
def compiledtrimul_fused_interleaved_final(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Run the TriMul forward pass as three hand-tuned Triton kernels (A100 configs).

    Stage 1 fuses the input LayerNorm with the packed 4-way projection matmul
    (left/left-gate/right/right-gate interleaved in ``W_4way``) plus the
    out-gate matmul (``W_og``); stage 2 does the batched ``left @ right^T``
    contraction over s2; stage 3 fuses the output LayerNorm, out-gating, and
    the projection back to the model dimension.

    Args:
        x: [bs, s1, s2, d] float32 input.
        mask_mh: [bs*s1*s2, H] fp16 mask, pre-expanded per hidden unit.
        W_4way: [K, 4*H] fp16 interleaved projection weights (see pack_w_4way_efficient).
        W_og: [K, H] fp16 out-gate weights.
        h: hidden dimension H.
    Returns:
        [bs, s1, s1, d] fp16 output tensor.
    """
    bs, s1, s2, d = x.shape
    M, K, H = bs * s1 * s2, x.shape[-1], h
    x_flat = x.view(M, K)

    # Stage-1 outputs: gated projections pre-transposed for the BMM, and the
    # sigmoid out-gate in flat [M, H] row-major form for stage 3.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # --- Kernel 1: Fused LN + Dual Matmul ---
    # Grid is sized for the larger 4*H projection problem; out-gate work is
    # folded into the blocks whose N-tile overlaps [0, H).
    N_4way = 4 * H
    # Hardcoded A100 best config: M128-N128-K32-GM8-HC32-W8-S2
    # (note BLOCK_SIZE_N == 4 * H_CHUNK_SIZE, required by the kernel's de-interleave reshape)
    config_k1 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
    grid_k1 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)

    fused_ln_dual_matmul_kernel[grid_k1](
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        M, H, K, s1, s2,
        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        LN_EPS=1e-5, **config_k1, num_warps=8, num_stages=2
    )

    # --- Kernel 2: Batched Matrix Multiplication ---
    # One (batch, hidden) pair per axis-1 program: out[b,h] = left[b,h] @ right[b,h]^T.
    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
    # Hardcoded A100 best config: M128-N64-K32-GM8-W4-S3
    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_k2 = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)

    bmm_coalesced_kernel[grid_k2](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        **config_k2, num_warps=4, num_stages=3
    )

    # --- Kernel 3: Fully Fused Final Stage ---
    # NOTE(review): M = bs*s1*s2 is reused here, but kernel 3's output has
    # bs*s1*s1 rows — correct only because s1 == s2 for this task. Confirm if
    # non-square inputs ever reach this path.
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
    # Hardcoded A100 best config: M32-N128-K32-GM8-W4-S3
    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    grid_k3 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)

    fused_final_kernel[grid_k3](
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        M, H, d, s1,
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
    )
    return final_out
330
+
331
def pack_w_4way_efficient(weights):
    """Interleave the four projection weights into one [K, 4*H] fp16 matrix.

    Packed column order alternates per hidden unit — L, LG, R, RG — so a
    single matmul yields all four projections with interleaved columns.
    """
    w_left = weights['left_proj.weight']
    w_left_gate = weights['left_gate.weight']
    w_right = weights['right_proj.weight']
    w_right_gate = weights['right_gate.weight']
    hidden, in_dim = w_left.shape
    # (H, 4, K) -> (4*H, K): row i*4+r is role r of hidden unit i.
    interleaved = torch.stack((w_left, w_left_gate, w_right, w_right_gate), dim=1)
    packed = interleaved.reshape(4 * hidden, in_dim)
    return packed.t().to(torch.float16)
337
+
338
def get_w_og(weights):
    """Return the out-gate weight transposed to [K, H] and cast to fp16."""
    out_gate_w = weights['out_gate.weight']
    return out_gate_w.t().to(torch.float16)
341
+
342
@torch.compile()
def compiledtrimul(
    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor, h: int
) -> torch.Tensor:
    """Pure-PyTorch TriMul forward pass, used as the small-problem fallback.

    Args:
        x: [bs, s1, s2, d] float32 input.
        mask: broadcastable to [bs, s1, s2, 1]; zeroes masked positions.
        w_concat: [d, 5*h] fp16; column blocks ordered left, right, left_gate,
            right_gate, out_gate (matches the cat order in small_kernel_pt_path).
        h: hidden dimension.
    Returns:
        [bs, s1, s1, d] fp16 output.
    """
    bs, s1, s2, d = x.shape
    # Input LayerNorm, then one fused matmul producing all five projections.
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    all_projections = torch.mm(x_norm, w_concat)
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    # Gate and mask the left/right projections before the triangular matmul.
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)
    # [bs, h, s1, s2] layout so matmul contracts over the s2 axis.
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
    # Output LayerNorm over the hidden dim, then out-gating.
    # NOTE(review): out_gate has bs*s1*s2 rows while normed has bs*s1*s1 —
    # this relies on s1 == s2 holding for all inputs; confirm with callers.
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate
    final_out_flat = gated @ to_out_weight.t()
    return final_out_flat.view(bs, s1, s1, d)
364
+
365
def small_kernel_pt_path(data):
    """Small-problem fallback: concatenate the five projection weights and
    dispatch through the torch.compile'd reference implementation."""
    input_tensor, mask, weights, config = data
    # Column-block order must match the chunk(5) order in compiledtrimul.
    projection_names = [
        'left_proj.weight', 'right_proj.weight', 'left_gate.weight',
        'right_gate.weight', 'out_gate.weight',
    ]
    w_concat = torch.cat([weights[name] for name in projection_names], dim=0)
    w_concat = w_concat.t().contiguous().to(torch.float16)
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
380
+
381
def kernel_a100(data):
    """A100 entry point: route small problems to the compiled PyTorch path
    and large ones to the three hand-tuned Triton kernels."""
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # Below this sequence length the PyTorch path is faster (threshold chosen
    # from observed BMM configs).
    if s1 < 512:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    w_4way = pack_w_4way_efficient(weights)
    w_og = get_w_og(weights)
    rows = bs * s1 * s2
    # Pre-expand the mask to one fp16 row per (b, i, j) position.
    mask_rows = mask.unsqueeze(-1).expand(-1, -1, -1, hidden).reshape(rows, hidden).to(torch.float16)

    return compiledtrimul_fused_interleaved_final(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_rows,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=w_4way,
        W_og=w_og,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-xpu/triton_b200.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ torch.backends.cuda.matmul.allow_tf32 = True
7
+ torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
# Stage-1 kernel: fused input LayerNorm + matmul against the interleaved
# [K, 4*H] projection weight (L/LG/R/RG per hidden unit) + matmul against the
# [K, H] out-gate weight. Writes gated left/right projections directly in the
# transposed layouts stage 2 consumes, and the sigmoid out-gate as [M, H].
@triton.jit
def fused_ln_dual_matmul_kernel(
    # Pointers (9)
    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
    # Metadata (5)
    M, H, K, s1, s2,
    # Strides (16)
    stride_x_m, stride_x_k,
    stride_w4_k, stride_w4_n,
    stride_wog_k, stride_wog_n,
    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
    stride_og_m, stride_og_h,
    stride_mask_m, stride_mask_h,
    # Constexpr (now passed as arguments from the host)
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
):
    # --- PID Mapping: Based on the LARGER 4*H problem ---
    # Grouped (swizzled) program ordering: GROUP_SIZE_M row-blocks are walked
    # together to improve L2 reuse of the weight tiles.
    pid = tl.program_id(axis=0)
    N_4way = 4 * H
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # --- SHARED LayerNorm calculation (done only ONCE) ---
    # Two-pass mean/variance over the K (channel) axis for this row block.
    # NOTE(review): these two loops index X without multiplying by stride_x_k
    # (the matmul loop below does) — correct only while stride_x_k == 1, i.e.
    # x_flat is contiguous on its last dim; confirm if the host ever changes.
    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    m_mask = offs_m < M
    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m

    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        mean += tl.sum(x_chunk, axis=1)
    mean /= K

    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    for k_offset in range(0, K, BLOCK_SIZE_K):
        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
        k_mask = (k_offset + k_chunk_offs) < K
        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        x_centered = x_chunk - mean[:, None]
        var += tl.sum(x_centered * x_centered, axis=1)
    var /= K
    rstd = 1.0 / tl.sqrt(var + LN_EPS)

    # --- Matmul Loop 1: For the 4-Way Projections ---
    # The normalized X tile (fp16) is reused for both the 4-way and the
    # out-gate matmuls, so LayerNorm is applied exactly once per K chunk.
    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        k_block_start = k * BLOCK_SIZE_K;
        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
        accumulator_4way += tl.dot(x_norm_tile, w_tile)

        # Only blocks whose N-tile overlaps [0, H) also accumulate the
        # out-gate projection (the out-gate problem is 4x narrower).
        if pid_n * BLOCK_SIZE_N < H:
            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
            accumulator_og += tl.dot(x_norm_tile, w_tile)

    # Out-gate is stored already passed through sigmoid, as fp16 [M, H].
    if pid_n * BLOCK_SIZE_N < H:
        og_out = tl.sigmoid(accumulator_og)
        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
        tl.store(outg_ptrs, og_out, mask=og_mask)

    # --- Fusion Logic for 4-Way Part ---
    # De-interleave the accumulator: columns alternate L, LG, R, RG per hidden
    # unit, so a (M, H_CHUNK, 4) reshape separates the roles. This requires
    # BLOCK_SIZE_N == 4 * H_CHUNK_SIZE (all host configs satisfy it).
    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
    role_idx = tl.arange(0, 4)[None, None, :]
    left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
    left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)

    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)

    # Gate with sigmoid and apply the position mask in-register.
    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile

    # Decompose the flat row index M -> (batch, s1, s2) to scatter into the
    # [bs, H, s1, s2] left / [bs, H, s2, s1] transposed-right layouts.
    s1s2 = s1 * s2
    offs_b = offs_m // s1s2
    offs_s1 = (offs_m % s1s2) // s2
    offs_s2 = offs_m % s2
    offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
    offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))

    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
                               offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
                                  offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
    tl.store(outl_ptrs, left_out, mask=m_mask_h)
    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
135
+
136
# Stage-2 kernel: batched matmul Out[b,h] = Left[b,h] @ Right[b,h]^T,
# contracting over the s2 axis. Axis-1 of the launch grid enumerates the
# bs*H (batch, hidden) pairs; axis-0 tiles the s1 x s1 output.
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Grid and program IDs: grouped (swizzled) ordering over the s1 x s1 tiles
    # for better L2 locality, same scheme as the stage-1 kernel.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Second grid axis selects the (batch, hidden-channel) pair.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # fp32 accumulator; inputs are fp16 tiles (tensor-core tl.dot).
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # Contract over s2. Right was stored pre-transposed by stage 1, so both
    # loads walk its s2 axis via stride_r_s2.
    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # Store converts the fp32 accumulator to the output tensor's dtype.
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
192
+
193
# Stage-3 kernel: for each flat output row m = (b, r, c), fuses LayerNorm
# over the H hidden channels of the BMM result, multiplication by the
# precomputed sigmoid out-gate, and the [H -> D] output projection.
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    # Grouped (swizzled) tile ordering over the M x D output, as in stage 1.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose the flat row index into (batch, row, col) of the s1 x s1 grid.
    # NOTE(review): this assumes M == bs * s1 * s1; the host passes bs*s1*s2,
    # which matches only because s1 == s2 for this task — confirm.
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # One-pass LayerNorm statistics: accumulate sum and sum-of-squares over H,
    # then var = E[x^2] - E[x]^2.
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # Second pass: re-load each H chunk, normalize, scale/shift, gate, and
    # multiply into the output projection (contracting over H).
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H
        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]
        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate
        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        # Cast the gated activations to the weight dtype (fp16) for tl.dot.
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # Scatter the D-dim output tile back into the [bs, s1, s1, D] layout.
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
266
+
267
def compiledtrimul_fused_interleaved_final(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,
    W_og: torch.Tensor,
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Three-stage Triton TriMul (B200 configs): (1) fused LayerNorm + packed
    4-way projection matmul + out-gate matmul, (2) batched left @ right^T,
    (3) fused output LayerNorm + gating + [H -> d] projection.

    Returns a [bs, s1, s1, d] fp16 tensor.
    """
    bs, s1, s2, d = x.shape
    M = bs * s1 * s2
    K = x.shape[-1]
    H = h
    x_flat = x.view(M, K)

    # Stage-1 outputs: gated left/right projections laid out for the BMM, and
    # the sigmoid out-gate kept row-major as [M, H] for stage 3.
    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)

    # --- Stage 1: fused LN + dual matmul over the packed 4*H weight ---
    # Grid is launched for the larger 4*H problem.
    # Hardcoded best config from logs: M64-N128-K64-GM8-HC32-W4-S2
    n_total = 4 * H
    cfg1 = {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64,
            'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
    launch1 = lambda meta: (
        triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(n_total, meta['BLOCK_SIZE_N']),
    )
    fused_ln_dual_matmul_kernel[launch1](
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        M, H, K, s1, s2,
        x_flat.stride(0), x_flat.stride(1),
        W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1),
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        LN_EPS=1e-5, **cfg1, num_warps=4, num_stages=2,
    )

    # --- Stage 2: batched matmul per (batch, hidden) pair ---
    # Hardcoded best config from logs: M128-N128-K32-GM8-W8-S3
    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
    cfg2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    launch2 = lambda meta: (
        triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']),
        bs * H,
    )
    bmm_coalesced_kernel[launch2](
        left_final, right_final_t, bmm_out_tmp,
        bs, s1, s2, H,
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        **cfg2, num_warps=8, num_stages=3,
    )

    # --- Stage 3: fully fused final LN + gate + output projection ---
    # Hardcoded best config from logs: M32-N128-K32-GM8-W4-S3
    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
    cfg3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
    launch3 = lambda meta: (
        triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),
    )
    fused_final_kernel[launch3](
        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
        M, H, d, s1,
        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        to_out_weight.stride(0), to_out_weight.stride(1),
        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
        LN_EPS=1e-5, **cfg3, num_warps=4, num_stages=3,
    )
    return final_out
336
+
337
def pack_w_4way_efficient(weights):
    """Pack left/left-gate/right/right-gate weights into one [K, 4*H] fp16
    matrix, interleaved per hidden unit in the order L, LG, R, RG."""
    names = ('left_proj.weight', 'left_gate.weight', 'right_proj.weight', 'right_gate.weight')
    mats = [weights[name] for name in names]
    hidden, in_dim = mats[0].shape
    # (4, H, K) -> (H, 4, K) -> (4*H, K): interleaves the four roles.
    packed = torch.stack(mats, dim=0).transpose(0, 1).reshape(4 * hidden, in_dim)
    return packed.t().to(torch.float16)
343
+
344
def get_w_og(weights):
    """Return the out_gate weight as a transposed [K, H] fp16 matrix."""
    og_weight = weights['out_gate.weight']
    return og_weight.t().to(torch.float16)
347
+
348
@torch.compile()
def compiledtrimul(
    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor, h: int
) -> torch.Tensor:
    """Pure-PyTorch TriMul forward pass, used as the small-problem fallback.

    Args:
        x: [bs, s1, s2, d] float32 input.
        mask: broadcastable to [bs, s1, s2, 1]; zeroes masked positions.
        w_concat: [d, 5*h] fp16; column blocks ordered left, right, left_gate,
            right_gate, out_gate (matches the cat order in small_kernel_pt_path).
        h: hidden dimension.
    Returns:
        [bs, s1, s1, d] fp16 output.
    """
    bs, s1, s2, d = x.shape
    # Input LayerNorm, then one fused matmul producing all five projections.
    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
    all_projections = torch.mm(x_norm, w_concat)
    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
    # Gate and mask the left/right projections before the triangular matmul.
    left = left * mask_expanded * torch.sigmoid(lg)
    right = right * mask_expanded * torch.sigmoid(rg)
    out_gate = torch.sigmoid(og)
    # [bs, h, s1, s2] layout so matmul contracts over the s2 axis.
    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
    # Output LayerNorm over the hidden dim, then out-gating.
    # NOTE(review): out_gate has bs*s1*s2 rows while normed has bs*s1*s1 —
    # this relies on s1 == s2 holding for all inputs; confirm with callers.
    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
    gated = normed * out_gate
    final_out_flat = gated @ to_out_weight.t()
    return final_out_flat.view(bs, s1, s1, d)
370
+
371
def small_kernel_pt_path(data):
    """Small-problem fallback: build the concatenated [d, 5*h] projection
    weight once and run the torch.compile'd reference implementation."""
    input_tensor, mask, weights, config = data
    # Order must line up with the chunk(5) unpacking inside compiledtrimul.
    names = [
        'left_proj.weight', 'right_proj.weight', 'left_gate.weight',
        'right_gate.weight', 'out_gate.weight',
    ]
    stacked = torch.cat([weights[n] for n in names], dim=0)
    w_concat = stacked.t().contiguous().to(torch.float16)
    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
386
+
387
def kernel_b200(data):
    """B200 entry point: small problems go through the compiled PyTorch path,
    large ones through the three hand-tuned Triton kernels."""
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # Below this sequence length the PyTorch path wins on B200.
    if s1 < 800:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    w_4way = pack_w_4way_efficient(weights)
    w_og = get_w_og(weights)
    rows = bs * s1 * s2
    # Pre-expand the mask to one fp16 row per (b, i, j) position.
    mask_rows = mask.unsqueeze(-1).expand(-1, -1, -1, hidden).reshape(rows, hidden).to(torch.float16)

    return compiledtrimul_fused_interleaved_final(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_rows,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=w_4way,
        W_og=w_og,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )
build/torch-xpu/triton_h100.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import triton
4
+ import triton.language as tl
5
+
6
+ torch.backends.cuda.matmul.allow_tf32 = True
7
+ torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
8
+
9
+ @triton.autotune(
10
+ configs=[
11
+ triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
12
+ triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),
13
+
14
+ triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
15
+ triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
16
+ triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
17
+
18
+ triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=4),
19
+ triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),
20
+
21
+ triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=5),
22
+ triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=5),
23
+
24
+ triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
25
+ triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=4),
26
+ ],
27
+ key=['M', 'N', 'K'],
28
+ )
29
+ @triton.jit
30
+ def fused_ln_dual_matmul_kernel(
31
+ # Pointers (9)
32
+ X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
33
+ OutLeft_ptr, OutRight_ptr, OutOG_ptr,
34
+ # Metadata (5)
35
+ M, H, K, s1, s2,
36
+ # Strides (16)
37
+ stride_x_m, stride_x_k,
38
+ stride_w4_k, stride_w4_n,
39
+ stride_wog_k, stride_wog_n,
40
+ stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
41
+ stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
42
+ stride_og_m, stride_og_h,
43
+ stride_mask_m, stride_mask_h,
44
+ # Constexpr (from decorator and kwargs)
45
+ LN_EPS: tl.constexpr,
46
+ BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
47
+ GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
48
+ ):
49
+ # --- PID Mapping: Based on the LARGER 4*H problem ---
50
+ pid = tl.program_id(axis=0)
51
+ N_4way = 4 * H
52
+ num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
53
+ num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
54
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
55
+ group_id = pid // num_pid_in_group
56
+ first_pid_m = group_id * GROUP_SIZE_M
57
+ group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
58
+ pid_m = first_pid_m + (pid % group_size_m)
59
+ pid_n = (pid % num_pid_in_group) // group_size_m
60
+
61
+ # --- SHARED LayerNorm calculation (done only ONCE) ---
62
+ offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
63
+ m_mask = offs_m < M
64
+ x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
65
+
66
+ mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
67
+ for k_offset in range(0, K, BLOCK_SIZE_K):
68
+ k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
69
+ x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
70
+ k_mask = (k_offset + k_chunk_offs) < K
71
+ x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
72
+ mean += tl.sum(x_chunk, axis=1)
73
+ mean /= K
74
+
75
+ var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
76
+ for k_offset in range(0, K, BLOCK_SIZE_K):
77
+ k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
78
+ x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
79
+ k_mask = (k_offset + k_chunk_offs) < K
80
+ x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
81
+ x_centered = x_chunk - mean[:, None]
82
+ var += tl.sum(x_centered * x_centered, axis=1)
83
+ var /= K
84
+ rstd = 1.0 / tl.sqrt(var + LN_EPS)
85
+
86
+ # --- Matmul Loop 1: For the 4-Way Projections ---
87
+ offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
88
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
89
+ w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
90
+ accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
91
+ accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
92
+
93
+ offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
94
+ for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
95
+ k_block_start = k * BLOCK_SIZE_K;
96
+ x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
97
+ w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
98
+ x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
99
+ w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
100
+ x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
101
+ norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
102
+ norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
103
+ nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
104
+ nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
105
+ x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
106
+ x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
107
+ w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
108
+ accumulator_4way += tl.dot(x_norm_tile, w_tile)
109
+
110
+ #Some threads should calclate out_gate
111
+ if pid_n * BLOCK_SIZE_N < H:
112
+ w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
113
+ w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
114
+ w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
115
+ w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
116
+ accumulator_og += tl.dot(x_norm_tile, w_tile)
117
+
118
+ if pid_n * BLOCK_SIZE_N < H:
119
+ og_out = tl.sigmoid(accumulator_og)
120
+ outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
121
+ og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
122
+ tl.store(outg_ptrs, og_out, mask=og_mask)
123
+
124
+ # --- Fusion Logic for 4-Way Part ---
125
+ acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
126
+ role_idx = tl.arange(0, 4)[None, None, :]
127
+ left_proj = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
128
+ left_gate = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
129
+ right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
130
+ right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
131
+
132
+ offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
133
+ mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
134
+ m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
135
+ mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
136
+
137
+ left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
138
+ right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
139
+
140
+ s1s2 = s1 * s2
141
+ offs_b = offs_m // s1s2
142
+ offs_s1 = (offs_m % s1s2) // s2
143
+ offs_s2 = offs_m % s2
144
+ offs_b_2d = tl.reshape(offs_b, (BLOCK_SIZE_M, 1))
145
+ offs_h_2d = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
146
+ offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
147
+ offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
148
+
149
+ outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
150
+ offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
151
+ outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
152
+ offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1) # s2 offset uses s2 stride, s1 offset uses s1 stride
153
+ tl.store(outl_ptrs, left_out, mask=m_mask_h)
154
+ tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
155
+
156
@triton.autotune(
    configs=[
        # Tile-shape / pipeline candidates; autotune benchmarks them once per
        # (s1, s2, H) key and caches the winner.
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
    ],
    key=['s1', 's2', 'H'],
)
@triton.jit
def bmm_coalesced_kernel(
    # Pointers
    Left_ptr, Right_ptr, Out_ptr,
    # Dimensions
    bs, s1, s2, H,
    # Strides
    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
    # Kernel parameters
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    ):
    """Per-(batch, head) matmul contracted over s2, with a coalesced store.

    For each (b, h) slice selected by grid axis 1, computes an (s1, s1)
    block of Left[b, h] (s1 x s2) times Right[b, h] addressed through
    (stride_r_s2, stride_r_s1) — i.e. the right operand is read in its
    pre-transposed layout — accumulating in fp32 and writing to Out in a
    standard (bs, H, s1, s1) layout.
    """
    # Grid and program IDs
    # Grouped ("swizzled") tile order: GROUP_SIZE_M rows of output tiles are
    # walked together, improving L2 reuse of the right-operand tiles.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Grid axis 1 enumerates flattened (batch, head) pairs.
    pid_bh = tl.program_id(axis=1)
    pid_b = pid_bh // H
    pid_h = pid_bh % H

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)

    # Base of this program's (b, h) slice in each operand.
    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h

    # fp32 accumulator for numerical stability regardless of input dtype.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

    # March the contraction (s2) dimension in BLOCK_SIZE_K chunks; masked
    # loads zero-fill the ragged final chunk so tl.dot stays full-width.
    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
        k_start = k * BLOCK_SIZE_K
        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)

        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)

        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
        b = tl.load(b_ptrs, mask=b_mask, other=0.0)

        accumulator += tl.dot(a, b)

    # --- Coalesced Write ---
    # Write to a standard (bs, H, s1, s1) layout
    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
        offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2

    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
    tl.store(out_ptrs, accumulator, mask=c_mask)
225
+
226
+ @torch.compile
227
+ def torch_pt2(left_final, right_final_t, bs, s1, s2, d, h, to_out_norm_weight, to_out_norm_bias, og_mh, to_out_weight):
228
+ bmm_out = torch.matmul(left_final, right_final_t)
229
+ out_einsum_flat = bmm_out.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)
230
+ # Apply layer norm and final gating
231
+ normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
232
+ gated = normed * og_mh
233
+
234
+ # Final projection
235
+ final_out_flat = gated @ to_out_weight.t()
236
+ final_out = final_out_flat.view(bs, s1, s2, d)
237
+ return final_out
238
+
239
@triton.autotune(
    configs=[
        # Tile-shape / pipeline candidates, cached per (H, D) key.
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
    ],
    key=['H', 'D'],
)
@triton.jit
def fused_final_kernel(
    # Pointers
    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
    # Metadata
    M, H, D, s1,  # M rows total; decomposed below as bs*s1*s1 (assumes s1 == s2 — confirm)
    # Strides
    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
    stride_gate_m, stride_gate_h,
    stride_proj_d, stride_proj_h,
    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
    # Constants
    LN_EPS: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    ):
    """Fused output LayerNorm + sigmoid-gating + projection to D.

    Each program handles a (BLOCK_SIZE_M x BLOCK_SIZE_N) tile of the
    (M, D) output. Per row it first accumulates sum / sum-of-squares over
    the H hidden channels (one-pass mean/variance), then re-reads the row
    in K-chunks, normalizes, applies the affine LayerNorm parameters,
    multiplies by the precomputed gate, and contracts with ProjW.
    """
    # --- Grid and PID Setup for Matmul ---
    # Grouped tile swizzle (same scheme as the other matmul kernels) for
    # better L2 locality.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)

    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    m_mask = offs_m < M

    # Decompose M back to (b, r, c) for reordering lookups
    # (row index -> batch, row, column of the s1 x s1 pair matrix).
    s1s1 = s1 * s1
    b = offs_m // s1s1
    r = (offs_m % s1s1) // s1
    c = offs_m % s1

    # One-pass statistics: accumulate sum and sum-of-squares per row.
    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col

    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)

        # Accumulate sum and sum of squares in one pass
        sum_x += tl.sum(in_chunk, axis=1)
        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)

    # Finalize statistics (E[x^2] - E[x]^2 form of the variance).
    mean = sum_x / H
    var = (sum_x2 / H) - (mean * mean)
    rstd = tl.math.rsqrt(var + LN_EPS)

    # --- Fused Gating and Matmul ---
    # Second pass over H: normalize, affine-transform, gate, and contract
    # against the projection weight into a (BLOCK_SIZE_M, BLOCK_SIZE_N) tile.
    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k_offset in range(0, H, BLOCK_SIZE_K):
        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
        k_mask = offs_k < H

        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_norm = (a - mean[:, None]) * rstd[:, None]

        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]

        # ProjW is addressed as [D, H]: rows select output channels.
        proj_ptrs = ProjW_ptr + \
            offs_n[None, :] * stride_proj_d + \
            offs_k[:, None] * stride_proj_h

        # Gate values are precomputed sigmoids laid out as (M, H).
        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
        a_gated = a_norm * gate

        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
        # Cast the activations to the weight dtype so tl.dot gets matching
        # operand types (e.g. fp16 weights).
        acc += tl.dot(a_gated.to(b_w.dtype), b_w)

    # --- Store Final Output ---
    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d

    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
339
+
340
def compiledtrimul_fused_interleaved(
    x: torch.Tensor,
    mask_mh: torch.Tensor,
    norm_weight: torch.Tensor,
    norm_bias: torch.Tensor,
    W_4way: torch.Tensor,  # interleaved [K, 4*H] projection weights
    W_og: torch.Tensor,    # [K, H] out-gate weights
    to_out_norm_weight: torch.Tensor,
    to_out_norm_bias: torch.Tensor,
    to_out_weight: torch.Tensor,
    h: int,
):
    """Run the fused LayerNorm + 4-way projection Triton kernel, then finish
    the TriMul computation with the compiled torch tail (`torch_pt2`).

    Allocates the kernel's three fp16 outputs (left projections, transposed
    right projections, and the sigmoid out-gate) and launches one 1-D grid
    sized for the larger 4*H projection problem.
    """
    bs, s1, s2, d = x.shape
    rows = bs * s1 * s2   # flattened row count (M)
    dim = x.shape[-1]     # input feature dimension (K)
    x_flat = x.view(rows, dim)

    # Kernel outputs: left in (bs, H, s1, s2), right pre-transposed to
    # (bs, H, s2, s1) for the subsequent batched matmul, gate flat (M, H).
    left_final = torch.empty((bs, h, s1, s2), device=x.device, dtype=torch.float16)
    right_final_t = torch.empty((bs, h, s2, s1), device=x.device, dtype=torch.float16)
    og_mh = torch.empty((rows, h), device=x.device, dtype=torch.float16)

    # The grid is sized for the larger 4*H projection problem.
    n_cols = 4 * h
    grid = lambda meta: (triton.cdiv(rows, meta['BLOCK_SIZE_M']) * triton.cdiv(n_cols, meta['BLOCK_SIZE_N']),)
    fused_ln_dual_matmul_kernel[grid](
        # Pointers (9)
        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
        left_final, right_final_t, og_mh,
        # Metadata (5): M, H, K, s1, s2
        rows, h, dim, s1, s2,
        # Strides (16)
        x_flat.stride(0), x_flat.stride(1),
        W_4way.stride(0), W_4way.stride(1),
        W_og.stride(0), W_og.stride(1),
        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
        og_mh.stride(0), og_mh.stride(1),
        mask_mh.stride(0), mask_mh.stride(1),
        # Constexpr (1)
        LN_EPS=1e-5,
    )

    return torch_pt2(
        left_final,
        right_final_t,
        bs=bs,
        s1=s1,
        s2=s2,
        d=d,
        h=h,
        to_out_norm_weight=to_out_norm_weight,
        to_out_norm_bias=to_out_norm_bias,
        og_mh=og_mh,
        to_out_weight=to_out_weight,
    )
392
+
393
def pack_w_4way_efficient(weights):
    """Interleave the four projection weights into one tight [K, 4*H] fp16 matrix.

    Output column 4*i + r holds head i of role r, with roles ordered
    (left_proj, left_gate, right_proj, right_gate).
    """
    roles = (
        weights['left_proj.weight'],
        weights['left_gate.weight'],
        weights['right_proj.weight'],
        weights['right_gate.weight'],
    )
    num_heads, in_dim = roles[0].shape
    # (H, 4, K) -> (4*H, K): head-major with the four roles adjacent.
    interleaved = torch.stack(roles, dim=1).reshape(4 * num_heads, in_dim)
    return interleaved.t().to(torch.float16)
403
+
404
def get_w_og(weights):
    """Return the out_gate weight transposed to [K, H] in fp16."""
    return weights['out_gate.weight'].t().to(torch.float16)
408
+
409
+
410
# Opt into faster, lower-precision cuBLAS matmul modes: TF32 tensor-core
# math for fp32 matmuls, and reduced-precision accumulation for fp16 ones.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
412
+
413
+ @torch.compile
414
+ def compiledtrimul(
415
+ x: torch.Tensor,
416
+ mask: torch.Tensor,
417
+ norm_weight: torch.Tensor,
418
+ norm_bias: torch.Tensor,
419
+ w_concat: torch.Tensor,
420
+ to_out_norm_weight: torch.Tensor,
421
+ to_out_norm_bias: torch.Tensor,
422
+ to_out_weight: torch.Tensor,
423
+ h: int
424
+ ) -> torch.Tensor:
425
+ """
426
+ A barebones, compiled PyTorch function for the TriMul logic.
427
+ """
428
+ bs, s1, s2, d = x.shape
429
+
430
+ # Initial LayerNorm
431
+ x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
432
+ # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
433
+ all_projections = torch.mm(x_norm, w_concat)
434
+
435
+ # Split back into individual projections
436
+ left, right, lg, rg, og = all_projections.chunk(5, dim=1)
437
+
438
+ # Apply mask and gates
439
+ mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
440
+ left = left * mask_expanded * torch.sigmoid(lg)
441
+ right = right * mask_expanded * torch.sigmoid(rg)
442
+ out_gate = torch.sigmoid(og)
443
+
444
+ # Reshape for einsum
445
+ left = left.view(bs, s1, s2, h).permute(0,3,1,2)
446
+ right = right.view(bs, s1, s2, h).permute(0,3,1,2)
447
+ out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
448
+ out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
449
+
450
+ # Apply layer norm and final gating
451
+ normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
452
+ gated = normed * out_gate
453
+
454
+ # Final projection
455
+ final_out_flat = gated @ to_out_weight.t()
456
+ final_out = final_out_flat.view(bs, s1, s2, d)
457
+
458
+ return final_out
459
+
460
def small_kernel_pt_path(data):
    """Small-problem fallback: run TriMul entirely through the compiled
    PyTorch path (`compiledtrimul`) instead of the custom Triton kernels.
    """
    input_tensor, mask, weights, config = data

    # Fuse the five projection matrices into one [d, 5h] fp16 operand, in
    # the exact order compiledtrimul's chunk(5) expects.
    projection_order = (
        'left_proj.weight',
        'right_proj.weight',
        'left_gate.weight',
        'right_gate.weight',
        'out_gate.weight',
    )
    w_concat = torch.cat([weights[name] for name in projection_order], dim=0)
    w_concat = w_concat.t().contiguous().to(torch.float16)

    return compiledtrimul(
        x=input_tensor.to(torch.float32),
        mask=mask.unsqueeze(-1),
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        w_concat=w_concat,
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float32),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float32),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=config["hidden_dim"],
    )
482
+
483
def kernel_h100(data):
    """H100 entry point for TriMul: dispatch between the compiled PyTorch
    path (small sequences) and the fused Triton path (large sequences).
    """
    input_tensor, mask, weights, config = data
    bs, s1, s2, d = input_tensor.shape

    # Below this size the torch.compile path wins; skip the Triton kernels.
    if s1 <= 512:
        return small_kernel_pt_path(data)

    hidden = config["hidden_dim"]
    rows = bs * s1 * s2

    # Broadcast the (bs, s1, s2) mask across heads into a flat (M, H) fp16
    # view for the fused kernel (could be moved into the kernel itself).
    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, hidden).reshape(rows, hidden).to(torch.float16)

    return compiledtrimul_fused_interleaved(
        x=input_tensor.to(torch.float32),
        mask_mh=mask_mh,
        norm_weight=weights['norm.weight'].to(torch.float32),
        norm_bias=weights['norm.bias'].to(torch.float32),
        W_4way=pack_w_4way_efficient(weights),  # interleaved 4-way matrix
        W_og=get_w_og(weights),                 # transposed out-gate matrix
        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
        to_out_weight=weights['to_out.weight'].to(torch.float16),
        h=hidden,
    )