danieldk HF Staff committed on Jan 8

Commit

d2d3257

1 Parent(s): faed52f

Revert "Build uploaded using `kernels`."

Browse files

This reverts commit faed52fbc37782e891f85512bf2c486b4e6ff17a.

Files changed (30) hide show

build/torch-cuda/__init__.py +0 -7
build/torch-cuda/_ops.py +0 -8
build/torch-cuda/metadata.json +0 -1
build/torch-cuda/task.py +0 -20
build/torch-cuda/trimul_global.py +0 -971
build/torch-cuda/trimul_gpumode/__init__.py +0 -26
build/torch-cuda/trimul_mi300.py +0 -524
build/torch-cuda/triton_a100.py +0 -405
build/torch-cuda/triton_b200.py +0 -411
build/torch-cuda/triton_h100.py +0 -509
build/torch-rocm/__init__.py +0 -7
build/torch-rocm/_ops.py +0 -8
build/torch-rocm/metadata.json +0 -1
build/torch-rocm/task.py +0 -20
build/torch-rocm/trimul_global.py +0 -971
build/torch-rocm/trimul_gpumode/__init__.py +0 -26
build/torch-rocm/trimul_mi300.py +0 -524
build/torch-rocm/triton_a100.py +0 -405
build/torch-rocm/triton_b200.py +0 -411
build/torch-rocm/triton_h100.py +0 -509
build/torch-xpu/__init__.py +0 -7
build/torch-xpu/_ops.py +0 -8
build/torch-xpu/metadata.json +0 -1
build/torch-xpu/task.py +0 -20
build/torch-xpu/trimul_global.py +0 -971
build/torch-xpu/trimul_gpumode/__init__.py +0 -26
build/torch-xpu/trimul_mi300.py +0 -524
build/torch-xpu/triton_a100.py +0 -405
build/torch-xpu/triton_b200.py +0 -411
build/torch-xpu/triton_h100.py +0 -509

build/torch-cuda/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-from .triton_a100 import kernel_a100
-from .triton_h100 import kernel_h100
-from .triton_b200 import kernel_b200
-from .trimul_mi300 import kernel_mi300
-from .trimul_global import kernel_global
-__all__ = ["kernel_a100", "kernel_h100", "kernel_b200", "kernel_mi300", "kernel_global"]

build/torch-cuda/_ops.py DELETED Viewed

@@ -1,8 +0,0 @@
-import torch
-ops = torch.ops._trimul_gpumode_8e6e60d
-def add_op_namespace_prefix(op_name: str):
-    """
-    Prefix op by namespace.
-    """
-    return f"_trimul_gpumode_8e6e60d::{op_name}"

build/torch-cuda/metadata.json DELETED Viewed

@@ -1 +0,0 @@
-{"python-depends":[]}

build/torch-cuda/task.py DELETED Viewed

@@ -1,20 +0,0 @@
-"""
-Type definitions for TriMul task.
-Input: Tuple of (input_tensor, mask, weights, config)
-  - input_tensor: Input tensor of shape [batch_size, seq_len, seq_len, dim]
-  - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
-  - weights: Dictionary containing model weights
-  - config: Dictionary containing model configuration parameters
-Output: Output tensor of shape [batch_size, seq_len, seq_len, dim]
-"""
-import torch
-from typing import Tuple, Dict, Any
-# Input type: (input_tensor, mask, weights, config)
-input_t = Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], Dict[str, Any]]
-# Output type: output tensor
-output_t = torch.Tensor

build/torch-cuda/trimul_global.py DELETED Viewed

@@ -1,971 +0,0 @@
-# from utils import make_match_reference, DisableCuDNNTF32
-from .task import input_t, output_t
-import torch
-from torch import nn, einsum
-import math
-import os
-import requests
-import triton
-import triton.language as tl
-# The flag below controls whether to allow TF32 on matmul. This flag defaults to False
-# in PyTorch 1.12 and later.
-torch.backends.cuda.matmul.allow_tf32 = True
-# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
-torch.backends.cudnn.allow_tf32 = True
-# Set allocator for TMA descriptors (required for on-device TMA)
-def alloc_fn(size: int, alignment: int, stream=None):
-    return torch.empty(size, device="cuda", dtype=torch.int8)
-triton.set_allocator(alloc_fn)
-# os.environ['TRITON_PRINT_AUTOTUNING'] = '1'
-# os.environ['MLIR_ENABLE_DIAGNOSTICS'] = 'warnings,remarks'
-# Reference code in PyTorch
-class TriMul(nn.Module):
-    # Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-    ):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        self.left_proj = nn.Linear(dim, hidden_dim, bias=False)
-        self.right_proj = nn.Linear(dim, hidden_dim, bias=False)
-        self.left_gate = nn.Linear(dim, hidden_dim, bias=False)
-        self.right_gate = nn.Linear(dim, hidden_dim, bias=False)
-        self.out_gate = nn.Linear(dim, hidden_dim, bias=False)
-        self.to_out_norm = nn.LayerNorm(hidden_dim)
-        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
-    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
-        """
-        x: [bs, seq_len, seq_len, dim]
-        mask: [bs, seq_len, seq_len]
-        Returns:
-            output: [bs, seq_len, seq_len, dim]
-        """
-        batch_size, seq_len, _, dim = x.shape
-        x = self.norm(x)
-        left = self.left_proj(x)
-        right = self.right_proj(x)
-        mask = mask.unsqueeze(-1)
-        left = left * mask
-        right = right * mask
-        left_gate = self.left_gate(x).sigmoid()
-        right_gate = self.right_gate(x).sigmoid()
-        out_gate = self.out_gate(x).sigmoid()
-        left = left * left_gate
-        right = right * right_gate
-        out = einsum('... i k d, ... j k d -> ... i j d', left, right)
-        # This einsum is the same as the following:
-        # out = torch.zeros(batch_size, seq_len, seq_len, dim, device=x.device)
-        # # Compute using nested loops
-        # for b in range(batch_size):
-        #     for i in range(seq_len):
-        #         for j in range(seq_len):
-        #             # Compute each output element
-        #             for k in range(seq_len):
-        #                 out[b, i, j] += left[b, i, k, :] * right[b, j, k, :]
-        out = self.to_out_norm(out)
-        out = out * out_gate
-        return self.to_out(out)
-@triton.jit
-def triton_sigmoid(x):
-    """
-    Compute sigmoid function: 1 / (1 + exp(-x))
-    """
-    return 1.0 / (1.0 + tl.exp(-x))
-def two_mm_kernel_configs_wrapper():
-    if torch.cuda.get_device_capability() == (12, 0):
-        def two_mm_kernel_configs():
-            configs = []
-            for BLOCK_M in [16, 32]:
-                for BLOCK_N in [16, 32, 64]:
-                    for BLOCK_K in [16, 32, 64]:
-                        for num_stages in [2, 3]:
-                            configs.append(triton.Config({
-                                'BLOCK_M': BLOCK_M,
-                                'BLOCK_N': BLOCK_N,
-                                'BLOCK_K': BLOCK_K,
-                                'GROUP_SIZE_M': 8
-                            }, num_stages=num_stages, num_warps=8))
-            return configs
-    elif torch.cuda.get_device_capability()[0] == 9:
-        def get_optimal_two_mm_config_h100(B, seq_len, dim):
-            configs = {
-                (1, 128, 128): (128, 64, 128, 2, 8),
-                (1, 128, 256): (128, 64, 128, 2, 8),
-                (1, 128, 384): (128, 64, 64, 3, 8),
-                (1, 128, 512): (128, 64, 64, 3, 8),
-                (1, 128, 768): (128, 64, 64, 3, 8),
-                (1, 128, 1024): (128, 64, 64, 3, 8),
-                (1, 256, 128): (128, 64, 128, 2, 8),
-                (1, 256, 256): (128, 64, 128, 2, 8),
-                (1, 256, 384): (128, 64, 64, 3, 8),
-                (1, 256, 512): (128, 64, 64, 3, 8),
-                (1, 256, 768): (128, 64, 64, 3, 8),
-                (1, 256, 1024): (128, 64, 64, 3, 8),
-                (1, 512, 128): (128, 64, 128, 2, 8),
-                (1, 512, 256): (128, 64, 128, 2, 8),
-                (1, 512, 384): (128, 64, 128, 2, 8),
-                (1, 512, 512): (128, 64, 128, 2, 8),
-                (1, 512, 768): (128, 64, 64, 3, 8),
-                (1, 512, 1024): (128, 64, 64, 3, 8),
-                (1, 1024, 128): (128, 64, 128, 2, 8),
-                (1, 1024, 256): (128, 64, 64, 2, 8),
-                (1, 1024, 384): (128, 64, 128, 2, 8),
-                (1, 1024, 512): (128, 64, 128, 2, 8),
-                (1, 1024, 768): (128, 64, 128, 2, 8),
-                (1, 1024, 1024): (128, 64, 128, 2, 8),
-                (2, 128, 128): (128, 64, 128, 2, 8),
-                (2, 128, 256): (128, 64, 128, 2, 8),
-                (2, 128, 384): (128, 64, 64, 3, 8),
-                (2, 128, 512): (128, 64, 64, 3, 8),
-                (2, 128, 768): (128, 64, 64, 3, 8),
-                (2, 128, 1024): (128, 64, 64, 3, 8),
-                (2, 256, 128): (128, 64, 128, 2, 8),
-                (2, 256, 256): (128, 64, 128, 2, 8),
-                (2, 256, 384): (128, 64, 128, 2, 8),
-                (2, 256, 512): (128, 64, 128, 2, 8),
-                (2, 256, 768): (128, 64, 64, 3, 8),
-                (2, 256, 1024): (128, 64, 64, 3, 8),
-                (2, 512, 128): (128, 64, 128, 2, 8),
-                (2, 512, 256): (128, 64, 128, 2, 8),
-                (2, 512, 384): (128, 64, 128, 2, 8),
-                (2, 512, 512): (128, 64, 128, 2, 8),
-                (2, 512, 768): (128, 64, 128, 2, 8),
-                (2, 512, 1024): (128, 64, 128, 2, 8),
-                (2, 1024, 128): (128, 64, 128, 2, 8),
-                (2, 1024, 256): (128, 64, 128, 2, 8),
-                (2, 1024, 384): (128, 64, 128, 2, 8),
-                (2, 1024, 512): (128, 64, 128, 2, 8),
-                (2, 1024, 768): (128, 64, 128, 2, 8),
-                (2, 1024, 1024): (128, 64, 128, 2, 8),
-            }
-            return configs.get((B, seq_len, dim), (64, 64, 32, 2, 8))  # default fallback
-        def two_mm_kernel_configs():
-            # This function is kept for compatibility but will be overridden for H100
-            return [
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-            ]
-    elif torch.cuda.get_device_capability()[0] == 10 and False:
-        def get_optimal_two_mm_config(B, seq_len, dim):
-            configs = {
-                (1, 128, 128): (64, 128, 64, 2, 8),
-                (1, 128, 256): (128, 64, 128, 2, 8),
-                (1, 128, 384): (128, 64, 128, 2, 8),
-                (1, 128, 512): (128, 64, 128, 2, 8),
-                (1, 128, 768): (128, 64, 64, 3, 8),
-                (1, 128, 1024): (128, 64, 64, 3, 8),
-                (1, 256, 128): (128, 64, 128, 2, 8),
-                (1, 256, 256): (128, 64, 128, 2, 8),
-                (1, 256, 384): (128, 64, 128, 2, 8),
-                (1, 256, 512): (128, 64, 64, 3, 8),
-                (1, 256, 768): (128, 64, 64, 3, 8),
-                (1, 256, 1024): (128, 64, 64, 3, 8),
-                (1, 512, 128): (128, 64, 128, 2, 8),
-                (1, 512, 256): (128, 64, 128, 2, 8),
-                (1, 512, 384): (128, 64, 128, 2, 8),
-                (1, 512, 512): (128, 64, 128, 2, 8),
-                (1, 512, 768): (128, 64, 64, 3, 8),
-                (1, 512, 1024): (128, 64, 64, 3, 8),
-                (1, 1024, 128): (128, 64, 128, 2, 8),
-                (1, 1024, 256): (128, 64, 128, 2, 8),
-                (1, 1024, 384): (128, 64, 128, 2, 8),
-                (1, 1024, 512): (128, 64, 128, 2, 8),
-                (1, 1024, 768): (128, 64, 64, 3, 8),
-                (1, 1024, 1024): (128, 64, 64, 3, 8),
-                (2, 128, 128): (128, 64, 128, 2, 8),
-                (2, 128, 256): (128, 64, 128, 2, 8),
-                (2, 128, 384): (128, 64, 128, 2, 8),
-                (2, 128, 512): (128, 64, 64, 3, 8),
-                (2, 128, 768): (128, 64, 64, 3, 8),
-                (2, 128, 1024): (128, 64, 64, 3, 8),
-                (2, 256, 128): (128, 64, 128, 2, 8),
-                (2, 256, 256): (128, 64, 128, 2, 8),
-                (2, 256, 384): (128, 64, 128, 2, 8),
-                (2, 256, 512): (128, 64, 64, 3, 8),
-                (2, 256, 768): (128, 64, 64, 3, 8),
-                (2, 256, 1024): (128, 64, 64, 3, 8),
-                (2, 512, 128): (128, 64, 128, 2, 8),
-                (2, 512, 256): (128, 64, 128, 2, 8),
-                (2, 512, 384): (128, 64, 128, 2, 8),
-                (2, 512, 512): (128, 64, 128, 2, 8),
-                (2, 512, 768): (128, 64, 64, 3, 8),
-                (2, 512, 1024): (128, 64, 64, 3, 8),
-                (2, 1024, 128): (128, 64, 128, 2, 8),
-                (2, 1024, 256): (128, 64, 128, 2, 8),
-                (2, 1024, 384): (128, 64, 128, 2, 8),
-                (2, 1024, 512): (128, 64, 128, 2, 8),
-                (2, 1024, 768): (128, 64, 64, 3, 8),
-                (2, 1024, 1024): (128, 64, 64, 3, 8),
-            }
-            return configs.get((B, seq_len, dim), (64, 64, 32, 2, 8))  # default fallback
-        def two_mm_kernel_configs():
-            # This function is kept for compatibility but will be overridden
-            return [
-                triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-            ]
-    elif torch.cuda.get_device_capability()[0] == 8:
-        # A100
-        def two_mm_kernel_configs():
-            configs = []
-            for BLOCK_M in [64]:
-                for BLOCK_N in [64, 128]:
-                    for BLOCK_K in [16]:
-                        for num_stages in [3, 4]:
-                            for num_warps in [4, 8]:
-                                configs.append(triton.Config({
-                                    'BLOCK_M': BLOCK_M,
-                                    'BLOCK_N': BLOCK_N,
-                                    'BLOCK_K': BLOCK_K,
-                                    'GROUP_SIZE_M': 8
-                                }, num_stages=num_stages, num_warps=num_warps))
-            return configs
-    else:
-        def two_mm_kernel_configs():
-            configs = []
-            for BLOCK_M in [64, 128]:
-                for BLOCK_N in [64, 128]:
-                    for BLOCK_K in [64, 128]:
-                        for num_stages in [2, 3]:
-                            configs.append(triton.Config({
-                                'BLOCK_M': BLOCK_M,
-                                'BLOCK_N': BLOCK_N,
-                                'BLOCK_K': BLOCK_K,
-                                'GROUP_SIZE_M': 8
-                            }, num_stages=num_stages, num_warps=8))
-            return configs
-    return two_mm_kernel_configs
-def two_mm_kernel_wrapper():
-    if torch.cuda.get_device_capability()[0] == 8:
-        @triton.jit
-        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
-            # Persistent kernel using standard tl.load operations
-            start_pid = tl.program_id(axis=0)
-            num_pid_m = tl.cdiv(M, BLOCK_M)
-            num_pid_n = tl.cdiv(N, BLOCK_N)
-            k_tiles = tl.cdiv(K, BLOCK_K)
-            num_tiles = num_pid_m * num_pid_n
-            # tile_id_c is used in the epilogue to break the dependency between
-            # the prologue and the epilogue
-            tile_id_c = start_pid - NUM_SMS
-            num_pid_in_group = GROUP_SIZE_M * num_pid_n
-            # Persistent loop over tiles
-            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
-                # Calculate PID for this tile using improved swizzling
-                group_id = tile_id // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id % group_size_m)
-                pid_n = (tile_id % num_pid_in_group) // group_size_m
-                # Calculate block offsets
-                offs_am = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_bn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                offs_k = tl.arange(0, BLOCK_K)
-                # Initialize accumulators for all outputs
-                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                # Main computation loop over K dimension
-                for ki in range(k_tiles):
-                    k_start = ki * BLOCK_K
-                    k_offsets = k_start + offs_k
-                    # Create pointers for A matrix (2D flattened view)
-                    a_ptrs = a_ptr + offs_am[:, None] * stride_a2 + k_offsets[None, :] * stride_a3
-                    a_mask = (offs_am[:, None] < M) & (k_offsets[None, :] < K)
-                    # Create pointers for B matrices [N, K] layout
-                    b1_ptrs = b1_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b2_ptrs = b2_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b3_ptrs = b3_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b4_ptrs = b4_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b5_ptrs = b5_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b_mask = (offs_bn[:, None] < N) & (k_offsets[None, :] < K)
-                    # Load blocks from A and all weight matrices using standard tl.load
-                    a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-                    b1 = tl.load(b1_ptrs, mask=b_mask, other=0.0)
-                    b2 = tl.load(b2_ptrs, mask=b_mask, other=0.0)
-                    b3 = tl.load(b3_ptrs, mask=b_mask, other=0.0)
-                    b4 = tl.load(b4_ptrs, mask=b_mask, other=0.0)
-                    b5 = tl.load(b5_ptrs, mask=b_mask, other=0.0)
-                    # Perform matrix multiplications using TF32
-                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
-                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
-                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
-                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
-                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T
-                # Store results using separate tile_id_c for epilogue
-                tile_id_c += NUM_SMS
-                group_id = tile_id_c // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id_c % group_size_m)
-                pid_n = (tile_id_c % num_pid_in_group) // group_size_m
-                # Calculate output offsets and pointers
-                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                # Create masks for bounds checking
-                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-                # Calculate pointer addresses using 4D strides
-                stride_cm = stride_c2  # Stride to next element in flattened M dimension
-                stride_cn = stride_c3  # N is the innermost dimension
-                # For D tensor: use separate D strides
-                stride_dm = stride_d2  # Stride to next element in flattened M dimension
-                stride_dn = stride_d3  # N is the innermost dimension
-                off_c_batch = offs_cm // (seq_len * seq_len)
-                off_c_sl1 = (offs_cm // seq_len) % seq_len
-                off_c_sl2 = offs_cm % seq_len
-                off_c_dim = offs_cn
-                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
-                c_mask = d_mask
-                c1_ptrs = c1_ptr + c_offsets
-                c2_ptrs = c2_ptr + c_offsets
-                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]
-                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))
-                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
-                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
-                # Apply masking only to left_proj and right_proj results (C1, C2)
-                accumulator1 = tl.where(mask_2d, accumulator1, 0)
-                accumulator2 = tl.where(mask_2d, accumulator2, 0)
-                # Apply sigmoid to gate values
-                left_gate_sigmoid = triton_sigmoid(accumulator3)
-                right_gate_sigmoid = triton_sigmoid(accumulator4)
-                accumulator_d = triton_sigmoid(accumulator_d)
-                # Apply elementwise multiplication with gated values
-                # C1 = left * left_gate, C2 = right * right_gate
-                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
-                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate
-                # Convert to appropriate output dtype and store with normal tl.store
-                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
-                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
-                d = accumulator_d.to(d_ptr.dtype.element_ty)
-                tl.store(c1_ptrs, c1, mask=c_mask)
-                tl.store(c2_ptrs, c2, mask=c_mask)
-                tl.store(d_ptrs, d, mask=d_mask)
-    else:
-        @triton.jit
-        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
-            # Persistent kernel using on-device TMA descriptors
-            start_pid = tl.program_id(axis=0)
-            num_pid_m = tl.cdiv(M, BLOCK_M)
-            num_pid_n = tl.cdiv(N, BLOCK_N)
-            k_tiles = tl.cdiv(K, BLOCK_K)
-            num_tiles = num_pid_m * num_pid_n
-            # Create on-device TMA descriptors
-            a_desc = tl._experimental_make_tensor_descriptor(
-                a_ptr,
-                shape=[M, K],
-                strides=[stride_a2, stride_a3],
-                block_shape=[BLOCK_M, BLOCK_K],
-            )
-            b1_desc = tl._experimental_make_tensor_descriptor(
-                b1_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b2_desc = tl._experimental_make_tensor_descriptor(
-                b2_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b3_desc = tl._experimental_make_tensor_descriptor(
-                b3_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b4_desc = tl._experimental_make_tensor_descriptor(
-                b4_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b5_desc = tl._experimental_make_tensor_descriptor(
-                b5_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            # tile_id_c is used in the epilogue to break the dependency between
-            # the prologue and the epilogue
-            tile_id_c = start_pid - NUM_SMS
-            num_pid_in_group = GROUP_SIZE_M * num_pid_n
-            # Persistent loop over tiles
-            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
-                # Calculate PID for this tile using improved swizzling
-                group_id = tile_id // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id % group_size_m)
-                pid_n = (tile_id % num_pid_in_group) // group_size_m
-                # Calculate block offsets
-                offs_am = pid_m * BLOCK_M
-                offs_bn = pid_n * BLOCK_N
-                # Initialize accumulators for all outputs
-                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                # Main computation loop over K dimension
-                for ki in range(k_tiles):
-                    offs_k = ki * BLOCK_K
-                    # Load blocks from A and all weight matrices using on-device TMA
-                    a = a_desc.load([offs_am, offs_k])
-                    b1 = b1_desc.load([offs_bn, offs_k])
-                    b2 = b2_desc.load([offs_bn, offs_k])
-                    b3 = b3_desc.load([offs_bn, offs_k])
-                    b4 = b4_desc.load([offs_bn, offs_k])
-                    b5 = b5_desc.load([offs_bn, offs_k])
-                    # Perform matrix multiplications using TF32
-                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
-                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
-                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
-                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
-                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T
-                # Store results using separate tile_id_c for epilogue
-                tile_id_c += NUM_SMS
-                group_id = tile_id_c // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id_c % group_size_m)
-                pid_n = (tile_id_c % num_pid_in_group) // group_size_m
-                # Calculate output offsets and pointers
-                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                # Create masks for bounds checking
-                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-                # Calculate pointer addresses using 4D strides
-                # For C tensors: compute effective 2D strides from 4D strides
-                # Output tensor is [B, I, J, N], flattened to [M, N] where M = B*I*J
-                stride_cm = stride_c2  # Stride to next element in flattened M dimension
-                stride_cn = stride_c3  # N is the innermost dimension
-                # For D tensor: use separate D strides
-                stride_dm = stride_d2  # Stride to next element in flattened M dimension
-                stride_dn = stride_d3  # N is the innermost dimension
-                off_c_batch = offs_cm // (seq_len * seq_len)
-                off_c_sl1 = (offs_cm // seq_len) % seq_len
-                off_c_sl2 = offs_cm % seq_len
-                off_c_dim = offs_cn
-                # TODO update the mask_c so we don't IMA
-                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
-                # c_offsets = offs_cm[:, None] * stride_c2 + offs_cn[None, :] * stride_c3
-                c_mask = d_mask
-                c1_ptrs = c1_ptr + c_offsets
-                c2_ptrs = c2_ptr + c_offsets
-                # c1_ptrs = c1_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-                # c2_ptrs = c2_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]
-                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))
-                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
-                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
-                # Apply masking only to left_proj and right_proj results (C1, C2)
-                accumulator1 = tl.where(mask_2d, accumulator1, 0)
-                accumulator2 = tl.where(mask_2d, accumulator2, 0)
-                # Apply sigmoid to gate values
-                left_gate_sigmoid = triton_sigmoid(accumulator3)
-                right_gate_sigmoid = triton_sigmoid(accumulator4)
-                accumulator_d = triton_sigmoid(accumulator_d)
-                # Apply elementwise multiplication with gated values
-                # C1 = left * left_gate, C2 = right * right_gate
-                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
-                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate
-                # Convert to appropriate output dtype and store with normal tl.store
-                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
-                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
-                d = accumulator_d.to(d_ptr.dtype.element_ty)
-                tl.store(c1_ptrs, c1, mask=c_mask)
-                tl.store(c2_ptrs, c2, mask=c_mask)
-                tl.store(d_ptrs, d, mask=d_mask)
-    if torch.cuda.get_device_capability()[0] not in [9, 10.2]:
-        two_mm_kernel = triton.autotune(
-            (two_mm_kernel_configs_wrapper())(), key=["M", "N", "K"]
-        )(two_mm_kernel)
-    return two_mm_kernel
-def two_mm(A, left_proj, right_proj, left_gate, right_gate, out_gate, mask):
-    """
-    Launch the fused projection/gate matmul kernel for all five weight matrices.
-    The kernel is launched as a persistent grid capped at the number of SMs.
-    Args:
-        A: [B, seq_len, seq_len, K] tensor. The shape is unpacked as exactly four
-            dimensions and flattened with ``view(-1, K)``, so it must be 4D and
-            viewable as [M, K].
-        left_proj: [N, K] matrix (will be transposed)
-        right_proj: [N, K] matrix (will be transposed)
-        left_gate: [N, K] left gate weight matrix
-        right_gate: [N, K] right gate weight matrix
-        out_gate: [N, K] output gate weight matrix
-        mask: mask tensor, forwarded to the kernel unchanged
-    Returns:
-        (C1, C2, D): float16 tensors of logical shape [B, seq_len, seq_len, N].
-            C1 and C2 are permuted views of [B, N, seq_len, seq_len] buffers
-            (so their last dimension is NOT unit-stride); D is contiguous.
-            C1 = (A @ left_proj.T) * sigmoid(A @ left_gate.T) (masked)
-            C2 = (A @ right_proj.T) * sigmoid(A @ right_gate.T) (masked)
-            D = sigmoid(A @ out_gate.T) (unmasked)
-    Raises:
-        AssertionError: if the K dimensions, dtypes or weight strides disagree.
-    """
-    # Check constraints
-    assert A.shape[-1] == left_proj.shape[1] == right_proj.shape[1], "Incompatible K dimensions"
-    assert A.dtype == left_proj.dtype == right_proj.dtype, "Incompatible dtypes"
-    # A single (stride_k, stride_n) pair is passed for all five weights, so they must agree.
-    assert left_proj.stride() == right_proj.stride() == left_gate.stride() == right_gate.stride() == out_gate.stride(), \
-        "All weight matrices must have identical strides"
-    # Get dimensions
-    B, seq_len, _, _ = A.shape
-    K = A.shape[-1]
-    N = left_proj.shape[0]
-    # Flatten A to 2D for kernel processing
-    A_2d = A.view(-1, K)  # [M, K] where M is product of all leading dims
-    M = A_2d.shape[0]
-    # Get number of streaming multiprocessors
-    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
-    # C1/C2 are allocated channel-major and exposed channel-last through permute;
-    # their 4D strides are handed to the kernel so it writes into that layout directly.
-    C1 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
-    C2 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
-    D = torch.empty(A.shape[:-1] + (N,), device=A.device, dtype=torch.float16)
-    # Positional arguments shared by every launch path.
-    kernel_args = (
-        A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
-        C1, C2, D, mask,
-        M, N, K,
-        *A.stride(),  # 4D strides for A
-        left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] shape strides
-        *C1.stride(),  # 4D strides for C
-        seq_len,
-        *D.stride(),  # 4D strides for D
-    )
-    if torch.cuda.get_device_capability()[0] in (9, 10):
-        # H100 (major 9) and B200 (major 10) use a hand-picked configuration
-        # instead of autotuning; both paths were identical in the original code.
-        BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
-        grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))
-        two_mm_kernel_wrapper()[(grid_size,)](
-            *kernel_args,
-            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
-            num_stages=num_stages, num_warps=num_warps
-        )
-    else:
-        # Use autotuning for other GPUs; the grid depends on the tuned block sizes.
-        grid = lambda META: (min(NUM_SMS, triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"])),)
-        two_mm_kernel_wrapper()[grid](
-            *kernel_args,
-            NUM_SMS=NUM_SMS
-        )
-    return C1, C2, D
-def second_layernorm_mul(inp, hidden_dim, weight, bias, mul_operand):
-    """Layer-normalise ``inp`` over its trailing ``hidden_dim`` features, then multiply by ``mul_operand``.
-    ``weight`` and ``bias`` are cast to ``inp``'s dtype before use; eps is fixed at 1e-5.
-    """
-    gamma = weight.to(inp.dtype)
-    beta = bias.to(inp.dtype)
-    normalized = torch.nn.functional.layer_norm(
-        inp, (hidden_dim,), weight=gamma, bias=beta, eps=1e-5
-    )
-    return normalized * mul_operand
-'''
-@triton.autotune(
-    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=4, num_stages=3)],
-    key=["R", "C"]
-)
-'''
-@triton.jit
-def layernorm_kernel_first(
-    X,
-    Y,
-    Weight,
-    Bias,
-    R,
-    C,  # aka "dim"
-    eps,
-    ROW_BLOCK_SIZE: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
-):
-    # Row-wise LayerNorm with affine transform: Y[r, :] = norm(X[r, :]) * Weight + Bias.
-    # X and Y are addressed as dense row-major [R, C] buffers (row stride == C).
-    # Each program handles ROW_BLOCK_SIZE consecutive rows and one BLOCK_SIZE-wide
-    # column tile; columns >= C and rows >= R are masked out.
-    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
-    cols = tl.arange(0, BLOCK_SIZE)
-    mask_row = row < R
-    mask_col = cols < C
-    # Simple indexing for contiguous data
-    # Masked lanes read as 0.0 so they do not perturb the row sum below.
-    x = tl.load(
-        X + row[:, None] * C + cols[None, :],
-        mask=mask_row[:, None] & mask_col[None, :],
-        other=0.0
-    ).to(tl.float32)
-    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
-    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)
-    # Statistics are computed in float32 and divide by the true width C, not BLOCK_SIZE.
-    mean = tl.sum(x, axis=1) / C
-    # Zero the padded lanes again: there x - mean would be -mean and inflate the variance.
-    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
-    var = tl.sum(diff * diff, axis=1) / C
-    rstd = 1 / tl.sqrt(var + eps)
-    # Padded lanes of y_hat hold garbage (-mean * rstd); harmless because the store is masked.
-    y_hat = (x - mean[:, None]) * rstd[:, None]
-    y = y_hat * weight[None, :] + bias[None, :]
-    tl.store(
-        Y + row[:, None] * C + cols[None, :],
-        y,
-        mask=mask_row[:, None] & mask_col[None, :]
-    )
-def get_optimal_config_ln(dim):
-    """Pick the ``(ROW_BLOCK_SIZE, num_warps)`` launch pair for the first LayerNorm.
-    On compute-capability-major 9 devices the warp count scales with ``dim``
-    (<=256 -> 1, <=512 -> 2, <=1024 -> 4); every other case uses ``(16, 4)``.
-    """
-    fallback = (16, 4)
-    if torch.cuda.get_device_capability()[0] != 9:
-        return fallback
-    for upper_bound, warps in ((256, 1), (512, 2), (1024, 4)):
-        if dim <= upper_bound:
-            return (16, warps)
-    return fallback
-def triton_layernorm_first(x, weight, bias, eps=1e-5, num_warps=None, ROW_BLOCK_SIZE=None):
-    """
-    Apply LayerNorm over the last dimension of ``x`` with ``layernorm_kernel_first``.
-    Args:
-        x: contiguous [B, seq_len, seq_len, dim] tensor.
-        weight: per-feature scale, indexed by the kernel as a flat vector of ``dim`` entries.
-        bias: per-feature shift, same layout as ``weight``.
-        eps: value added to the variance before the square root.
-        num_warps: optional launch override; falls back to ``get_optimal_config_ln``.
-        ROW_BLOCK_SIZE: optional rows-per-program override; same fallback.
-    Returns:
-        float16 tensor with the shape of ``x``.
-    Raises:
-        AssertionError: if the two sequence dims differ, ``x`` is not contiguous,
-            or ``dim`` rounds up to more than 1024.
-    """
-    B, seq_len, seq_len2, dim = x.shape
-    assert(seq_len == seq_len2)
-    # The kernel addresses X and Y as dense row-major [R, C] buffers, so a strided
-    # input would be normalised along the wrong elements without any error.
-    assert x.is_contiguous(), "x must be contiguous"
-    R = B * seq_len * seq_len
-    C = dim
-    out = torch.empty_like(x, dtype=torch.float16)
-    if not num_warps or not ROW_BLOCK_SIZE:
-        # Only fill in what the caller left unset; an explicit override of one
-        # value must not be discarded because the other one is missing.
-        default_rows, default_warps = get_optimal_config_ln(dim)
-        if not ROW_BLOCK_SIZE:
-            ROW_BLOCK_SIZE = default_rows
-        if not num_warps:
-            num_warps = default_warps
-    # One program covers a whole row, so the column tile must span all of C.
-    BLOCK_SIZE = triton.next_power_of_2(C)
-    assert(BLOCK_SIZE <= 1024)
-    def grid(meta):
-        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_first[grid](
-        x, out, weight, bias,
-        R, C, eps,
-        ROW_BLOCK_SIZE=ROW_BLOCK_SIZE,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-        num_stages=3
-    )
-    return out
-'''
-def triton_layernorm_first(x, weight, bias, eps=1e-5):
-    B, seq_len, seq_len2, dim = x.shape
-    assert(seq_len == seq_len2)
-    R = B * seq_len * seq_len
-    C = dim
-    out = torch.empty_like(x)
-    BLOCK_SIZE = triton.next_power_of_2(C)
-    assert(BLOCK_SIZE <= 1024)
-    def grid(meta):
-        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_first[grid](
-        x, out, weight, bias,
-        R, C, eps,
-        BLOCK_SIZE=BLOCK_SIZE
-    )
-    return out
-'''
-# Single fixed configuration; with an empty key the autotuner never re-tunes per argument value.
-@triton.autotune(
-    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=1, num_stages=3)],
-    key=[]
-)
-@triton.jit
-def layernorm_kernel_eltwise(
-    X,
-    Y,
-    Weight,
-    Bias,
-    OutGate,
-    seq_len,
-    stride_batch,
-    stride_dim,
-    R,
-    C,  # aka "dim"
-    eps,
-    ROW_BLOCK_SIZE: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
-):
-    # Row-wise LayerNorm of X fused with an elementwise multiply by OutGate:
-    #   Y[r, :] = (norm(X[r, :]) * Weight + Bias) * OutGate[r, :]
-    # OutGate and Y are addressed as dense row-major [R, C] buffers. X is read through
-    # a strided layout: the flattened (seq, seq) position is unit-stride, channels are
-    # stride_dim apart and batches stride_batch apart.
-    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
-    cols = tl.arange(0, BLOCK_SIZE)
-    # Calculate base pointer for this batch of rows
-    # A row block must not straddle two batches: `batch` below is derived from the
-    # block's first row only and is shared by all ROW_BLOCK_SIZE rows.
-    tl.device_assert(seq_len*seq_len % ROW_BLOCK_SIZE == 0)
-    # batch_offset = (row // (stride_seq1 // stride_dim)) * stride_batch
-    batch = tl.program_id(0) * ROW_BLOCK_SIZE // (seq_len * seq_len)
-    # Flattened (i, j) position of each row inside its batch.
-    seqs_off = row % (seq_len * seq_len) # TODO is this going to prevent vectorization
-    off_r = batch * stride_batch + seqs_off
-    off_c = cols * stride_dim
-    mask_row = row < R
-    mask_col = cols < C
-    # No `other=` here: masked lanes are undefined, but they are never stored.
-    out_gate = tl.load(
-        OutGate + row[:, None] * C + cols[None, :],
-        mask = mask_row[:, None] & mask_col[None, :],
-    )
-    x = tl.load(
-        X + off_r[:, None] + off_c[None, :],
-        mask=mask_row[:, None] & mask_col[None, :],
-        other=0.0
-    ).to(tl.float32)
-    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
-    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)
-    # Statistics in float32 over the true width C (padded lanes were loaded as 0).
-    mean = tl.sum(x, axis=1) / C
-    # Re-mask so padded lanes (where x - mean == -mean) do not inflate the variance.
-    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
-    var = tl.sum(diff * diff, axis=1) / C
-    rstd = 1 / tl.sqrt(var + eps)
-    y_hat = (x - mean[:, None]) * rstd[:, None]
-    y = y_hat * weight[None, :] + bias[None, :]
-    # Gate is applied after the affine transform; the store is masked like the loads.
-    tl.store(
-        Y + row[:, None] * C + cols[None, :],
-        y * out_gate,
-        mask=mask_row[:, None] & mask_col[None, :]
-    )
-def triton_layernorm_eltwise(x, weight, bias, out_gate, eps=1e-5):
-    """
-    Run ``layernorm_kernel_eltwise``: LayerNorm over the last dim of ``x`` fused with a
-    multiply by ``out_gate``.
-    ``x`` is [B, seq_len, seq_len, dim] and must have a channel stride of
-    ``seq_len * seq_len``; ``out_gate`` must be contiguous. ``dim`` must round up to
-    exactly 128. Returns a float32 tensor shaped like ``out_gate``.
-    """
-    n_batch, n_i, n_j, channels = x.shape
-    assert(n_i == n_j)
-    total_rows = n_batch * n_i * n_i
-    assert(x.stride(3) == n_i*n_i)
-    assert(out_gate.is_contiguous())
-    result = torch.empty_like(out_gate, dtype=torch.float32)
-    padded_cols = triton.next_power_of_2(channels)
-    assert(padded_cols == 128)
-    # ROW_BLOCK_SIZE comes from the kernel's autotune config, hence the meta lookup.
-    launch_grid = lambda meta: (triton.cdiv(total_rows, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_eltwise[launch_grid](
-        x, result, weight, bias, out_gate,
-        n_i,
-        x.stride(0), x.stride(3),
-        total_rows, channels, eps,
-        BLOCK_SIZE=padded_cols
-    )
-    return result
-def kernel_global(data: input_t) -> output_t:
-    """
-    TriMul forward pass built from the Triton kernels in this file.
-    Pipeline: input LayerNorm -> fused projections/gates (``two_mm``) ->
-    batched matmul over the shared sequence axis -> fused output LayerNorm and
-    gating -> final linear projection.
-    Args:
-        data: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
-            - input: Input tensor of shape [batch_size, seq_len, seq_len, dim]
-            - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
-            - weights: Dictionary containing model weights
-            - config: Dictionary containing model configuration parameters
-    Returns:
-        Result of applying ``to_out.weight`` to the gated, normalised tensor
-        (presumably [batch_size, seq_len, seq_len, dim] -- confirm against the task definition).
-    """
-    input_tensor, mask, weights, config = data
-    # The fused projection kernel is fed float16 copies of the five projection weights.
-    left_proj_weight = weights["left_proj.weight"].to(torch.float16)
-    right_proj_weight = weights["right_proj.weight"].to(torch.float16)
-    left_gate_weight = weights["left_gate.weight"].to(torch.float16)
-    right_gate_weight = weights["right_gate.weight"].to(torch.float16)
-    out_gate_weight = weights["out_gate.weight"].to(torch.float16)
-    hidden_dim = config["hidden_dim"]
-    batch_size, seq_len, _, _ = input_tensor.shape
-    x = triton_layernorm_first(input_tensor, weights['norm.weight'], weights['norm.bias'])
-    # Masking, sigmoid gating and the elementwise products happen inside two_mm.
-    left, right, out_gate = two_mm(x, left_proj_weight, right_proj_weight, left_gate_weight, right_gate_weight, out_gate_weight, mask)
-    # Equivalent to einsum('... i k d, ... j k d -> ... i j d', left, right):
-    # move the hidden dim next to the batch dim and contract over k with one bmm.
-    out = torch.bmm(left.permute(0, 3, 1, 2).view(-1, left.shape[1], left.shape[2]), right.permute(0, 3, 2, 1).view(-1, right.shape[2], right.shape[1]))
-    # Back to channel-last as a strided view; the channel stride becomes seq_len * seq_len,
-    # which is the layout triton_layernorm_eltwise asserts on.
-    out = out.view(batch_size, hidden_dim, seq_len, seq_len).permute(0, 2, 3, 1)
-    out = triton_layernorm_eltwise(out, weights['to_out_norm.weight'], weights['to_out_norm.bias'], out_gate)
-    return torch.nn.functional.linear(out, weights['to_out.weight'])

build/torch-cuda/trimul_gpumode/__init__.py DELETED Viewed

@@ -1,26 +0,0 @@
-import ctypes
-import sys
-import importlib
-from pathlib import Path
-from types import ModuleType
-def _import_from_path(file_path: Path) -> ModuleType:
-    """Load the Python file at ``file_path`` as a module and register it in ``sys.modules``.
-    Args:
-        file_path: Path of the source file to execute.
-    Returns:
-        The executed module object.
-    Raises:
-        ImportError: if no import spec or loader can be created for the file.
-        Exception: whatever the module body raises while executing; the
-            partially initialised module is removed from ``sys.modules`` first.
-    """
-    # `import importlib` alone does not guarantee that the `util` submodule is
-    # loaded, so import it explicitly where it is used.
-    import importlib.util
-    # We cannot use the module name as-is, after adding it to `sys.modules`,
-    # it would also be used for other imports. So, we make a module name that
-    # depends on the path for it to be unique using the hex-encoded hash of
-    # the path.
-    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-    module_name = path_hash
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None or spec.loader is None:
-        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-    module = importlib.util.module_from_spec(spec)
-    if module is None:
-        raise ImportError(f"Cannot load module {module_name} from spec")
-    # Register before executing so imports made while the module runs can resolve it.
-    sys.modules[module_name] = module
-    try:
-        spec.loader.exec_module(module)
-    except BaseException:
-        # Do not leave a half-initialised module behind for later lookups.
-        sys.modules.pop(module_name, None)
-        raise
-    return module
-globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch-cuda/trimul_mi300.py DELETED Viewed

@@ -1,524 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-# In every config BLOCK_SIZE_N == 4 * H_CHUNK_SIZE; the kernel relies on this when it
-# reshapes the accumulator into (BLOCK_SIZE_M, H_CHUNK_SIZE, 4).
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16},  num_warps=4, num_stages=2),
-        # Configurations with larger block sizes for better data reuse
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
-        # Configurations with deeper K dimension
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
-        # More extreme configurations to test the limits
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=2),
-        # Configurations with fewer warps
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
-    ],
-    key=['M', 'N', 'K'],
-)
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (from decorator and kwargs)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    # Fused kernel: LayerNorm of X rows, then two matmuls on the normalised tile:
-    #   * X_norm @ W_4way (4*H columns) -> gated, masked left/right outputs
-    #   * X_norm @ W_og   (H columns)   -> sigmoid out-gate
-    # NOTE(review): the autotune key lists 'N', which is not a parameter of this
-    # kernel -- confirm how the installed Triton version treats unknown key names.
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    # Grouped ordering: consecutive pids walk GROUP_SIZE_M row tiles per column tile.
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm calculation (done only ONCE) ---
-    # "Once" per program: the statistics are shared by both matmuls below.
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    # First pass over K: row sums -> mean.
-    # NOTE(review): both statistics passes index K with an implicit unit stride
-    # (stride_x_k is not applied) and do not cast the loaded chunk to float32,
-    # unlike the matmul loop further down -- confirm X is always K-contiguous.
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    # Second pass over K: sum of squared deviations -> variance.
-    # NOTE(review): lanes with k >= K load 0.0 and contribute mean**2 here whenever
-    # K is not a multiple of BLOCK_SIZE_K (x_centered is not re-masked).
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        x_centered = x_chunk - mean[:, None]
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul Loop 1: For the 4-Way Projections ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K;
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        # Normalise in float32, then drop to float16 for the tensor-core dot.
-        # Out-of-range K lanes end up as 0 because nw and nb load as 0 there.
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Only programs whose column tile starts below H also compute the out_gate
-        # matmul, reusing the normalised x tile already in registers.
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        # Out-gate is stored as a 2D [M, H]-style buffer addressed by (stride_og_m, stride_og_h).
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    # The 4*H columns are treated as interleaved groups of four per hidden channel:
-    # [left_proj, left_gate, right_proj, right_gate]. Each role is extracted by
-    # masking the innermost axis and summing it away.
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    # Column tile pid_n of the 4*H problem maps to hidden channels
-    # [pid_n * H_CHUNK_SIZE, (pid_n + 1) * H_CHUNK_SIZE).
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    # Decompose the flat row index m into (batch, s1 index, s2 index) to scatter
-    # results into the 4D output layouts.
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1) # s2 offset uses s2 stride, s1 offset uses s1 stride
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-    ],
-    key=['s1', 's2', 'H'],
-)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Batched matmul, one (batch, channel) pair per grid-axis-1 program:
-    #   Out[b, h, m, n] = sum_k Left[b, h, m, k] * Right[b, h, k, n]
-    # with m, n in [0, s1) and the reduction index k in [0, s2). All three tensors
-    # are addressed purely through the strides passed in.
-    # Grid and program IDs
-    pid = tl.program_id(axis=0)
-    # Both output dimensions have extent s1, so both tile counts derive from s1.
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    # Grouped ordering: consecutive pids walk GROUP_SIZE_M row tiles per column tile.
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Grid axis 1 enumerates flattened (batch, channel) pairs.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    # Accumulate in float32 regardless of the input element type.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        # Out-of-range rows, columns and k lanes load 0.0 and add nothing to the dot.
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    # --- Coalesced Write ---
-    # Write to a standard (bs, H, s1, s1) layout
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-    ],
-    key=['H', 'D'],
-)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1, # M_gate = bs*s1*s2
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Fused tail of the pipeline, per flat row m = (b, r, c):
-    #   Out[m, :] = ((LayerNorm_H(In[m, :]) * NormW + NormB) * Gate[m, :]) @ ProjW
-    # In is read through 4D strides (channel stride stride_in_h), Gate as a 2D
-    # (m, h) buffer, and the result is written through 4D output strides.
-    # --- Grid and PID Setup for Matmul ---
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    # Grouped ordering: consecutive pids walk GROUP_SIZE_M row tiles per column tile.
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    # Decompose M back to (b, r, c) for reordering lookups
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    # --- Statistics pass: one sweep over H collecting sum and sum of squares ---
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        # Accumulate sum and sum of squares in one pass
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    # Finalize statistics
-    # Variance via E[x^2] - E[x]^2; masked lanes loaded 0.0 so they affect neither sum.
-    mean = sum_x / H
-    var = (sum_x2 / H) - (mean * mean)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # --- Pass 3: Fused Gating and Matmul ---
-    # (Second sweep over H in this kernel: re-reads In, normalises, gates, and feeds the dot.)
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        # No explicit float32 cast here; the subtraction with the float32 mean promotes.
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        proj_ptrs = ProjW_ptr + \
-                    offs_n[None, :] * stride_proj_d + \
-                    offs_k[:, None] * stride_proj_h
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        # Gate loads 0.0 on masked lanes, which also zeroes padded k lanes of a_gated.
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        # Cast the activation to the projection weight's dtype so tl.dot sees matching operands.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    # --- Store Final Output ---
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor, # Use the new weight matrices
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    bs, s1, s2, d = x.shape
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    x_flat = x.view(M, K)
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # The grid is launched for the larger 4*H problem
-    N_4way = 4 * H
-    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid](
-        # Pointers (9)
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        # Metadata (5) - M, H, K, s1, s2
-        M, H, K, s1, s2,
-        # Strides (16)
-        x_flat.stride(0), x_flat.stride(1),
-        W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1),
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        # Constexpr (1)
-        LN_EPS=1e-5
-    )
-    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
-    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
-    bmm_coalesced_kernel[grid_bmm](
-        left_final, right_final_t, bmm_out_tmp,
-        bs, s1, s2, H,
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-    )
-    # --- Kernel 3: Fully Fused Final Stage ---
-    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
-    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
-    fused_final_kernel[grid_final](
-        # Pointers
-        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
-        # Metadata
-        M, H, d, s1,
-        # Strides
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        to_out_weight.stride(0), to_out_weight.stride(1), # Use strides of the corrected tensor
-        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
-        # Constants
-        LN_EPS=1e-5,
-    )
-    return final_out
-def pack_w_4way_efficient(weights):
-    """ Packs L, LG, R, RG into a tight [K, 4*H] matrix. """
-    WL = weights['left_proj.weight']
-    WLG = weights['left_gate.weight']
-    WR = weights['right_proj.weight']
-    WRG = weights['right_gate.weight']
-    H, K = WL.shape
-    ws = torch.stack([WL, WLG, WR, WRG], dim=0).permute(1, 0, 2)
-    ws = ws.contiguous().view(4 * H, K)
-    return ws.t().to(torch.float16)
-def get_w_og(weights):
-    """ Gets the transposed [K, H] out_gate weight matrix. """
-    WOG = weights['out_gate.weight']
-    return WOG.t().to(torch.float16)
-def compiledtrimul(
-    x: torch.Tensor,
-    mask: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    w_concat: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int
-) -> torch.Tensor:
-    """
-    A barebones, compiled PyTorch function for the TriMul logic.
-    """
-    bs, s1, s2, d = x.shape
-    # Initial LayerNorm
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
-    # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
-    all_projections = torch.mm(x_norm, w_concat)
-    # Split back into individual projections
-    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
-    # Apply mask and gates
-    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_expanded * torch.sigmoid(lg)
-    right = right * mask_expanded * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    # Reshape for einsum
-    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
-    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
-    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
-    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    # Apply layer norm and final gating
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * out_gate
-    # Final projection
-    final_out_flat = gated @ to_out_weight.t()
-    final_out = final_out_flat.view(bs, s1, s2, d)
-    return final_out
-def small_kernel_pt_path(data):
-    input_tensor, mask, weights, config = data
-    w_concat = torch.cat([
-        weights['left_proj.weight'],
-        weights['right_proj.weight'],
-        weights['left_gate.weight'],
-        weights['right_gate.weight'],
-        weights['out_gate.weight']
-    ], dim=0).t().contiguous().to(torch.float16)
-    # Call the compiled function with prepared weights
-    output = compiledtrimul(
-        x=input_tensor.to(torch.float32),
-        mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"]
-    )
-    return output
-def kernel_mi300(data):
-    input_tensor, mask, weights, config = data
-    bs, s1, s2, d = input_tensor.shape
-    if s1 < 100:
-        return small_kernel_pt_path(data)
-    H = config["hidden_dim"]
-    W_4way = pack_w_4way_efficient(weights)
-    W_og = get_w_og(weights)
-    M = bs * s1 * s2
-    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, H).reshape(M, H).to(torch.float16) #move into kernel possibly
-    return compiledtrimul_fused_interleaved(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=W_4way, # Pass the new 4-way matrix
-        W_og=W_og,     # Pass the new out_gate matrix
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=H,
-    )

build/torch-cuda/triton_a100.py DELETED Viewed

@@ -1,405 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-# Set PyTorch flags for performance.
-# NOTE: these assignments run at import time and change global backend state:
-# they allow TF32 for float32 CUDA matmuls and reduced-precision accumulation
-# for fp16 matmuls, trading some numerical accuracy for speed.
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (16)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (now passed as arguments from the host)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm calculation (done only ONCE) ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        x_centered = x_chunk - mean[:, None]
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul Loop 1: For the 4-Way Projections ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K;
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Batched matmul: Out[b, h] = Left[b, h] @ Right[b, h] for every (b, h) pair.
-
-    Left[b, h] is indexed as [s1, s2] and Right[b, h] as [s2, s1], giving an
-    [s1, s1] output slice. Grid axis 0 enumerates the output tiles of one
-    slice; grid axis 1 enumerates the flattened (batch, hidden) index
-    ``b * H + h``. ``bs`` is accepted but not read inside the kernel.
-    """
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    # Grouped ordering: GROUP_SIZE_M row tiles are walked per column sweep.
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Second grid axis selects the (batch, hidden) slice.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    # Reduce over the shared s2 dimension in BLOCK_SIZE_K steps.
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        # Out-of-range rows/columns load 0.0 so they do not affect the dot product.
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1,
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    mean = sum_x / H
-    var = (sum_x2 / H) - (mean * mean)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved_final(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor,
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    bs, s1, s2, d = x.shape
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    x_flat = x.view(M, K)
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # --- Kernel 1: Fused LN + Dual Matmul ---
-    N_4way = 4 * H
-    # Hardcoded A100 best config: M128-N128-K32-GM8-HC32-W8-S2
-    config_k1 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
-    grid_k1 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid_k1](
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        M, H, K, s1, s2,
-        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
-        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
-        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        LN_EPS=1e-5, **config_k1, num_warps=8, num_stages=2
-    )
-    # --- Kernel 2: Batched Matrix Multiplication ---
-    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
-    # Hardcoded A100 best config: M128-N64-K32-GM8-W4-S3
-    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    grid_k2 = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
-    bmm_coalesced_kernel[grid_k2](
-        left_final, right_final_t, bmm_out_tmp,
-        bs, s1, s2, H,
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        **config_k2, num_warps=4, num_stages=3
-    )
-    # --- Kernel 3: Fully Fused Final Stage ---
-    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
-    # Hardcoded A100 best config: M32-N128-K32-GM8-W4-S3
-    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    grid_k3 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
-    fused_final_kernel[grid_k3](
-        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
-        M, H, d, s1,
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
-        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
-        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
-    )
-    return final_out
-def pack_w_4way_efficient(weights):
-    """ Packs L, LG, R, RG into a tight [K, 4*H] matrix. """
-    WL, WLG, WR, WRG = (weights[k] for k in ['left_proj.weight', 'left_gate.weight', 'right_proj.weight', 'right_gate.weight'])
-    H, K = WL.shape
-    ws = torch.stack([WL, WLG, WR, WRG], dim=0).permute(1, 0, 2).contiguous().view(4 * H, K)
-    return ws.t().to(torch.float16)
-def get_w_og(weights):
-    """ Gets the transposed [K, H] out_gate weight matrix. """
-    return weights['out_gate.weight'].t().to(torch.float16)
-@torch.compile()
-def compiledtrimul(
-    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
-    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor, h: int
-) -> torch.Tensor:
-    bs, s1, s2, d = x.shape
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
-    all_projections = torch.mm(x_norm, w_concat)
-    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
-    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_expanded * torch.sigmoid(lg)
-    right = right * mask_expanded * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
-    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
-    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
-    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * out_gate
-    final_out_flat = gated @ to_out_weight.t()
-    return final_out_flat.view(bs, s1, s1, d)
-def small_kernel_pt_path(data):
-    input_tensor, mask, weights, config = data
-    w_concat = torch.cat([
-        weights['left_proj.weight'], weights['right_proj.weight'], weights['left_gate.weight'],
-        weights['right_gate.weight'], weights['out_gate.weight']
-    ], dim=0).t().contiguous().to(torch.float16)
-    return compiledtrimul(
-        x=input_tensor.to(torch.float32), mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32), w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"]
-    )
-def kernel_a100(data):
-    input_tensor, mask, weights, config = data
-    bs, s1, s2, d = input_tensor.shape
-    if s1 < 512: # Adjusted threshold based on observed BMM configs
-        return small_kernel_pt_path(data)
-    H = config["hidden_dim"]
-    W_4way = pack_w_4way_efficient(weights)
-    W_og = get_w_og(weights)
-    M = bs * s1 * s2
-    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, H).reshape(M, H).to(torch.float16)
-    return compiledtrimul_fused_interleaved_final(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=W_4way,
-        W_og=W_og,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=H,
-    )

build/torch-cuda/triton_b200.py DELETED Viewed

@@ -1,411 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-# Module-level side effect: let PyTorch use TF32 for float32 matmuls and
-# reduced-precision reductions in fp16 GEMMs (global backend flags).
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (now passed as arguments from the host)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    """LayerNorm rows of X, project them, and store gated left/right and out-gate tiles.
-
-    Each program owns one (BLOCK_SIZE_M x BLOCK_SIZE_N) tile of the
-    ``[M, 4*H]`` product ``LN(X) @ W_4way``.  The tile's columns are read in
-    groups of four per hidden index: left_proj, left_gate, right_proj,
-    right_gate.  The tile is reshaped to (BLOCK_SIZE_M, H_CHUNK_SIZE, 4), so
-    BLOCK_SIZE_N must equal 4 * H_CHUNK_SIZE.
-
-    Stores:
-      * OutLeft  = left_proj  * sigmoid(left_gate)  * mask, addressed by (b, h, s1, s2)
-      * OutRight = right_proj * sigmoid(right_gate) * mask, addressed with the
-        s2/s1 strides passed for the transposed buffer
-      * OutOG    = sigmoid(LN(X) @ W_og), addressed by (m, h); only programs
-        whose column tile starts below H compute and store it.
-    """
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- LayerNorm statistics, shared by both matmuls of this program ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_idx = k_offset + offs_k
-        # Honour stride_x_k here too, exactly as the matmul loop below does.
-        x_ptrs = x_rows_base_ptr + k_idx[None, :] * stride_x_k
-        ld_mask = m_mask[:, None] & (k_idx[None, :] < K)
-        x_chunk = tl.load(x_ptrs, mask=ld_mask, other=0.0).to(tl.float32)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_idx = k_offset + offs_k
-        x_ptrs = x_rows_base_ptr + k_idx[None, :] * stride_x_k
-        ld_mask = m_mask[:, None] & (k_idx[None, :] < K)
-        x_chunk = tl.load(x_ptrs, mask=ld_mask, other=0.0).to(tl.float32)
-        # Padded columns were loaded as 0; without this mask they would each
-        # add mean**2 to the variance whenever K % BLOCK_SIZE_K != 0.
-        x_centered = tl.where(ld_mask, x_chunk - mean[:, None], 0.0)
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul loop: 4-way projections, plus out-gate for the first H columns ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        # Out-of-range K lanes get weight 0 and bias 0, so they normalise to 0.
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Programs whose column tile starts below H also accumulate out_gate,
-        # reusing the normalised X tile.
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H)
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    # Split the interleaved columns into their four roles (last axis of size 4).
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    # Decode the flat row index m into (batch, s1, s2) coordinates.
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Per-(batch, head) matmul: Out[b, h] = Left[b, h] (s1 x s2) @ Right[b, h] (s2 x s1).
-
-    Grid axis 0 walks the (s1 x s1) output tiles in groups of GROUP_SIZE_M
-    row-tiles; grid axis 1 enumerates ``batch * H + head``.
-    """
-    # Grouped tile ordering over the output matrix
-    tile_id = tl.program_id(axis=0)
-    tiles_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    tiles_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    tiles_per_group = GROUP_SIZE_M * tiles_n
-    group_start_m = (tile_id // tiles_per_group) * GROUP_SIZE_M
-    rows_in_group = min(tiles_m - group_start_m, GROUP_SIZE_M)
-    tile_m = group_start_m + (tile_id % rows_in_group)
-    tile_n = (tile_id % tiles_per_group) // rows_in_group
-    # Second grid axis packs (batch, head)
-    batch_head = tl.program_id(axis=1)
-    batch_idx = batch_head // H
-    head_idx = batch_head % H
-    rows = tile_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    cols = tile_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    k_lane = tl.arange(0, BLOCK_SIZE_K)
-    row_ok = rows[:, None] < s1
-    col_ok = cols[None, :] < s1
-    left_base = Left_ptr + batch_idx * stride_l_bs + head_idx * stride_l_h
-    right_base = Right_ptr + batch_idx * stride_r_bs + head_idx * stride_r_h
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for kb in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_idx = kb * BLOCK_SIZE_K + k_lane
-        lhs_ptrs = left_base + (rows[:, None] * stride_l_s1 + k_idx[None, :] * stride_l_s2)
-        rhs_ptrs = right_base + (k_idx[:, None] * stride_r_s2 + cols[None, :] * stride_r_s1)
-        # Out-of-range lanes load 0 so they do not contribute to the dot product.
-        lhs = tl.load(lhs_ptrs, mask=row_ok & (k_idx[None, :] < s2), other=0.0)
-        rhs = tl.load(rhs_ptrs, mask=(k_idx[:, None] < s2) & col_ok, other=0.0)
-        acc += tl.dot(lhs, rhs)
-    out_ptrs = Out_ptr + batch_idx * stride_o_bs + head_idx * stride_o_h + \
-               rows[:, None] * stride_o_s1 + cols[None, :] * stride_o_s2
-    tl.store(out_ptrs, acc, mask=row_ok & col_ok)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1,
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """LayerNorm over H, multiply by the gate, and project to D in one pass.
-
-    A flat row index m is decoded as (b, r, c) with ``s1 * s1`` rows per
-    batch.  The input is read through its (bs, h, row, col) strides, the
-    gate through (m, h) strides, and the projection weight through
-    (d, h) strides.  Each program writes one (BLOCK_SIZE_M x BLOCK_SIZE_N)
-    tile of the output, addressed by (b, r, c, d).
-    """
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    # Decompose the flat row index into (batch, row, col).
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    # Pass 1: accumulate sum and sum of squares over H in fp32.
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    mean = sum_x / H
-    # E[x^2] - mean^2 can round slightly below zero; clamp it so rsqrt never
-    # receives a negative argument (which would produce NaN).
-    var = tl.maximum((sum_x2 / H) - (mean * mean), 0.0)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # Pass 2: normalise, gate, and accumulate the projection.
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        # Masked H lanes load weight 0 and bias 0, so they contribute 0.
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        # Cast the activations to the weight dtype so tl.dot sees matching operands.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved_final(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor,
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    """Launch the three Triton stages and return the ``(bs, s1, s1, d)`` fp16 output.
-
-    Stage 1 (``fused_ln_dual_matmul_kernel``) fills ``left_final``,
-    ``right_final_t`` and ``og_mh``; stage 2 (``bmm_coalesced_kernel``)
-    fills the ``(bs, H, s1, s1)`` intermediate; stage 3
-    (``fused_final_kernel``) writes the result.  Block sizes, warp counts
-    and stage counts are hard-coded.
-    """
-    bs, s1, s2, d = x.shape
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    fp16 = torch.float16
-    def strides(t, ndim):
-        # First ``ndim`` strides of ``t``, in dimension order.
-        return tuple(t.stride(axis) for axis in range(ndim))
-    x_flat = x.view(M, K)
-    left_final = torch.empty((bs, H, s1, s2), device=x.device, dtype=fp16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=fp16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=fp16)
-    # --- Kernel 1: Fused LN + Dual Matmul ---
-    # The grid covers the wider 4*H projection.
-    N_4way = 4 * H
-    # Hardcoded best config from logs: M64-N128-K64-GM8-HC32-W4-S2
-    config_k1 = {'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
-    def grid(meta):
-        return (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid](
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        M, H, K, s1, s2,
-        *strides(x_flat, 2), *strides(W_4way, 2), *strides(W_og, 2),
-        *strides(left_final, 4), *strides(right_final_t, 4),
-        *strides(og_mh, 2), *strides(mask_mh, 2),
-        LN_EPS=1e-5, num_warps=4, num_stages=2, **config_k1,
-    )
-    # --- Kernel 2: Batched Matrix Multiplication ---
-    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=fp16)
-    # Hardcoded best config from logs: M128-N128-K32-GM8-W8-S3
-    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    def grid_bmm(meta):
-        return (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
-    bmm_coalesced_kernel[grid_bmm](
-        left_final, right_final_t, bmm_out_tmp,
-        bs, s1, s2, H,
-        *strides(left_final, 4), *strides(right_final_t, 4), *strides(bmm_out_tmp, 4),
-        num_warps=8, num_stages=3, **config_k2,
-    )
-    # --- Kernel 3: Fully Fused Final Stage ---
-    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=fp16)
-    # Hardcoded best config from logs: M32-N128-K32-GM8-W4-S3
-    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    def grid_final(meta):
-        return (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
-    fused_final_kernel[grid_final](
-        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
-        M, H, d, s1,
-        *strides(bmm_out_tmp, 4), *strides(og_mh, 2), *strides(to_out_weight, 2),
-        *strides(final_out, 4),
-        LN_EPS=1e-5, num_warps=4, num_stages=3, **config_k3,
-    )
-    return final_out
-def pack_w_4way_efficient(weights):
-    """Interleave L, LG, R, RG into one fp16 ``[K, 4*H]`` matrix.
-
-    Column ``4*h + r`` of the result holds row ``h`` of the r-th weight in
-    the order left_proj, left_gate, right_proj, right_gate.
-    """
-    WL = weights['left_proj.weight']
-    WLG = weights['left_gate.weight']
-    WR = weights['right_proj.weight']
-    WRG = weights['right_gate.weight']
-    H, K = WL.shape
-    # Stacking on dim 1 yields (H, 4, K) directly; flattening the first two
-    # dims interleaves the four roles per hidden index.
-    interleaved = torch.stack([WL, WLG, WR, WRG], dim=1).view(4 * H, K)
-    return interleaved.t().to(torch.float16)
-def get_w_og(weights):
-    """Return the out_gate weight transposed to ``[K, H]`` and cast to fp16."""
-    out_gate_weight = weights['out_gate.weight']
-    transposed = out_gate_weight.t()
-    return transposed.to(torch.float16)
-@torch.compile()
-def compiledtrimul(
-    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
-    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor, h: int
-) -> torch.Tensor:
-    """PyTorch implementation of the whole pipeline, compiled with torch.compile.
-
-    Args:
-        x: input of shape ``(bs, s1, s2, d)``.
-        mask: tensor expanded with ``expand(-1, -1, -1, h)`` and flattened to ``(-1, h)``.
-        norm_weight, norm_bias: affine parameters of the LayerNorm over ``d``.
-        w_concat: ``(d, 5*h)`` matrix whose column chunks are read in the
-            order left, right, left gate, right gate, out gate.
-        to_out_norm_weight, to_out_norm_bias: affine parameters of the LayerNorm over ``h``.
-        to_out_weight: final projection, applied as ``gated @ to_out_weight.t()``.
-        h: hidden size.
-
-    Returns:
-        The projected result viewed as ``(bs, s1, s1, d)``.
-    """
-    bs, s1, s2, d = x.shape
-    n_rows = bs * s1 * s2
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias)
-    x_norm = x_norm.view((n_rows, d)).to(torch.float16)
-    # A single matmul yields all five projections; split them column-wise.
-    left, right, lg, rg, og = torch.mm(x_norm, w_concat).chunk(5, dim=1)
-    mask_flat = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left_gated = left * mask_flat * torch.sigmoid(lg)
-    right_gated = right * mask_flat * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    # Bring the hidden dim next to batch: (bs, h, s1, s2).
-    left_bh = left_gated.view(bs, s1, s2, h).permute(0, 3, 1, 2).to(torch.float16)
-    right_bh = right_gated.view(bs, s1, s2, h).permute(0, 3, 1, 2).to(torch.float16)
-    out_p = torch.matmul(left_bh, right_bh.transpose(-1, -2))
-    out_einsum_flat = out_p.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias)
-    gated = normed.to(torch.float16) * out_gate
-    final_out_flat = torch.matmul(gated, to_out_weight.t())
-    return final_out_flat.view(bs, s1, s1, d)
-def small_kernel_pt_path(data):
-    """Prepare the weights and call ``compiledtrimul`` for one ``data`` tuple.
-
-    ``data`` unpacks to ``(input_tensor, mask, weights, config)``.  The
-    concatenation order (left, right, left gate, right gate, out gate)
-    matches the 5-way ``chunk`` inside ``compiledtrimul``.
-    """
-    input_tensor, mask, weights, config = data
-    half, single = torch.float16, torch.float32
-    projection_weights = [
-        weights['left_proj.weight'],
-        weights['right_proj.weight'],
-        weights['left_gate.weight'],
-        weights['right_gate.weight'],
-        weights['out_gate.weight'],
-    ]
-    # (5*h, d) -> contiguous (d, 5*h) fp16 so one matmul covers all projections.
-    w_concat = torch.cat(projection_weights, dim=0).t().contiguous().to(half)
-    return compiledtrimul(
-        x=input_tensor.to(single),
-        mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(single),
-        norm_bias=weights['norm.bias'].to(single),
-        w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(half),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(half),
-        to_out_weight=weights['to_out.weight'].to(half),
-        h=config["hidden_dim"],
-    )
-def kernel_b200(data):
-    """Entry point of the B200 variant.
-
-    ``data`` unpacks to ``(input_tensor, mask, weights, config)`` where
-    ``input_tensor`` has shape ``(bs, s1, s2, d)``.  Inputs with ``s1 < 800``
-    go through ``small_kernel_pt_path``; everything else runs the fused
-    Triton pipeline.
-    """
-    input_tensor, mask, weights, config = data
-    bs, s1, s2, d = input_tensor.shape
-    triton_threshold = 800
-    if s1 < triton_threshold:
-        return small_kernel_pt_path(data)
-    H = config["hidden_dim"]
-    fp16, fp32 = torch.float16, torch.float32
-    n_rows = bs * s1 * s2
-    # Broadcast the mask over the hidden dimension, then flatten to (rows, H).
-    mask_mh = mask[..., None].expand(-1, -1, -1, H).reshape(n_rows, H).to(fp16)
-    fused_kwargs = dict(
-        x=input_tensor.to(fp32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(fp32),
-        norm_bias=weights['norm.bias'].to(fp32),
-        W_4way=pack_w_4way_efficient(weights),
-        W_og=get_w_og(weights),
-        to_out_norm_weight=weights['to_out_norm.weight'].to(fp16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(fp16),
-        to_out_weight=weights['to_out.weight'].to(fp16),
-        h=H,
-    )
-    return compiledtrimul_fused_interleaved_final(**fused_kwargs)

build/torch-cuda/triton_h100.py DELETED Viewed

@@ -1,509 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-# Module-level side effect: let PyTorch use TF32 for float32 matmuls and
-# reduced-precision reductions in fp16 GEMMs (global backend flags).
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16},  num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=5),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=5),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=4),
-    ],
-    # Keys must name kernel arguments; the kernel has no 'N', its width is 4 * H.
-    key=['M', 'H', 'K'],
-)
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (from decorator and kwargs)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    """LayerNorm rows of X, project them, and store gated left/right and out-gate tiles.
-
-    Each program owns one (BLOCK_SIZE_M x BLOCK_SIZE_N) tile of the
-    ``[M, 4*H]`` product ``LN(X) @ W_4way``.  The tile's columns are read in
-    groups of four per hidden index: left_proj, left_gate, right_proj,
-    right_gate.  The tile is reshaped to (BLOCK_SIZE_M, H_CHUNK_SIZE, 4), so
-    every autotune config keeps BLOCK_SIZE_N == 4 * H_CHUNK_SIZE.
-
-    Stores:
-      * OutLeft  = left_proj  * sigmoid(left_gate)  * mask, addressed by (b, h, s1, s2)
-      * OutRight = right_proj * sigmoid(right_gate) * mask, addressed with the
-        s2/s1 strides passed for the transposed buffer
-      * OutOG    = sigmoid(LN(X) @ W_og), addressed by (m, h); only programs
-        whose column tile starts below H compute and store it.
-    """
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- LayerNorm statistics, shared by both matmuls of this program ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_idx = k_offset + offs_k
-        # Honour stride_x_k here too, exactly as the matmul loop below does.
-        x_ptrs = x_rows_base_ptr + k_idx[None, :] * stride_x_k
-        ld_mask = m_mask[:, None] & (k_idx[None, :] < K)
-        x_chunk = tl.load(x_ptrs, mask=ld_mask, other=0.0).to(tl.float32)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_idx = k_offset + offs_k
-        x_ptrs = x_rows_base_ptr + k_idx[None, :] * stride_x_k
-        ld_mask = m_mask[:, None] & (k_idx[None, :] < K)
-        x_chunk = tl.load(x_ptrs, mask=ld_mask, other=0.0).to(tl.float32)
-        # Padded columns were loaded as 0; without this mask they would each
-        # add mean**2 to the variance whenever K % BLOCK_SIZE_K != 0.
-        x_centered = tl.where(ld_mask, x_chunk - mean[:, None], 0.0)
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul loop: 4-way projections, plus out-gate for the first H columns ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        # Out-of-range K lanes get weight 0 and bias 0, so they normalise to 0.
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Programs whose column tile starts below H also accumulate out_gate,
-        # reusing the normalised X tile.
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H)
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    # Split the interleaved columns into their four roles (last axis of size 4).
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    # Decode the flat row index m into (batch, s1, s2) coordinates.
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1) # s2 offset uses s2 stride, s1 offset uses s1 stride
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-# Candidate tile shapes; the winner is cached per distinct (s1, s2, H).
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-    ],
-    key=['s1', 's2', 'H'],
-)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Per-(batch, head) matmul: Out[b, h] = Left[b, h] (s1 x s2) @ Right[b, h] (s2 x s1).
-
-    Grid axis 0 walks the (s1 x s1) output tiles in groups of GROUP_SIZE_M
-    row-tiles; grid axis 1 enumerates ``batch * H + head``.  ``bs`` is
-    accepted but not read inside the kernel.
-    """
-    # Grid and program IDs
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    # The last group may hold fewer than GROUP_SIZE_M row-tiles.
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Axis 1 packs (batch, head) into one index.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    # Reduce over s2 in BLOCK_SIZE_K steps; out-of-range lanes load 0.
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    # --- Coalesced Write ---
-    # Write to a standard (bs, H, s1, s1) layout
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-@torch.compile
-def torch_pt2(left_final, right_final_t, bs, s1, s2, d, h, to_out_norm_weight, to_out_norm_bias, og_mh, to_out_weight):
-    """Batched matmul, LayerNorm over ``h``, output gating and final projection.
-
-    ``left_final @ right_final_t`` is permuted with ``(0, 2, 3, 1)`` and
-    flattened to ``(bs * s1 * s1, h)``, normalised, cast to fp16, multiplied
-    by ``og_mh`` and projected with ``to_out_weight.t()``.
-    NOTE(review): the operands are presumably (bs, h, s1, s2) and
-    (bs, h, s2, s1) -- confirm against the caller.  ``s2`` is kept in the
-    signature for callers but is not used to shape the result.
-
-    Returns:
-        The projected tensor viewed as ``(bs, s1, s1, d)``.
-    """
-    bmm_out = torch.matmul(left_final, right_final_t)
-    out_einsum_flat = bmm_out.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)
-    # Apply layer norm and final gating
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * og_mh
-    # Final projection
-    final_out_flat = gated @ to_out_weight.t()
-    # The flattened rows number bs * s1 * s1 (see the reshape above), so the
-    # result is viewed as (bs, s1, s1, d); viewing with s2 matched only when s1 == s2.
-    final_out = final_out_flat.view(bs, s1, s1, d)
-    return final_out
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-    ],
-    key=['H', 'D'],
-)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata: each of the M rows is decomposed below into (b, r, c) with r, c in [0, s1)
-    M, H, D, s1,
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """
-    Fused LayerNorm over H, elementwise gating and H -> D projection for one output tile.
-    Each program handles a (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of the (M, D) result:
-      pass 1 accumulates per-row sum and sum of squares over H to get mean and rstd;
-      pass 2 re-reads the input, normalizes, applies NormW/NormB and the gate, and
-             accumulates the dot product with the projection weight.
-    The input is addressed per row through (b, r, c) and the In strides; the gate is
-    addressed by the flat row index. Out-of-range rows and columns are masked.
-    """
-    # --- Grid and PID Setup for Matmul (grouped ordering over M tiles) ---
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    # Decompose M back to (b, r, c) for reordering lookups
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    # --- Pass 1: LayerNorm statistics ---
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        # Accumulate sum and sum of squares in one pass
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    # Finalize statistics
-    mean = sum_x / H
-    var = (sum_x2 / H) - (mean * mean)
-    # E[x^2] - E[x]^2 can round slightly below zero; clamp so rsqrt never sees a negative argument.
-    var = tl.maximum(var, 0.0)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # --- Pass 2: Fused Normalization, Gating and Matmul ---
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        proj_ptrs = ProjW_ptr + \
-                    offs_n[None, :] * stride_proj_d + \
-                    offs_k[:, None] * stride_proj_h
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        # Cast the gated activations to the weight dtype so both dot operands match.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    # --- Store Final Output ---
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor, # Use the new weight matrices
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    """
-    Run the fused path: one Triton launch fills the left/right/out-gate buffers, then torch_pt2 finishes.
-    x is unpacked as (bs, s1, s2, d) and viewed as (bs * s1 * s2, d) without copying, so it must be
-    view-compatible. Three float16 buffers are allocated here and passed to
-    fused_ln_dual_matmul_kernel (defined elsewhere in this file) together with their strides.
-    Returns whatever torch_pt2 returns for those buffers.
-    """
-    bs, s1, s2, d = x.shape
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    x_flat = x.view(M, K)
-    # Output buffers written by the kernel; right_final_t has its last two axes swapped relative to left_final.
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # The grid is launched for the larger 4*H problem
-    N_4way = 4 * H
-    # 1-D grid: one program per (M tile, 4*H tile) pair; tile sizes come from the kernel's meta-parameters.
-    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    # NOTE(review): arguments are positional; their order must match the kernel signature, which is not visible here.
-    fused_ln_dual_matmul_kernel[grid](
-        # Pointers (9)
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        # Metadata (5) - M, H, K, s1, s2
-        M, H, K, s1, s2,
-        # Strides (18)
-        x_flat.stride(0), x_flat.stride(1),
-        W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1),
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        # Constexpr (1)
-        LN_EPS=1e-5
-    )
-    return torch_pt2(
-        left_final, right_final_t,
-        bs=bs,
-        s1=s1,
-        s2=s2,
-        d=d,
-        h=h,
-        to_out_norm_weight=to_out_norm_weight,
-        to_out_norm_bias=to_out_norm_bias,
-        og_mh=og_mh,
-        to_out_weight=to_out_weight
-    )
-def pack_w_4way_efficient(weights):
-    """
-    Pack the left/right projection and gate weights into one float16 [K, 4*H] matrix.
-    Columns are interleaved per hidden unit: column 4*i + j holds row i of the j-th matrix in the
-    order left_proj, left_gate, right_proj, right_gate. The result is a transposed view of the
-    packed [4*H, K] buffer.
-    """
-    names = ('left_proj.weight', 'left_gate.weight', 'right_proj.weight', 'right_gate.weight')
-    mats = [weights[name] for name in names]
-    H, K = mats[0].shape
-    # Stacking on dim=1 gives (H, 4, K) directly, so flattening interleaves the four matrices row by row.
-    interleaved = torch.stack(mats, dim=1)
-    packed = interleaved.reshape(4 * H, K)
-    return packed.t().to(torch.float16)
-def get_w_og(weights):
-    """ Returns weights['out_gate.weight'] transposed (to [K, H]) and cast to float16. """
-    out_gate_weight = weights['out_gate.weight']
-    transposed = out_gate_weight.t()
-    return transposed.to(torch.float16)
-# Process-wide matmul settings, applied as a side effect of importing this module:
-# allow TF32 for float32 matmuls and reduced-precision reductions for float16 matmuls.
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@torch.compile
-def compiledtrimul(
-    x: torch.Tensor,
-    mask: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    w_concat: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int
-) -> torch.Tensor:
-    """
-    Compiled plain-PyTorch TriMul forward pass using one concatenated projection weight.
-    w_concat is consumed as [d, 5*h]; its five column chunks are read, in order, as
-    left, right, left gate, right gate and out gate. mask must be expandable to h on its last axis.
-    The pairwise product is flattened to (bs * s1 * s1, h) and the result is viewed as
-    (bs, s1, s2, d), so s1 and s2 have to match.
-    """
-    bs, s1, s2, d = x.shape
-    rows = bs * s1 * s2
-    # Input LayerNorm, then flatten and cast to float16 for the projection matmul.
-    normed_in = F.layer_norm(x, (d,), norm_weight, norm_bias).view((rows, d)).to(torch.float16)
-    # One matmul produces all five projections: [rows, d] @ [d, 5h] = [rows, 5h].
-    left, right, left_gate, right_gate, out_gate = torch.mm(normed_in, w_concat).chunk(5, dim=1)
-    # Mask the two projections and apply their sigmoid gates.
-    mask_rows = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_rows * torch.sigmoid(left_gate)
-    right = right * mask_rows * torch.sigmoid(right_gate)
-    out_gate = torch.sigmoid(out_gate)
-    # Bring h in front of the two sequence axes so the contraction is a batched matmul.
-    left = left.view(bs, s1, s2, h).permute(0, 3, 1, 2).to(torch.float16)
-    right = right.view(bs, s1, s2, h).permute(0, 3, 1, 2).to(torch.float16)
-    pair = torch.matmul(left, right.transpose(-1, -2))
-    pair_rows = pair.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)
-    # Output LayerNorm, gating and final projection.
-    gated = F.layer_norm(pair_rows, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16) * out_gate
-    projected = gated @ to_out_weight.t()
-    return projected.view(bs, s1, s2, d)
-def small_kernel_pt_path(data):
-    """
-    Run the compiled PyTorch path on a (input_tensor, mask, weights, config) tuple.
-    The five projection weights are concatenated along dim 0 in the order compiledtrimul
-    splits them (left, right, left gate, right gate, out gate), transposed and cast to float16.
-    """
-    input_tensor, mask, weights, config = data
-    projection_names = (
-        'left_proj.weight',
-        'right_proj.weight',
-        'left_gate.weight',
-        'right_gate.weight',
-        'out_gate.weight',
-    )
-    stacked = torch.cat([weights[name] for name in projection_names], dim=0)
-    w_concat = stacked.t().contiguous().to(torch.float16)
-    # LayerNorm parameters are passed in float32; the output projection weight in float16.
-    return compiledtrimul(
-        x=input_tensor.to(torch.float32),
-        mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float32),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float32),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"]
-    )
-def kernel_h100(data):
-    """
-    Entry point: route a (input_tensor, mask, weights, config) tuple to one of two implementations.
-    Inputs whose second dimension is at most 512 go through small_kernel_pt_path; larger inputs
-    use the packed-weight path in compiledtrimul_fused_interleaved.
-    """
-    input_tensor, mask, weights, config = data
-    bs, s1, s2, d = input_tensor.shape
-    if s1 > 512:
-        hidden = config["hidden_dim"]
-        packed_4way = pack_w_4way_efficient(weights)
-        out_gate_w = get_w_og(weights)
-        rows = bs * s1 * s2
-        # Broadcast the mask over the hidden axis and flatten to (rows, hidden); could move into the kernel.
-        mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, hidden).reshape(rows, hidden).to(torch.float16)
-        return compiledtrimul_fused_interleaved(
-            x=input_tensor.to(torch.float32),
-            mask_mh=mask_mh,
-            norm_weight=weights['norm.weight'].to(torch.float32),
-            norm_bias=weights['norm.bias'].to(torch.float32),
-            W_4way=packed_4way,
-            W_og=out_gate_w,
-            to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-            to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-            to_out_weight=weights['to_out.weight'].to(torch.float16),
-            h=hidden,
-        )
-    return small_kernel_pt_path(data)

build/torch-rocm/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-from .triton_a100 import kernel_a100
-from .triton_h100 import kernel_h100
-from .triton_b200 import kernel_b200
-from .trimul_mi300 import kernel_mi300
-from .trimul_global import kernel_global
-__all__ = ["kernel_a100", "kernel_h100", "kernel_b200", "kernel_mi300", "kernel_global"]

build/torch-rocm/_ops.py DELETED Viewed

@@ -1,8 +0,0 @@
-import torch
-ops = torch.ops._trimul_gpumode_8e6e60d
-def add_op_namespace_prefix(op_name: str):
-    """
-    Return op_name qualified with this build's op namespace, as "<namespace>::<op_name>".
-    """
-    namespace = "_trimul_gpumode_8e6e60d"
-    return f"{namespace}::{op_name}"

build/torch-rocm/metadata.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"python-depends":[]}

build/torch-rocm/task.py DELETED Viewed

@@ -1,20 +0,0 @@
-"""
-Type definitions for TriMul task.
-Input: Tuple of (input_tensor, mask, weights, config)
-  - input_tensor: Input tensor of shape [batch_size, seq_len, seq_len, dim]
-  - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
-  - weights: Dictionary containing model weights
-  - config: Dictionary containing model configuration parameters
-Output: Output tensor of shape [batch_size, seq_len, seq_len, dim]
-"""
-import torch
-from typing import Tuple, Dict, Any
-# Input type: (input_tensor, mask, weights, config)
-input_t = Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], Dict[str, Any]]
-# Output type: output tensor
-output_t = torch.Tensor

build/torch-rocm/trimul_global.py DELETED Viewed

@@ -1,971 +0,0 @@
-# from utils import make_match_reference, DisableCuDNNTF32
-from .task import input_t, output_t
-import torch
-from torch import nn, einsum
-import math
-import os
-import requests
-import triton
-import triton.language as tl
-# The flag below controls whether to allow TF32 on matmul. This flag defaults to False
-# in PyTorch 1.12 and later.
-torch.backends.cuda.matmul.allow_tf32 = True
-# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
-torch.backends.cudnn.allow_tf32 = True
-# Set allocator for TMA descriptors (required for on-device TMA)
-def alloc_fn(size: int, alignment: int, stream=None):
-    """
-    Allocation callback registered with Triton: return an uninitialized int8 CUDA tensor of `size` elements.
-    `alignment` and `stream` are accepted to match the callback signature but are not used.
-    """
-    return torch.empty(size, device="cuda", dtype=torch.int8)
-# Registered at import time, so importing this module changes Triton's global allocator.
-triton.set_allocator(alloc_fn)
-# os.environ['TRITON_PRINT_AUTOTUNING'] = '1'
-# os.environ['MLIR_ENABLE_DIAGNOSTICS'] = 'warnings,remarks'
-# Reference code in PyTorch
-class TriMul(nn.Module):
-    """
-    Reference PyTorch implementation of the triangle multiplicative update.
-    """
-    # Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-    ):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        # Five bias-free dim -> hidden_dim projections, registered in this fixed order.
-        for name in ("left_proj", "right_proj", "left_gate", "right_gate", "out_gate"):
-            setattr(self, name, nn.Linear(dim, hidden_dim, bias=False))
-        self.to_out_norm = nn.LayerNorm(hidden_dim)
-        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
-    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
-        """
-        x: [bs, seq_len, seq_len, dim]
-        mask: [bs, seq_len, seq_len]
-        Returns:
-            output: [bs, seq_len, seq_len, dim]
-        The contraction sums over the shared k axis:
-            out[b, i, j, :] = sum_k left[b, i, k, :] * right[b, j, k, :]
-        """
-        # Unpacking is kept only as a rank check: x must have exactly four dimensions.
-        _batch, _rows, _cols, _dim = x.shape
-        normed = self.norm(x)
-        keep = mask.unsqueeze(-1)
-        # Mask first, then gate, for both operands of the contraction.
-        left = self.left_proj(normed) * keep * self.left_gate(normed).sigmoid()
-        right = self.right_proj(normed) * keep * self.right_gate(normed).sigmoid()
-        out_gate = self.out_gate(normed).sigmoid()
-        pair = einsum('... i k d, ... j k d -> ... i j d', left, right)
-        gated = self.to_out_norm(pair) * out_gate
-        return self.to_out(gated)
-@triton.jit
-def triton_sigmoid(x):
-    """
-    Elementwise logistic sigmoid, evaluated as 1 / (1 + exp(-x)).
-    """
-    denom = 1.0 + tl.exp(-x)
-    return 1.0 / denom
-def two_mm_kernel_configs_wrapper():
-    """
-    Return a zero-argument function that builds the triton.Config list for the current CUDA device.
-    The choice depends on torch.cuda.get_device_capability(), queried once when this wrapper runs:
-      (12, 0)   -> sweep over small tiles (BLOCK_M 16/32), 8 warps
-      major 9   -> three fixed 128x64 tile configs
-      major 8   -> sweep with BLOCK_M=64, BLOCK_K=16 over 3/4 stages and 4/8 warps (labelled A100 upstream)
-      otherwise -> sweep over 64/128 tiles, 8 warps
-    Every config uses GROUP_SIZE_M=8. The returned function builds a fresh list on each call.
-    The previous version also carried a major-10 branch disabled with `and False` and a per-shape
-    lookup table for major 9 that nothing referenced; both were unreachable and are removed, so
-    major 10 uses the default sweep exactly as before.
-    """
-    def _sweep(block_ms, block_ns, block_ks, stage_counts, warp_counts):
-        # Cartesian product; block sizes vary slowest and warp counts fastest.
-        return [
-            triton.Config({
-                'BLOCK_M': block_m,
-                'BLOCK_N': block_n,
-                'BLOCK_K': block_k,
-                'GROUP_SIZE_M': 8
-            }, num_stages=num_stages, num_warps=num_warps)
-            for block_m in block_ms
-            for block_n in block_ns
-            for block_k in block_ks
-            for num_stages in stage_counts
-            for num_warps in warp_counts
-        ]
-    capability = torch.cuda.get_device_capability()
-    major = capability[0]
-    if capability == (12, 0):
-        def two_mm_kernel_configs():
-            return _sweep([16, 32], [16, 32, 64], [16, 32, 64], [2, 3], [8])
-    elif major == 9:
-        def two_mm_kernel_configs():
-            return [
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-            ]
-    elif major == 8:
-        # A100
-        def two_mm_kernel_configs():
-            return _sweep([64], [64, 128], [16], [3, 4], [4, 8])
-    else:
-        def two_mm_kernel_configs():
-            return _sweep([64, 128], [64, 128], [64, 128], [2, 3], [8])
-    return two_mm_kernel_configs
-def two_mm_kernel_wrapper():
-    if torch.cuda.get_device_capability()[0] == 8:
-        @triton.jit
-        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
-            # Persistent kernel using standard tl.load operations
-            start_pid = tl.program_id(axis=0)
-            num_pid_m = tl.cdiv(M, BLOCK_M)
-            num_pid_n = tl.cdiv(N, BLOCK_N)
-            k_tiles = tl.cdiv(K, BLOCK_K)
-            num_tiles = num_pid_m * num_pid_n
-            # tile_id_c is used in the epilogue to break the dependency between
-            # the prologue and the epilogue
-            tile_id_c = start_pid - NUM_SMS
-            num_pid_in_group = GROUP_SIZE_M * num_pid_n
-            # Persistent loop over tiles
-            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
-                # Calculate PID for this tile using improved swizzling
-                group_id = tile_id // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id % group_size_m)
-                pid_n = (tile_id % num_pid_in_group) // group_size_m
-                # Calculate block offsets
-                offs_am = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_bn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                offs_k = tl.arange(0, BLOCK_K)
-                # Initialize accumulators for all outputs
-                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                # Main computation loop over K dimension
-                for ki in range(k_tiles):
-                    k_start = ki * BLOCK_K
-                    k_offsets = k_start + offs_k
-                    # Create pointers for A matrix (2D flattened view)
-                    a_ptrs = a_ptr + offs_am[:, None] * stride_a2 + k_offsets[None, :] * stride_a3
-                    a_mask = (offs_am[:, None] < M) & (k_offsets[None, :] < K)
-                    # Create pointers for B matrices [N, K] layout
-                    b1_ptrs = b1_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b2_ptrs = b2_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b3_ptrs = b3_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b4_ptrs = b4_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b5_ptrs = b5_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b_mask = (offs_bn[:, None] < N) & (k_offsets[None, :] < K)
-                    # Load blocks from A and all weight matrices using standard tl.load
-                    a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-                    b1 = tl.load(b1_ptrs, mask=b_mask, other=0.0)
-                    b2 = tl.load(b2_ptrs, mask=b_mask, other=0.0)
-                    b3 = tl.load(b3_ptrs, mask=b_mask, other=0.0)
-                    b4 = tl.load(b4_ptrs, mask=b_mask, other=0.0)
-                    b5 = tl.load(b5_ptrs, mask=b_mask, other=0.0)
-                    # Perform matrix multiplications using TF32
-                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
-                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
-                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
-                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
-                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T
-                # Store results using separate tile_id_c for epilogue
-                tile_id_c += NUM_SMS
-                group_id = tile_id_c // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id_c % group_size_m)
-                pid_n = (tile_id_c % num_pid_in_group) // group_size_m
-                # Calculate output offsets and pointers
-                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                # Create masks for bounds checking
-                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-                # Calculate pointer addresses using 4D strides
-                stride_cm = stride_c2  # Stride to next element in flattened M dimension
-                stride_cn = stride_c3  # N is the innermost dimension
-                # For D tensor: use separate D strides
-                stride_dm = stride_d2  # Stride to next element in flattened M dimension
-                stride_dn = stride_d3  # N is the innermost dimension
-                off_c_batch = offs_cm // (seq_len * seq_len)
-                off_c_sl1 = (offs_cm // seq_len) % seq_len
-                off_c_sl2 = offs_cm % seq_len
-                off_c_dim = offs_cn
-                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
-                c_mask = d_mask
-                c1_ptrs = c1_ptr + c_offsets
-                c2_ptrs = c2_ptr + c_offsets
-                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]
-                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))
-                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
-                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
-                # Apply masking only to left_proj and right_proj results (C1, C2)
-                accumulator1 = tl.where(mask_2d, accumulator1, 0)
-                accumulator2 = tl.where(mask_2d, accumulator2, 0)
-                # Apply sigmoid to gate values
-                left_gate_sigmoid = triton_sigmoid(accumulator3)
-                right_gate_sigmoid = triton_sigmoid(accumulator4)
-                accumulator_d = triton_sigmoid(accumulator_d)
-                # Apply elementwise multiplication with gated values
-                # C1 = left * left_gate, C2 = right * right_gate
-                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
-                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate
-                # Convert to appropriate output dtype and store with normal tl.store
-                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
-                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
-                d = accumulator_d.to(d_ptr.dtype.element_ty)
-                tl.store(c1_ptrs, c1, mask=c_mask)
-                tl.store(c2_ptrs, c2, mask=c_mask)
-                tl.store(d_ptrs, d, mask=d_mask)
-    else:
-        @triton.jit
-        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
-            # Persistent kernel using on-device TMA descriptors
-            start_pid = tl.program_id(axis=0)
-            num_pid_m = tl.cdiv(M, BLOCK_M)
-            num_pid_n = tl.cdiv(N, BLOCK_N)
-            k_tiles = tl.cdiv(K, BLOCK_K)
-            num_tiles = num_pid_m * num_pid_n
-            # Create on-device TMA descriptors
-            a_desc = tl._experimental_make_tensor_descriptor(
-                a_ptr,
-                shape=[M, K],
-                strides=[stride_a2, stride_a3],
-                block_shape=[BLOCK_M, BLOCK_K],
-            )
-            b1_desc = tl._experimental_make_tensor_descriptor(
-                b1_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b2_desc = tl._experimental_make_tensor_descriptor(
-                b2_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b3_desc = tl._experimental_make_tensor_descriptor(
-                b3_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b4_desc = tl._experimental_make_tensor_descriptor(
-                b4_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b5_desc = tl._experimental_make_tensor_descriptor(
-                b5_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            # tile_id_c is used in the epilogue to break the dependency between
-            # the prologue and the epilogue
-            tile_id_c = start_pid - NUM_SMS
-            num_pid_in_group = GROUP_SIZE_M * num_pid_n
-            # Persistent loop over tiles
-            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
-                # Calculate PID for this tile using improved swizzling
-                group_id = tile_id // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id % group_size_m)
-                pid_n = (tile_id % num_pid_in_group) // group_size_m
-                # Calculate block offsets
-                offs_am = pid_m * BLOCK_M
-                offs_bn = pid_n * BLOCK_N
-                # Initialize accumulators for all outputs
-                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                # Main computation loop over K dimension
-                for ki in range(k_tiles):
-                    offs_k = ki * BLOCK_K
-                    # Load blocks from A and all weight matrices using on-device TMA
-                    a = a_desc.load([offs_am, offs_k])
-                    b1 = b1_desc.load([offs_bn, offs_k])
-                    b2 = b2_desc.load([offs_bn, offs_k])
-                    b3 = b3_desc.load([offs_bn, offs_k])
-                    b4 = b4_desc.load([offs_bn, offs_k])
-                    b5 = b5_desc.load([offs_bn, offs_k])
-                    # Perform matrix multiplications using TF32
-                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
-                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
-                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
-                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
-                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T
-                # Store results using separate tile_id_c for epilogue
-                tile_id_c += NUM_SMS
-                group_id = tile_id_c // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id_c % group_size_m)
-                pid_n = (tile_id_c % num_pid_in_group) // group_size_m
-                # Calculate output offsets and pointers
-                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                # Create masks for bounds checking
-                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-                # Calculate pointer addresses using 4D strides
-                # For C tensors: compute effective 2D strides from 4D strides
-                # Output tensor is [B, I, J, N], flattened to [M, N] where M = B*I*J
-                stride_cm = stride_c2  # Stride to next element in flattened M dimension
-                stride_cn = stride_c3  # N is the innermost dimension
-                # For D tensor: use separate D strides
-                stride_dm = stride_d2  # Stride to next element in flattened M dimension
-                stride_dn = stride_d3  # N is the innermost dimension
-                off_c_batch = offs_cm // (seq_len * seq_len)
-                off_c_sl1 = (offs_cm // seq_len) % seq_len
-                off_c_sl2 = offs_cm % seq_len
-                off_c_dim = offs_cn
-                # TODO update the mask_c so we don't IMA
-                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
-                # c_offsets = offs_cm[:, None] * stride_c2 + offs_cn[None, :] * stride_c3
-                c_mask = d_mask
-                c1_ptrs = c1_ptr + c_offsets
-                c2_ptrs = c2_ptr + c_offsets
-                # c1_ptrs = c1_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-                # c2_ptrs = c2_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]
-                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))
-                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
-                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
-                # Apply masking only to left_proj and right_proj results (C1, C2)
-                accumulator1 = tl.where(mask_2d, accumulator1, 0)
-                accumulator2 = tl.where(mask_2d, accumulator2, 0)
-                # Apply sigmoid to gate values
-                left_gate_sigmoid = triton_sigmoid(accumulator3)
-                right_gate_sigmoid = triton_sigmoid(accumulator4)
-                accumulator_d = triton_sigmoid(accumulator_d)
-                # Apply elementwise multiplication with gated values
-                # C1 = left * left_gate, C2 = right * right_gate
-                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
-                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate
-                # Convert to appropriate output dtype and store with normal tl.store
-                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
-                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
-                d = accumulator_d.to(d_ptr.dtype.element_ty)
-                tl.store(c1_ptrs, c1, mask=c_mask)
-                tl.store(c2_ptrs, c2, mask=c_mask)
-                tl.store(d_ptrs, d, mask=d_mask)
-    if torch.cuda.get_device_capability()[0] not in [9, 10.2]:
-        two_mm_kernel = triton.autotune(
-            (two_mm_kernel_configs_wrapper())(), key=["M", "N", "K"]
-        )(two_mm_kernel)
-    return two_mm_kernel
-def two_mm(A, left_proj, right_proj, left_gate, right_gate, out_gate, mask):
-    """
-    Persistent matrix multiplication for all weight matrices using on-device TMA descriptors.
-    Args:
-        A: [..., K] tensor (arbitrary leading dimensions)
-        left_proj: [N, K] matrix (will be transposed)
-        right_proj: [N, K] matrix (will be transposed)
-        left_gate: [N, K] left gate weight matrix
-        right_gate: [N, K] right gate weight matrix
-        out_gate: [N, K] output gate weight matrix
-        mask: mask tensor
-    Returns:
-        (C1, C2, D): Tuple of result tensors [..., N] with same leading dims as A
-            C1 = (A @ left_proj.T) * sigmoid(A @ left_gate.T) (masked)
-            C2 = (A @ right_proj.T) * sigmoid(A @ right_gate.T) (masked)
-            D = sigmoid(A @ out_gate.T) (unmasked)
-    """
-    # Check constraints
-    assert A.shape[-1] == left_proj.shape[1] == right_proj.shape[1], "Incompatible K dimensions"
-    assert A.dtype == left_proj.dtype == right_proj.dtype, "Incompatible dtypes"
-    # Assert that all weight matrices have the same strides (same [N, K] shape)
-    assert left_proj.stride() == right_proj.stride() == left_gate.stride() == right_gate.stride() == out_gate.stride(), \
-        "All weight matrices must have identical strides"
-    # Get dimensions
-    original_shape = A.shape[:-1]  # All dimensions except the last
-    K = A.shape[-1]
-    N = left_proj.shape[0]
-    B, seq_len, _, _ = A.shape
-    dtype = A.dtype
-    # Flatten A to 2D for kernel processing
-    A_2d = A.view(-1, K)  # [M, K] where M is product of all leading dims
-    M = A_2d.shape[0]
-    # Get number of streaming multiprocessors
-    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
-    # Launch persistent kernel with limited number of blocks
-    grid = lambda META: (min(NUM_SMS, triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"])),)
-    # Get original 4D strides for A and output tensors
-    A_strides = A.stride()  # (stride_0, stride_1, stride_2, stride_3)
-    # Create output tensors with proper 4D shape to get correct strides
-    output_shape = original_shape + (N,)
-    # C1 = torch.empty(output_shape, device=A.device, dtype=dtype)
-    # C2 = torch.empty(output_shape, device=A.device, dtype=dtype)
-    C1 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
-    C2 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
-    D = torch.empty(output_shape, device=A.device, dtype=torch.float16)
-    C_strides = C1.stride()  # (stride_0, stride_1, stride_2, stride_3)
-    D_strides = D.stride()   # (stride_0, stride_1, stride_2, stride_3)
-    # Use optimal configuration for B200/H100 or fallback to autotuning for other GPUs
-    if torch.cuda.get_device_capability()[0] == 10:
-        # Get optimal configuration for B200
-        BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
-        grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))
-        two_mm_kernel_wrapper()[(grid_size,)](
-            A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
-            C1, C2, D, mask,
-            M, N, K,
-            *A_strides,  # 4D strides for A
-            left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] shape strides
-            *C_strides,  # 4D strides for C
-            seq_len,
-            *D_strides,  # 4D strides for D
-            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
-            num_stages=num_stages, num_warps=num_warps
-        )
-    elif torch.cuda.get_device_capability()[0] == 9:
-        # Get optimal configuration for H100
-        BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
-        grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))
-        two_mm_kernel_wrapper()[(grid_size,)](
-            A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
-            C1, C2, D, mask,
-            M, N, K,
-            *A_strides,  # 4D strides for A
-            left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] shape strides
-            *C_strides,  # 4D strides for C
-            seq_len,
-            *D_strides,  # 4D strides for D
-            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
-            num_stages=num_stages, num_warps=num_warps
-        )
-    else:
-        # Use autotuning for other GPUs
-        two_mm_kernel_wrapper()[grid](
-            A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
-            C1, C2, D, mask,
-            M, N, K,
-            *A_strides,  # 4D strides for A
-            left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] shape strides
-            *C_strides,  # 4D strides for C
-            seq_len,
-            *D_strides,  # 4D strides for D
-            NUM_SMS=NUM_SMS
-        )
-    return C1, C2, D
-def second_layernorm_mul(inp, hidden_dim, weight, bias, mul_operand):
-    ln = torch.nn.functional.layer_norm(inp, (hidden_dim,), eps=1e-5, weight=weight.to(inp.dtype), bias=bias.to(inp.dtype))
-    out = ln * mul_operand
-    return out
-'''
-@triton.autotune(
-    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=4, num_stages=3)],
-    key=["R", "C"]
-)
-'''
-@triton.jit
-def layernorm_kernel_first(
-    X,
-    Y,
-    Weight,
-    Bias,
-    R,
-    C,  # aka "dim"
-    eps,
-    ROW_BLOCK_SIZE: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
-):
-    # Row-wise LayerNorm kernel: each program normalizes ROW_BLOCK_SIZE rows of
-    # an [R, C] matrix read from X and writes the affine-transformed result to Y.
-    #   X, Y         - input / output base pointers, both indexed as row * C + col
-    #   Weight, Bias - per-column scale and shift, indexed by column
-    #   R, C         - number of rows / columns actually present
-    #   eps          - added to the variance before the square root
-    #   BLOCK_SIZE   - number of column lanes per row; lanes >= C are masked off
-    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
-    cols = tl.arange(0, BLOCK_SIZE)
-    # Guards for the last (partial) row block and for padded column lanes
-    mask_row = row < R
-    mask_col = cols < C
-    # Simple indexing for contiguous data
-    # Masked lanes read 0.0 so they do not disturb the row sum below.
-    x = tl.load(
-        X + row[:, None] * C + cols[None, :],
-        mask=mask_row[:, None] & mask_col[None, :],
-        other=0.0
-    ).to(tl.float32)
-    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
-    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)
-    # Statistics are computed in float32 and divided by the true width C
-    mean = tl.sum(x, axis=1) / C
-    # Zero the masked lanes again after centering; otherwise each padded lane
-    # would contribute (0 - mean)^2 to the variance.
-    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
-    var = tl.sum(diff * diff, axis=1) / C
-    rstd = 1 / tl.sqrt(var + eps)
-    y_hat = (x - mean[:, None]) * rstd[:, None]
-    y = y_hat * weight[None, :] + bias[None, :]
-    # Only in-bounds elements are written back
-    tl.store(
-        Y + row[:, None] * C + cols[None, :],
-        y,
-        mask=mask_row[:, None] & mask_col[None, :]
-    )
-def get_optimal_config_ln(dim):
-    config = None
-    if torch.cuda.get_device_capability()[0] == 9:
-        if (dim <= 256):
-            config = (16, 1)
-        elif dim <= 512:
-            config = (16, 2)
-        elif dim <= 1024:
-            config = (16, 4)
-    if not config:
-        config = (16, 4)
-    return config
-def triton_layernorm_first(x, weight, bias, eps=1e-5, num_warps=None, ROW_BLOCK_SIZE=None):
-    B, seq_len, seq_len2, dim = x.shape
-    assert(seq_len == seq_len2)
-    R = B * seq_len * seq_len
-    C = dim
-    out = torch.empty_like(x, dtype=torch.float16)
-    if not num_warps or not ROW_BLOCK_SIZE:
-        ROW_BLOCK_SIZE, num_warps = get_optimal_config_ln(dim)
-    BLOCK_SIZE = triton.next_power_of_2(C)
-    assert(BLOCK_SIZE <= 1024)
-    def grid(meta):
-        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_first[grid](
-        x, out, weight, bias,
-        R, C, eps,
-        ROW_BLOCK_SIZE=ROW_BLOCK_SIZE,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-        num_stages=3
-    )
-    return out
-'''
-def triton_layernorm_first(x, weight, bias, eps=1e-5):
-    B, seq_len, seq_len2, dim = x.shape
-    assert(seq_len == seq_len2)
-    R = B * seq_len * seq_len
-    C = dim
-    out = torch.empty_like(x)
-    BLOCK_SIZE = triton.next_power_of_2(C)
-    assert(BLOCK_SIZE <= 1024)
-    def grid(meta):
-        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_first[grid](
-        x, out, weight, bias,
-        R, C, eps,
-        BLOCK_SIZE=BLOCK_SIZE
-    )
-    return out
-'''
-@triton.autotune(
-    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=1, num_stages=3)],
-    key=[]
-)
-@triton.jit
-def layernorm_kernel_eltwise(
-    X,
-    Y,
-    Weight,
-    Bias,
-    OutGate,
-    seq_len,
-    stride_batch,
-    stride_dim,
-    R,
-    C,  # aka "dim"
-    eps,
-    ROW_BLOCK_SIZE: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
-):
-    # Row-wise LayerNorm fused with an elementwise multiply by OutGate.
-    # Each program handles ROW_BLOCK_SIZE logical rows of an [R, C] problem.
-    #   X       - read through (stride_batch, stride_dim): consecutive rows of one
-    #             batch are one element apart, columns are stride_dim apart
-    #   OutGate - read as a dense [R, C] matrix (row * C + col)
-    #   Y       - written as a dense [R, C] matrix (row * C + col)
-    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
-    cols = tl.arange(0, BLOCK_SIZE)
-    # Calculate base pointer for this batch of rows
-    # `batch` below is derived from the first row of the program only, so a row
-    # block must never straddle two batches; this assert states that requirement.
-    tl.device_assert(seq_len*seq_len % ROW_BLOCK_SIZE == 0)
-    # batch_offset = (row // (stride_seq1 // stride_dim)) * stride_batch
-    batch = tl.program_id(0) * ROW_BLOCK_SIZE // (seq_len * seq_len)
-    seqs_off = row % (seq_len * seq_len) # TODO is this going to prevent vectorization
-    off_r = batch * stride_batch + seqs_off
-    off_c = cols * stride_dim
-    mask_row = row < R
-    mask_col = cols < C
-    # No `other` value here: masked lanes are never stored because the store
-    # below uses the same mask.
-    out_gate = tl.load(
-        OutGate + row[:, None] * C + cols[None, :],
-        mask = mask_row[:, None] & mask_col[None, :],
-    )
-    # Masked lanes read 0.0 so they do not disturb the row sum
-    x = tl.load(
-        X + off_r[:, None] + off_c[None, :],
-        mask=mask_row[:, None] & mask_col[None, :],
-        other=0.0
-    ).to(tl.float32)
-    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
-    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)
-    # Statistics in float32, divided by the true width C
-    mean = tl.sum(x, axis=1) / C
-    # Re-zero masked lanes after centering so padding does not inflate the variance
-    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
-    var = tl.sum(diff * diff, axis=1) / C
-    rstd = 1 / tl.sqrt(var + eps)
-    y_hat = (x - mean[:, None]) * rstd[:, None]
-    y = y_hat * weight[None, :] + bias[None, :]
-    # Gate the normalized value and write only in-bounds elements
-    tl.store(
-        Y + row[:, None] * C + cols[None, :],
-        y * out_gate,
-        mask=mask_row[:, None] & mask_col[None, :]
-    )
-def triton_layernorm_eltwise(x, weight, bias, out_gate, eps=1e-5):
-    B, seq_len, seq_len2, dim = x.shape
-    assert(seq_len == seq_len2)
-    R = B * seq_len * seq_len
-    assert(x.stride(3) == seq_len*seq_len)
-    assert(out_gate.is_contiguous())
-    C = dim
-    out = torch.empty_like(out_gate, dtype=torch.float32)
-    BLOCK_SIZE = triton.next_power_of_2(C)
-    assert(BLOCK_SIZE == 128)
-    def grid(meta):
-        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_eltwise[grid](
-        x, out, weight, bias, out_gate,
-        seq_len,
-        x.stride(0), x.stride(3),
-        R, C, eps,
-        BLOCK_SIZE=BLOCK_SIZE
-    )
-    return out
-def kernel_global(data: input_t) -> output_t:
-    """
-    Reference implementation of TriMul using PyTorch.
-    Args:
-        data: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
-            - input: Input tensor of shape [batch_size, seq_len, seq_len, dim]
-            - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
-            - weights: Dictionary containing model weights
-            - config: Dictionary containing model configuration parameters
-    """
-    input_tensor, mask, weights, config = data
-    left_proj_weight = weights["left_proj.weight"].to(torch.float16)
-    right_proj_weight = weights["right_proj.weight"].to(torch.float16)
-    left_gate_weight = weights["left_gate.weight"].to(torch.float16)
-    right_gate_weight = weights["right_gate.weight"].to(torch.float16)
-    out_gate_weight = weights["out_gate.weight"].to(torch.float16)
-    hidden_dim = config["hidden_dim"]
-    # trimul = TriMul(dim=config["dim"], hidden_dim=config["hidden_dim"]).to(input_tensor.device)
-    x = input_tensor
-    batch_size, seq_len, _, dim = x.shape
-    x = triton_layernorm_first(x, weights['norm.weight'], weights['norm.bias'])
-    # x = torch.nn.functional.layer_norm(x, (dim,), eps=1e-5, weight=weights['norm.weight'], bias=weights['norm.bias'])
-    left, right, out_gate = two_mm(x, left_proj_weight, right_proj_weight, left_gate_weight, right_gate_weight, out_gate_weight, mask)
-    # left = torch.nn.functional.linear(x, weights['left_proj.weight'].to(torch.float16))
-    # right = torch.nn.functional.linear(x, weights['right_proj.weight'].to(torch.float16))
-    # left = left * mask.unsqueeze(-1)
-    # right = right * mask.unsqueeze(-1)
-    '''
-    left = left.to(torch.float32)
-    right = right.to(torch.float32)
-    x = x.to(torch.float32)
-    left_gate = left_gate.sigmoid()
-    right_gate = right_gate.sigmoid()
-    out_gate = out_gate.sigmoid()
-    '''
-    # Elementwise multiplication now handled in kernel
-    # left = left * left_gate
-    # right = right * right_gate
-    # out = einsum('... i k d, ... j k d -> ... i j d', left, right)
-    out = torch.bmm(left.permute(0, 3, 1, 2).view(-1, left.shape[1], left.shape[2]), right.permute(0, 3, 2, 1).view(-1, right.shape[2], right.shape[1]))
-    out = out.view(batch_size, hidden_dim, seq_len, seq_len).permute(0, 2, 3, 1)
-    # out = torch.compile(second_layernorm_mul, dynamic=False)(out, hidden_dim, weights['to_out_norm.weight'], weights['to_out_norm.bias'], out_gate)
-    out = triton_layernorm_eltwise(out, weights['to_out_norm.weight'], weights['to_out_norm.bias'], out_gate)
-    # out = torch.nn.functional.layer_norm(out, (hidden_dim,), eps=1e-5, weight=weights['to_out_norm.weight'].to(out.dtype), bias=weights['to_out_norm.bias'].to(out.dtype))
-    # out = out * out_gate
-    return torch.nn.functional.linear(out, weights['to_out.weight'])
-    '''
-    # Fill in the given weights of the model
-    trimul.norm.weight = nn.Parameter(weights['norm.weight'])
-    trimul.norm.bias = nn.Parameter(weights['norm.bias'])
-    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'])
-    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'])
-    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'])
-    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'])
-    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'])
-    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'])
-    trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'])
-    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'])
-    output = trimul(input_tensor, mask)
-    return output
-    '''

build/torch-rocm/trimul_gpumode/__init__.py DELETED Viewed

@@ -1,26 +0,0 @@
-import ctypes
-import sys
-import importlib
-from pathlib import Path
-from types import ModuleType
-def _import_from_path(file_path: Path) -> ModuleType:
-    # We cannot use the module name as-is, after adding it to `sys.modules`,
-    # it would also be used for other imports. So, we make a module name that
-    # depends on the path for it to be unique using the hex-encoded hash of
-    # the path.
-    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-    module_name = path_hash
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None:
-        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-    module = importlib.util.module_from_spec(spec)
-    if module is None:
-        raise ImportError(f"Cannot load module {module_name} from spec")
-    sys.modules[module_name] = module
-    spec.loader.exec_module(module)  # type: ignore
-    return module
-globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch-rocm/trimul_mi300.py DELETED Viewed

@@ -1,524 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16},  num_warps=4, num_stages=2),
-        # Configurations with larger block sizes for better data reuse
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
-        # Configurations with deeper K dimension
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
-        # More extreme configurations to test the limits
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=2),
-        # Configurations with fewer warps
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
-    ],
-    key=['M', 'N', 'K'],
-)
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (from decorator and kwargs)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    # Fused kernel: LayerNorm of the rows of X, a [M, K] x [K, 4*H] matmul against
-    # W_4way, an optional [M, K] x [K, H] matmul against W_og, and the gating
-    # epilogue. It writes OutLeft / OutRight (gated, masked projections) and
-    # OutOG (sigmoid of the out-gate logits).
-    # Every autotune config above has BLOCK_SIZE_N == 4 * H_CHUNK_SIZE, which
-    # the reshape in the fusion section requires.
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    # Grouped ordering: programs are assigned in groups of GROUP_SIZE_M row tiles.
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm calculation (done only ONCE) ---
-    # Two passes over K: the first accumulates the row mean, the second the variance.
-    # NOTE(review): both statistics loops step along K with unit stride (no
-    # stride_x_k), while the matmul loop below multiplies by stride_x_k. The
-    # two only agree when stride_x_k == 1 - confirm that callers guarantee it.
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        # NOTE(review): masked-out columns are loaded as 0.0 and then centered,
-        # so each padded column adds mean^2 to the sum. This is only exact when
-        # K is a multiple of BLOCK_SIZE_K - confirm, or mask x_centered.
-        x_centered = x_chunk - mean[:, None]
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul Loop 1: For the 4-Way Projections ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K;
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        # Normalize the tile on the fly; padded K columns come out as 0 because
-        # nw and nb are loaded as 0.0 there.
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Only programs whose column tile starts below H also compute out_gate,
-        # reusing the normalized tile.
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        # Out-gate epilogue: sigmoid of the logits, stored with (stride_og_m, stride_og_h)
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    # The BLOCK_SIZE_N columns are regrouped as H_CHUNK_SIZE channels x 4 roles,
-    # i.e. the columns of W_4way are read as interleaved per hidden channel in
-    # the order (left_proj, left_gate, right_proj, right_gate).
-    # The masked sums below just pick out one role each.
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    # Hidden channels covered by this program
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    # Decode the flattened row index into (batch, s1, s2) coordinates
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    # Both outputs are addressed through their own four strides, so the caller
-    # decides the physical layout (the `_t` names suggest OutRight is stored
-    # transposed in s1/s2 - confirm against the launch code).
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1) # s2 offset uses s2 stride, s1 offset uses s1 stride
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-# Candidate tile shapes / launch parameters; the autotuner picks one per
-# distinct (s1, s2, H) combination (see `key` below).
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-    ],
-    key=['s1', 's2', 'H'],
-)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Batched matmul: for every (batch, h) pair selected by program axis 1,
-    # computes Out[b, h, m, n] = sum_k Left[b, h, m, k] * Right[b, h, k, n]
-    # with m, n in [0, s1) and k in [0, s2). Each program on axis 0 produces
-    # one BLOCK_SIZE_M x BLOCK_SIZE_N tile of the s1 x s1 output.
-    # Grid and program IDs
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    # Remap the linear tile id so that GROUP_SIZE_M row-tiles are walked
-    # together for each column-tile (same index arithmetic as the grouped
-    # ordering in the Triton matmul tutorial; presumably for cache reuse).
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    # The last group may hold fewer than GROUP_SIZE_M row-tiles.
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Axis 1 enumerates flattened (batch, h) pairs: pid_bh = b * H + h.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    # Accumulate in float32 regardless of the input element type.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        # a tile: rows follow s1 (m), columns follow s2 (k).
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        # b tile: rows follow s2 (k), columns follow s1 (n).
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        # Out-of-range elements are loaded as 0.0 so they do not affect the dot.
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    # --- Coalesced Write ---
-    # Write to a standard (bs, H, s1, s1) layout
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    # Only elements inside the s1 x s1 output are written.
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-# Candidate tile shapes / launch parameters; the autotuner picks one per
-# distinct (H, D) combination (see `key` below).
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-    ],
-    key=['H', 'D'],
-)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1, # M_gate = bs*s1*s2
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Fuses three steps for each row m (decoded below into (b, r, c)):
-    #   1. layer normalisation over the H values In[b, :, r, c],
-    #   2. element-wise multiplication with Gate[m, :],
-    #   3. projection with ProjW, producing D outputs stored at Out[b, r, c, :].
-    # Each program produces one BLOCK_SIZE_M x BLOCK_SIZE_N output tile.
-    # --- Grid and PID Setup for Matmul ---
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    # Grouped tile ordering: GROUP_SIZE_M row-tiles are walked per column-tile.
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    # Decompose M back to (b, r, c) for reordering lookups
-    # NOTE(review): this decoding uses s1 * s1 rows per batch while the Gate
-    # rows are indexed directly by offs_m; assumes both agree — TODO confirm.
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    # --- Statistics pass: one sweep over H gathering sum and sum of squares ---
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        # Accumulate sum and sum of squares in one pass
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    # Finalize statistics
-    mean = sum_x / H
-    # NOTE(review): E[x^2] - mean^2 can round to a small negative value;
-    # nothing here clamps it before the rsqrt — confirm this is acceptable.
-    var = (sum_x2 / H) - (mean * mean)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # --- Pass 3: Fused Gating and Matmul ---
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        # Normalise, then apply the per-channel affine weight and bias.
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        # Projection tile is addressed as [k, n]: rows step by stride_proj_h,
-        # columns by stride_proj_d.
-        proj_ptrs = ProjW_ptr + \
-                    offs_n[None, :] * stride_proj_d + \
-                    offs_k[:, None] * stride_proj_h
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        # Masked-out gate entries load as 0.0, which zeroes the padded columns
-        # of a_gated before the dot product.
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        # Cast the activations to the weight dtype so tl.dot sees matching operands.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    # --- Store Final Output ---
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor, # Use the new weight matrices
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    """
-    Run the TriMul forward pass as three Triton kernel launches.
-
-    1. ``fused_ln_dual_matmul_kernel`` writes the gated/masked left and right
-       projections and the sigmoid out-gate.
-    2. ``bmm_coalesced_kernel`` multiplies left by right per (batch, h).
-    3. ``fused_final_kernel`` applies layer norm, the out-gate and the output
-       projection.
-
-    Args:
-        x: input, unpacked as (bs, s1, s2, d); it is viewed as (bs*s1*s2, d),
-            so it must be view-compatible with that shape.
-        mask_mh: 2-D mask; its two strides are forwarded to kernel 1.
-        norm_weight, norm_bias: affine parameters of the first layer norm.
-        W_4way: packed left/left-gate/right/right-gate weights (2-D).
-        W_og: out-gate weights (2-D).
-        to_out_norm_weight, to_out_norm_bias: affine parameters of the final
-            layer norm.
-        to_out_weight: output projection weights (2-D).
-        h: hidden dimension H.
-
-    Returns:
-        A float16 tensor of shape (bs, s1, s1, d).
-    """
-    bs, s1, s2, d = x.shape
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    x_flat = x.view(M, K)
-    # Intermediate buffers; the right operand is allocated already transposed
-    # as (bs, H, s2, s1).
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # The grid is launched for the larger 4*H problem
-    N_4way = 4 * H
-    # No block sizes are passed at this call site; presumably the kernel is
-    # autotuned (its decorators are not visible here) — TODO confirm.
-    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid](
-        # Pointers (9)
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        # Metadata (5) - M, H, K, s1, s2
-        M, H, K, s1, s2,
-        # Strides (18)
-        x_flat.stride(0), x_flat.stride(1),
-        W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1),
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        # Constexpr (1)
-        LN_EPS=1e-5
-    )
-    # --- Kernel 2: batched matmul, one (batch, h) pair per program on axis 1 ---
-    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
-    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
-    bmm_coalesced_kernel[grid_bmm](
-        left_final, right_final_t, bmm_out_tmp,
-        bs, s1, s2, H,
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-    )
-    # --- Kernel 3: Fully Fused Final Stage ---
-    # NOTE(review): the grid uses M = bs*s1*s2 rows while final_out holds
-    # bs*s1*s1 rows; this looks like it assumes s1 == s2 — TODO confirm.
-    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
-    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
-    fused_final_kernel[grid_final](
-        # Pointers
-        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
-        # Metadata
-        M, H, d, s1,
-        # Strides
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        to_out_weight.stride(0), to_out_weight.stride(1), # Use strides of the corrected tensor
-        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
-        # Constants
-        LN_EPS=1e-5,
-    )
-    return final_out
-def pack_w_4way_efficient(weights):
-    """ Packs L, LG, R, RG into a tight [K, 4*H] matrix. """
-    WL = weights['left_proj.weight']
-    WLG = weights['left_gate.weight']
-    WR = weights['right_proj.weight']
-    WRG = weights['right_gate.weight']
-    H, K = WL.shape
-    ws = torch.stack([WL, WLG, WR, WRG], dim=0).permute(1, 0, 2)
-    ws = ws.contiguous().view(4 * H, K)
-    return ws.t().to(torch.float16)
-def get_w_og(weights):
-    """ Gets the transposed [K, H] out_gate weight matrix. """
-    WOG = weights['out_gate.weight']
-    return WOG.t().to(torch.float16)
-def compiledtrimul(
-    x: torch.Tensor,
-    mask: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    w_concat: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int
-) -> torch.Tensor:
-    """
-    A barebones, compiled PyTorch function for the TriMul logic.
-    """
-    bs, s1, s2, d = x.shape
-    # Initial LayerNorm
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
-    # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
-    all_projections = torch.mm(x_norm, w_concat)
-    # Split back into individual projections
-    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
-    # Apply mask and gates
-    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_expanded * torch.sigmoid(lg)
-    right = right * mask_expanded * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    # Reshape for einsum
-    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
-    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
-    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
-    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    # Apply layer norm and final gating
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * out_gate
-    # Final projection
-    final_out_flat = gated @ to_out_weight.t()
-    final_out = final_out_flat.view(bs, s1, s2, d)
-    return final_out
-def small_kernel_pt_path(data):
-    input_tensor, mask, weights, config = data
-    w_concat = torch.cat([
-        weights['left_proj.weight'],
-        weights['right_proj.weight'],
-        weights['left_gate.weight'],
-        weights['right_gate.weight'],
-        weights['out_gate.weight']
-    ], dim=0).t().contiguous().to(torch.float16)
-    # Call the compiled function with prepared weights
-    output = compiledtrimul(
-        x=input_tensor.to(torch.float32),
-        mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"]
-    )
-    return output
-def kernel_mi300(data):
-    input_tensor, mask, weights, config = data
-    bs, s1, s2, d = input_tensor.shape
-    if s1 < 100:
-        return small_kernel_pt_path(data)
-    H = config["hidden_dim"]
-    W_4way = pack_w_4way_efficient(weights)
-    W_og = get_w_og(weights)
-    M = bs * s1 * s2
-    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, H).reshape(M, H).to(torch.float16) #move into kernel possibly
-    return compiledtrimul_fused_interleaved(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=W_4way, # Pass the new 4-way matrix
-        W_og=W_og,     # Pass the new out_gate matrix
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=H,
-    )

build/torch-rocm/triton_a100.py DELETED Viewed

@@ -1,405 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-# Set PyTorch flags for performance. These are process-wide settings applied
-# as an import-time side effect: allow TF32 for float32 matmuls and
-# reduced-precision reductions for fp16 matmuls.
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (16)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (now passed as arguments from the host)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm calculation (done only ONCE) ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        x_centered = x_chunk - mean[:, None]
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul Loop 1: For the 4-Way Projections ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K;
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Batched matmul: for every (batch, h) pair selected by program axis 1,
-    # computes Out[b, h, m, n] = sum_k Left[b, h, m, k] * Right[b, h, k, n]
-    # with m, n in [0, s1) and k in [0, s2). Each program on axis 0 produces
-    # one BLOCK_SIZE_M x BLOCK_SIZE_N tile of the s1 x s1 output.
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    # Grouped tile ordering: GROUP_SIZE_M row-tiles are walked per column-tile;
-    # the last group may hold fewer row-tiles.
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Axis 1 enumerates flattened (batch, h) pairs: pid_bh = b * H + h.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    # Accumulate in float32 regardless of the input element type.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        # Out-of-range elements are loaded as 0.0 so they do not affect the dot.
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    # Only elements inside the s1 x s1 output are written.
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1,
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Fuses three steps for each row m (decoded below into (b, r, c)):
-    #   1. layer normalisation over the H values In[b, :, r, c],
-    #   2. element-wise multiplication with Gate[m, :],
-    #   3. projection with ProjW, producing D outputs stored at Out[b, r, c, :].
-    # Each program produces one BLOCK_SIZE_M x BLOCK_SIZE_N output tile.
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    # Grouped tile ordering: GROUP_SIZE_M row-tiles are walked per column-tile.
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    # Decode the flat row index into (batch, row, column) of an s1 x s1 grid.
-    # NOTE(review): Gate rows are indexed directly by offs_m; assumes the gate
-    # row order matches this decoding — TODO confirm.
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    # Statistics pass: one sweep over H gathering sum and sum of squares.
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    mean = sum_x / H
-    # NOTE(review): E[x^2] - mean^2 can round to a small negative value;
-    # nothing here clamps it before the rsqrt — confirm this is acceptable.
-    var = (sum_x2 / H) - (mean * mean)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # Second sweep over H: normalise, gate and accumulate the projection.
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        # Projection tile is addressed as [k, n]: rows step by stride_proj_h,
-        # columns by stride_proj_d.
-        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        # Masked-out gate entries load as 0.0, which zeroes the padded columns
-        # of a_gated before the dot product.
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        # Cast the activations to the weight dtype so tl.dot sees matching operands.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    # Store the tile at Out[b, r, c, d].
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved_final(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor,
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    """
-    Run the TriMul forward pass as three Triton launches with fixed configs.
-
-    1. ``fused_ln_dual_matmul_kernel`` writes the gated/masked left and right
-       projections and the sigmoid out-gate.
-    2. ``bmm_coalesced_kernel`` multiplies left by right per (batch, h).
-    3. ``fused_final_kernel`` applies layer norm, the out-gate and the output
-       projection.
-
-    Block sizes, ``num_warps`` and ``num_stages`` are hard-coded per kernel
-    rather than autotuned.
-
-    Args:
-        x: input, unpacked as (bs, s1, s2, d); it is viewed as (bs*s1*s2, d),
-            so it must be view-compatible with that shape.
-        mask_mh: 2-D mask; its two strides are forwarded to kernel 1.
-        norm_weight, norm_bias: affine parameters of the first layer norm.
-        W_4way: packed left/left-gate/right/right-gate weights (2-D).
-        W_og: out-gate weights (2-D).
-        to_out_norm_weight, to_out_norm_bias: affine parameters of the final
-            layer norm.
-        to_out_weight: output projection weights (2-D).
-        h: hidden dimension H.
-
-    Returns:
-        A float16 tensor of shape (bs, s1, s1, d).
-    """
-    bs, s1, s2, d = x.shape
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    x_flat = x.view(M, K)
-    # Intermediate buffers; the right operand is allocated already transposed
-    # as (bs, H, s2, s1).
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # --- Kernel 1: Fused LN + Dual Matmul ---
-    N_4way = 4 * H
-    # Hardcoded A100 best config: M128-N128-K32-GM8-HC32-W8-S2
-    # H_CHUNK_SIZE is BLOCK_SIZE_N / 4, matching the kernel's reshape of the
-    # accumulator into (BLOCK_SIZE_M, H_CHUNK_SIZE, 4).
-    config_k1 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
-    grid_k1 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid_k1](
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        M, H, K, s1, s2,
-        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
-        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
-        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        LN_EPS=1e-5, **config_k1, num_warps=8, num_stages=2
-    )
-    # --- Kernel 2: Batched Matrix Multiplication ---
-    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
-    # Hardcoded A100 best config: M128-N64-K32-GM8-W4-S3
-    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    # Axis 0 covers the s1 x s1 output tiles, axis 1 the bs * H matrices.
-    grid_k2 = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
-    bmm_coalesced_kernel[grid_k2](
-        left_final, right_final_t, bmm_out_tmp,
-        bs, s1, s2, H,
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        **config_k2, num_warps=4, num_stages=3
-    )
-    # --- Kernel 3: Fully Fused Final Stage ---
-    # NOTE(review): the grid uses M = bs*s1*s2 rows while final_out holds
-    # bs*s1*s1 rows; this looks like it assumes s1 == s2 — TODO confirm.
-    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
-    # Hardcoded A100 best config: M32-N128-K32-GM8-W4-S3
-    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    grid_k3 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
-    fused_final_kernel[grid_k3](
-        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
-        M, H, d, s1,
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
-        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
-        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
-    )
-    return final_out
-def pack_w_4way_efficient(weights):
-    """ Packs L, LG, R, RG into a tight [K, 4*H] matrix. """
-    WL, WLG, WR, WRG = (weights[k] for k in ['left_proj.weight', 'left_gate.weight', 'right_proj.weight', 'right_gate.weight'])
-    H, K = WL.shape
-    ws = torch.stack([WL, WLG, WR, WRG], dim=0).permute(1, 0, 2).contiguous().view(4 * H, K)
-    return ws.t().to(torch.float16)
-def get_w_og(weights):
-    """ Gets the transposed [K, H] out_gate weight matrix. """
-    return weights['out_gate.weight'].t().to(torch.float16)
-@torch.compile()
-def compiledtrimul(
-    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
-    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor, h: int
-) -> torch.Tensor:
-    """
-    PyTorch implementation of the TriMul logic, wrapped in ``torch.compile``.
-
-    Args:
-        x: input, unpacked as (bs, s1, s2, d).
-        mask: 4-D mask that is expanded to ``h`` along its last dimension.
-        norm_weight, norm_bias: affine parameters of the input layer norm.
-        w_concat: fused projection matrix; the product with the normalised
-            input is split into five equal column blocks, in order: left,
-            right, left gate, right gate, out gate.
-        to_out_norm_weight, to_out_norm_bias: affine parameters of the layer
-            norm applied over ``h`` after the pairwise product.
-        to_out_weight: output projection weights, applied as ``@ weight.t()``.
-        h: hidden dimension.
-
-    Returns:
-        Tensor viewed as (bs, s1, s1, d).
-    """
-    bs, s1, s2, d = x.shape
-    # Input layer norm, flattened to one row per (b, i, j) position, in fp16.
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
-    # One matmul produces all five projections at once.
-    all_projections = torch.mm(x_norm, w_concat)
-    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
-    # Mask and gate the left / right projections.
-    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_expanded * torch.sigmoid(lg)
-    right = right * mask_expanded * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    # Move the hidden dimension forward: (bs, h, s1, s2).
-    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
-    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
-    # [bs, h, s1, s2] @ [bs, h, s2, s1] -> [bs, h, s1, s1]
-    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
-    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    # Layer norm over h, then the out-gate.
-    # NOTE(review): out_gate has bs*s1*s2 rows while normed has bs*s1*s1;
-    # the product presumably relies on s1 == s2 — TODO confirm.
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * out_gate
-    # Final projection back to d features.
-    final_out_flat = gated @ to_out_weight.t()
-    return final_out_flat.view(bs, s1, s1, d)
-def small_kernel_pt_path(data):
-    input_tensor, mask, weights, config = data
-    w_concat = torch.cat([
-        weights['left_proj.weight'], weights['right_proj.weight'], weights['left_gate.weight'],
-        weights['right_gate.weight'], weights['out_gate.weight']
-    ], dim=0).t().contiguous().to(torch.float16)
-    return compiledtrimul(
-        x=input_tensor.to(torch.float32), mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32), w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"]
-    )
-def kernel_a100(data):
-    input_tensor, mask, weights, config = data
-    bs, s1, s2, d = input_tensor.shape
-    if s1 < 512: # Adjusted threshold based on observed BMM configs
-        return small_kernel_pt_path(data)
-    H = config["hidden_dim"]
-    W_4way = pack_w_4way_efficient(weights)
-    W_og = get_w_og(weights)
-    M = bs * s1 * s2
-    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, H).reshape(M, H).to(torch.float16)
-    return compiledtrimul_fused_interleaved_final(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=W_4way,
-        W_og=W_og,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=H,
-    )

build/torch-rocm/triton_b200.py DELETED Viewed

@@ -1,411 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-# Import-time side effect: these assignments change process-wide PyTorch CUDA
-# matmul settings, permitting TF32 matmuls and reduced-precision reductions
-# in fp16 matmuls for every caller in the process, not just this module.
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (now passed as arguments from the host)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    """LayerNorm the rows of X, project them, and store gated left/right/out-gate tiles.
-
-    Each program owns one BLOCK_SIZE_M x BLOCK_SIZE_N tile of LN(X) @ W_4way,
-    where the product has 4*H columns. The tile is reinterpreted as
-    (BLOCK_SIZE_M, H_CHUNK_SIZE, 4), i.e. four consecutive columns per hidden
-    channel in the order (left_proj, left_gate, right_proj, right_gate), so
-    BLOCK_SIZE_N must equal 4 * H_CHUNK_SIZE for the reshape to be valid.
-
-    Outputs:
-      * OutLeft[b, h, i, j]  = left_proj  * sigmoid(left_gate)  * mask
-      * OutRight[b, h, j, i] = right_proj * sigmoid(right_gate) * mask
-      * OutOG[m, h]          = sigmoid(LN(X) @ W_og), written only by programs
-        whose column tile starts below H.
-    Row index m is decomposed as m = (b * s1 + i) * s2 + j; all tensors are
-    addressed exclusively through the stride arguments.
-    """
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    # Programs are grouped so that GROUP_SIZE_M row-blocks are visited for one
-    # column-block before moving to the next column-block.
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm calculation (done only ONCE per program, reused by both matmuls) ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    # Pass 1: row means over the K features.
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        # Apply stride_x_k so the statistics read the same elements as the matmul loop below.
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :] * stride_x_k
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    # Pass 2: row variances around the mean.
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :] * stride_x_k
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        # Padded columns (k >= K) load 0.0; zero their centered value so they do
-        # not add mean**2 to the variance when K is not a multiple of BLOCK_SIZE_K.
-        x_centered = tl.where(k_mask[None, :], x_chunk - mean[:, None], 0.0)
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul Loop 1: For the 4-Way Projections ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        # Weight and bias load as 0.0 past K, which forces padded columns of the
-        # normalized tile to exactly 0 before the dot product.
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Only programs whose column tile starts below H also compute out_gate,
-        # reusing the normalized tile that is already in hand.
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H)
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    # De-interleave the four roles: select one slot of the last axis and sum the rest away.
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    # Recover (b, i, j) from the flat row index m = (b * s1 + i) * s2 + j.
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Batched matmul producing one (s1 x s1) matrix per (batch, channel) pair.
-
-    Computes Out[b, h, m, n] = sum_k Left[b, h, m, k] * Right[b, h, k, n] for
-    m, n < s1 and k < s2, addressing every tensor through the supplied strides.
-    Grid axis 0 enumerates output tiles; grid axis 1 enumerates the flattened
-    (b, h) pairs as b * H + h. `bs` is accepted but not read in the body.
-    """
-    # Grid and program IDs
-    # Tiles are grouped so GROUP_SIZE_M row-blocks are visited per column-block.
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Split the second grid axis back into batch and channel indices.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    # Accumulate in float32 over the reduction dimension s2.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        # Out-of-range rows/columns load 0.0 and therefore add nothing to the dot product.
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    # Store the tile at Out[b, h, m, n]; only in-range (m, n) positions are written.
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1,
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """LayerNorm over H, multiply by a gate, and project to D columns in one pass.
-
-    Each flat row index m < M is decomposed as m = (b * s1 + r) * s1 + c. For
-    that row the kernel reads the H values In[b, k, r, c], normalizes them
-    with NormW/NormB, multiplies element-wise by Gate[m, k], and accumulates
-    the product with ProjW (indexed as [n, k] through stride_proj_d and
-    stride_proj_h) into Out[b, r, c, n] for n < D.
-    """
-    # Tiles are grouped so GROUP_SIZE_M row-blocks are visited per column-block.
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    # Decompose the flat row index into (batch, row, column) of the s1 x s1 grid.
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    # Single pass over H accumulating sum(x) and sum(x^2) per row.
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    mean = sum_x / H
-    # NOTE(review): E[x^2] - mean^2 is not clamped at zero, so rounding could
-    # make it slightly negative before LN_EPS is added.
-    var = (sum_x2 / H) - (mean * mean)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # Second pass: normalize, gate, and accumulate the projection in float32.
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        # Weight and bias load as 0.0 past H, zeroing the padded columns of a_norm.
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
-        # The gate is addressed by the flat row index m, not by (b, r, c).
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        # Cast the gated activations to the projection weight's dtype for the dot product.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved_final(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor,
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    """Launch the three Triton kernels of the fused path and return the result.
-
-    Stage 1 (`fused_ln_dual_matmul_kernel`) fills float16 buffers of shape
-    (bs, h, s1, s2), (bs, h, s2, s1) and (bs*s1*s2, h). Stage 2
-    (`bmm_coalesced_kernel`) multiplies the first two into a (bs, h, s1, s1)
-    buffer. Stage 3 (`fused_final_kernel`) writes the returned float16 tensor
-    of shape (bs, s1, s1, d). Tile sizes, warp counts and stage counts are
-    hard-coded per stage. `x` must be viewable as (bs*s1*s2, d).
-    """
-    bs, s1, s2, d = x.shape
-    H = h
-    K = x.shape[-1]
-    M = bs * s1 * s2
-    dev = x.device
-    x_flat = x.view(M, K)
-    left_buf = torch.empty((bs, H, s1, s2), device=dev, dtype=torch.float16)
-    right_t_buf = torch.empty((bs, H, s2, s1), device=dev, dtype=torch.float16)
-    gate_buf = torch.empty((M, H), device=dev, dtype=torch.float16)
-    # --- Stage 1: fused LayerNorm + dual matmul; grid sized for the 4*H-column problem ---
-    # Hardcoded best config from logs: M64-N128-K64-GM8-HC32-W4-S2
-    stage1_cfg = dict(BLOCK_SIZE_M=64, BLOCK_SIZE_N=128, BLOCK_SIZE_K=64, GROUP_SIZE_M=8, H_CHUNK_SIZE=32)
-    stage1_grid = (
-        triton.cdiv(M, stage1_cfg['BLOCK_SIZE_M']) * triton.cdiv(4 * H, stage1_cfg['BLOCK_SIZE_N']),
-    )
-    fused_ln_dual_matmul_kernel[stage1_grid](
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_buf, right_t_buf, gate_buf,
-        M, H, K, s1, s2,
-        *x_flat.stride(),
-        W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1),
-        *left_buf.stride(),
-        *right_t_buf.stride(),
-        *gate_buf.stride(),
-        mask_mh.stride(0), mask_mh.stride(1),
-        LN_EPS=1e-5, num_warps=4, num_stages=2, **stage1_cfg,
-    )
-    # --- Stage 2: batched matrix multiplication over (batch, channel) pairs ---
-    bmm_buf = torch.empty((bs, H, s1, s1), device=dev, dtype=torch.float16)
-    # Hardcoded best config from logs: M128-N128-K32-GM8-W8-S3
-    stage2_cfg = dict(BLOCK_SIZE_M=128, BLOCK_SIZE_N=128, BLOCK_SIZE_K=32, GROUP_SIZE_M=8)
-    stage2_grid = (
-        triton.cdiv(s1, stage2_cfg['BLOCK_SIZE_M']) * triton.cdiv(s1, stage2_cfg['BLOCK_SIZE_N']),
-        bs * H,
-    )
-    bmm_coalesced_kernel[stage2_grid](
-        left_buf, right_t_buf, bmm_buf,
-        bs, s1, s2, H,
-        *left_buf.stride(),
-        *right_t_buf.stride(),
-        *bmm_buf.stride(),
-        num_warps=8, num_stages=3, **stage2_cfg,
-    )
-    # --- Stage 3: fused output LayerNorm + gating + projection ---
-    result = torch.empty((bs, s1, s1, d), device=dev, dtype=torch.float16)
-    # Hardcoded best config from logs: M32-N128-K32-GM8-W4-S3
-    stage3_cfg = dict(BLOCK_SIZE_M=32, BLOCK_SIZE_N=128, BLOCK_SIZE_K=32, GROUP_SIZE_M=8)
-    stage3_grid = (
-        triton.cdiv(M, stage3_cfg['BLOCK_SIZE_M']) * triton.cdiv(d, stage3_cfg['BLOCK_SIZE_N']),
-    )
-    fused_final_kernel[stage3_grid](
-        bmm_buf, gate_buf, to_out_norm_weight, to_out_norm_bias, to_out_weight, result,
-        M, H, d, s1,
-        *bmm_buf.stride(),
-        *gate_buf.stride(),
-        to_out_weight.stride(0), to_out_weight.stride(1),
-        *result.stride(),
-        LN_EPS=1e-5, num_warps=4, num_stages=3, **stage3_cfg,
-    )
-    return result
-def pack_w_4way_efficient(weights):
-    """Interleave the four left/right projection and gate weights into one [K, 4*H] float16 matrix.
-
-    Column 4*h + r holds row h of the r-th matrix, with r ordered as
-    (left_proj, left_gate, right_proj, right_gate). The result is a transposed
-    view-layout tensor (not re-laid-out after the transpose).
-    """
-    role_keys = ('left_proj.weight', 'left_gate.weight', 'right_proj.weight', 'right_gate.weight')
-    mats = [weights[key] for key in role_keys]
-    H, K = mats[0].shape
-    # Stacking on dim 1 yields (H, 4, K) directly, so flattening the first two
-    # dims places the four roles of each hidden channel on adjacent rows.
-    interleaved = torch.stack(mats, dim=1).view(4 * H, K)
-    return interleaved.t().to(torch.float16)
-def get_w_og(weights):
-    """Return `weights['out_gate.weight']` transposed and cast to float16."""
-    out_gate_w = weights['out_gate.weight']
-    transposed = out_gate_w.t()
-    return transposed.to(torch.float16)
-@torch.compile()
-def compiledtrimul(
-    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
-    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor, h: int
-) -> torch.Tensor:
-    """Pure-PyTorch path: LayerNorm, five fused projections, gated batched matmul, output projection.
-
-    `x` has shape (bs, s1, s2, d). `w_concat` is multiplied on the right of the
-    normalized, flattened input and the result is split into five equal
-    column chunks in the order (left, right, left_gate, right_gate, out_gate).
-    `mask` is expanded to `h` along its last dimension. Returns a tensor
-    viewed as (bs, s1, s1, d).
-    """
-    bs, s1, s2, d = x.shape
-    # Normalize over the feature dimension, flatten to 2D, and drop to float16 for the matmul.
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
-    # One matmul produces all five projections side by side.
-    all_projections = torch.mm(x_norm, w_concat)
-    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
-    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_expanded * torch.sigmoid(lg)
-    right = right * mask_expanded * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    # Move the hidden dimension next to batch so matmul treats (bs, h) as batch dims.
-    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
-    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
-    # (bs, h, s1, s2) @ (bs, h, s2, s1) -> (bs, h, s1, s1)
-    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
-    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    # out_gate has bs*s1*s2 rows while normed has bs*s1*s1 rows.
-    # NOTE(review): this product presumably relies on s1 == s2 - confirm against callers.
-    gated = normed * out_gate
-    final_out_flat = gated @ to_out_weight.t()
-    return final_out_flat.view(bs, s1, s1, d)
-def small_kernel_pt_path(data):
-    """Prepare arguments for `compiledtrimul` from one `(input, mask, weights, config)` tuple and call it.
-
-    The five projection weights are concatenated along dim 0 in the order
-    (left_proj, right_proj, left_gate, right_gate, out_gate), then transposed,
-    made contiguous and cast to float16, which is the column order
-    `compiledtrimul` splits its fused projection into.
-    """
-    x_in, pair_mask, params, cfg = data
-    proj_keys = (
-        'left_proj.weight', 'right_proj.weight', 'left_gate.weight',
-        'right_gate.weight', 'out_gate.weight',
-    )
-    stacked = torch.cat([params[key] for key in proj_keys], dim=0)
-    fused_w = stacked.t().contiguous().to(torch.float16)
-    return compiledtrimul(
-        x=x_in.to(torch.float32),
-        mask=pair_mask.unsqueeze(-1),
-        norm_weight=params['norm.weight'].to(torch.float32),
-        norm_bias=params['norm.bias'].to(torch.float32),
-        w_concat=fused_w,
-        to_out_norm_weight=params['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=params['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=params['to_out.weight'].to(torch.float16),
-        h=cfg["hidden_dim"],
-    )
-def kernel_b200(data):
-    """Entry point: dispatch one `(input, mask, weights, config)` tuple to a compute path.
-
-    Inputs whose second dimension is below 800 go through `small_kernel_pt_path`;
-    everything else goes through `compiledtrimul_fused_interleaved_final` with
-    packed float16 weights and a mask expanded to `(M, hidden_dim)`.
-    """
-    input_tensor, mask, weights, config = data
-    batch, rows, cols, _ = input_tensor.shape
-    if rows < 800:
-        return small_kernel_pt_path(data)
-    hidden = config["hidden_dim"]
-    packed_w = pack_w_4way_efficient(weights)
-    out_gate_w = get_w_og(weights)
-    n_rows = batch * rows * cols
-    # Broadcast the per-position mask across the hidden dimension, then flatten to 2D.
-    mask_rows = mask.unsqueeze(-1).expand(-1, -1, -1, hidden)
-    mask_rows = mask_rows.reshape(n_rows, hidden).to(torch.float16)
-    return compiledtrimul_fused_interleaved_final(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_rows,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=packed_w,
-        W_og=out_gate_w,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=hidden,
-    )

build/torch-rocm/triton_h100.py DELETED Viewed

@@ -1,509 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-# Import-time side effect: these assignments change process-wide PyTorch CUDA
-# matmul settings, permitting TF32 matmuls and reduced-precision reductions
-# in fp16 matmuls for every caller in the process, not just this module.
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16},  num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=5),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=5),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=4),
-    ],
-    # Every key must name a kernel argument. The kernel has no `N`; its column
-    # extent is derived from `H`, so `H` is what the tuning cache is keyed on.
-    key=['M', 'H', 'K'],
-)
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (from decorator and kwargs)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    """LayerNorm the rows of X, project them, and store gated left/right/out-gate tiles.
-
-    Each program owns one BLOCK_SIZE_M x BLOCK_SIZE_N tile of LN(X) @ W_4way,
-    where the product has 4*H columns. The tile is reinterpreted as
-    (BLOCK_SIZE_M, H_CHUNK_SIZE, 4), i.e. four consecutive columns per hidden
-    channel in the order (left_proj, left_gate, right_proj, right_gate), so
-    every autotune config must keep BLOCK_SIZE_N == 4 * H_CHUNK_SIZE.
-
-    Outputs:
-      * OutLeft[b, h, i, j]  = left_proj  * sigmoid(left_gate)  * mask
-      * OutRight[b, h, j, i] = right_proj * sigmoid(right_gate) * mask
-      * OutOG[m, h]          = sigmoid(LN(X) @ W_og), written only by programs
-        whose column tile starts below H.
-    Row index m is decomposed as m = (b * s1 + i) * s2 + j; all tensors are
-    addressed exclusively through the stride arguments.
-    """
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    # Programs are grouped so that GROUP_SIZE_M row-blocks are visited for one
-    # column-block before moving to the next column-block.
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm calculation (done only ONCE per program, reused by both matmuls) ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    # Pass 1: row means over the K features.
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        # Apply stride_x_k so the statistics read the same elements as the matmul loop below.
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :] * stride_x_k
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    # Pass 2: row variances around the mean.
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :] * stride_x_k
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        # Padded columns (k >= K) load 0.0; zero their centered value so they do
-        # not add mean**2 to the variance when K is not a multiple of BLOCK_SIZE_K.
-        x_centered = tl.where(k_mask[None, :], x_chunk - mean[:, None], 0.0)
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul Loop 1: For the 4-Way Projections ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        # Weight and bias load as 0.0 past K, which forces padded columns of the
-        # normalized tile to exactly 0 before the dot product.
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Only programs whose column tile starts below H also compute out_gate,
-        # reusing the normalized tile that is already in hand.
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H)
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    # De-interleave the four roles: select one slot of the last axis and sum the rest away.
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    # Recover (b, i, j) from the flat row index m = (b * s1 + i) * s2 + j.
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1) # s2 offset uses s2 stride, s1 offset uses s1 stride
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-# Tile-size candidates for the batched matmul; the chosen config is keyed on
-# the s1, s2 and H arguments.
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-    ],
-    key=['s1', 's2', 'H'],
-)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Batched matmul producing one (s1 x s1) matrix per (batch, channel) pair.
-
-    Computes Out[b, h, m, n] = sum_k Left[b, h, m, k] * Right[b, h, k, n] for
-    m, n < s1 and k < s2, addressing every tensor through the supplied strides.
-    Grid axis 0 enumerates output tiles; grid axis 1 enumerates the flattened
-    (b, h) pairs as b * H + h. `bs` is accepted but not read in the body.
-    """
-    # Grid and program IDs
-    # Tiles are grouped so GROUP_SIZE_M row-blocks are visited per column-block.
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Split the second grid axis back into batch and channel indices.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    # Accumulate in float32 over the reduction dimension s2.
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        # Out-of-range rows/columns load 0.0 and therefore add nothing to the dot product.
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    # --- Coalesced Write ---
-    # Write to a standard (bs, H, s1, s1) layout
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-@torch.compile
-def torch_pt2(left_final, right_final_t, bs, s1, s2, d, h, to_out_norm_weight, to_out_norm_bias, og_mh, to_out_weight):
-    """Batched matmul, output LayerNorm, gating, and final projection in PyTorch.
-
-    `left_final @ right_final_t` is permuted so the second dimension becomes
-    last, flattened to (bs * s1 * s1, h), layer-normalized over `h`, multiplied
-    by `og_mh`, and projected with `to_out_weight.t()`.
-
-    Returns the projection viewed as (bs, s1, s1, d). `s2` is kept in the
-    signature for caller compatibility but does not shape the output, since the
-    flattened rows always number bs * s1 * s1.
-    """
-    bmm_out = torch.matmul(left_final, right_final_t)
-    out_einsum_flat = bmm_out.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)
-    # Apply layer norm and final gating
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * og_mh
-    # Final projection
-    final_out_flat = gated @ to_out_weight.t()
-    # Rows enumerate (b, i, j) with i, j < s1, matching the reshape above; using
-    # s2 here only worked when s1 == s2.
-    final_out = final_out_flat.view(bs, s1, s1, d)
-    return final_out
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-    ],
-    key=['H', 'D'],
-)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1,  # each row index m < M is decomposed below as (b, r, c) with r, c < s1
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Fused LayerNorm -> gate -> projection; one (BLOCK_SIZE_M, BLOCK_SIZE_N) output tile per program.
-    For every row m < M the kernel reads H values from In_ptr (addressed through
-    the (b, r, c) decomposition of m), normalises them over H using
-    NormW_ptr/NormB_ptr, multiplies by the matching row of Gate_ptr, and
-    accumulates the product with ProjW_ptr into a float32 tile that is stored
-    to Out_ptr. Rows >= M and columns >= D are masked out.
-    """
-    # --- Grid and PID Setup for Matmul (grouped ordering over the 1-D launch grid) ---
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    # Decompose M back to (b, r, c) for reordering lookups
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    # --- Pass 1: per-row LayerNorm statistics over H ---
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        # Accumulate sum and sum of squares in one pass
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    # Finalize statistics
-    mean = sum_x / H
-    var = (sum_x2 / H) - (mean * mean)
-    # E[x^2] - E[x]^2 can round slightly below zero; clamp so rsqrt never sees a
-    # negative argument (which would produce NaN for the whole row).
-    var = tl.maximum(var, 0.0)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # --- Pass 2: Fused Normalisation, Gating and Matmul ---
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        # ProjW tile is addressed as [k, n], i.e. the projection is read transposed.
-        proj_ptrs = ProjW_ptr + \
-                    offs_n[None, :] * stride_proj_d + \
-                    offs_k[:, None] * stride_proj_h
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        # Masked gate entries load as 0, so padded k columns contribute nothing to the dot.
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    # --- Store Final Output ---
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor, # Use the new weight matrices
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    """Run the fused Triton front half, then the compiled PyTorch back half.
-    Allocates float16 buffers `left_final` (bs, H, s1, s2), `right_final_t`
-    (bs, H, s2, s1) and `og_mh` (M, H) with M = bs*s1*s2, launches
-    `fused_ln_dual_matmul_kernel` on a 1-D grid to fill them, and returns
-    `torch_pt2(...)` applied to those buffers.
-    NOTE(review): what the kernel writes into each buffer is defined by
-    `fused_ln_dual_matmul_kernel`, which is outside this function.
-    """
-    bs, s1, s2, d = x.shape
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    # reshape (not view) so a non-contiguous input is flattened instead of raising.
-    x_flat = x.reshape(M, K)
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # The grid is launched for the larger 4*H problem
-    N_4way = 4 * H
-    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid](
-        # Pointers (9)
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        # Metadata (5) - M, H, K, s1, s2
-        M, H, K, s1, s2,
-        # Strides (18)
-        x_flat.stride(0), x_flat.stride(1),
-        W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1),
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        # Constexpr (1)
-        LN_EPS=1e-5
-    )
-    return torch_pt2(
-        left_final, right_final_t,
-        bs=bs,
-        s1=s1,
-        s2=s2,
-        d=d,
-        h=h,
-        to_out_norm_weight=to_out_norm_weight,
-        to_out_norm_bias=to_out_norm_bias,
-        og_mh=og_mh,
-        to_out_weight=to_out_weight
-    )
-def pack_w_4way_efficient(weights):
-    """Interleave the left/right projection and gate weights into one fp16 matrix.
-    The four [H, K] matrices are interleaved row-wise in the order
-    left_proj, left_gate, right_proj, right_gate (so packed row 4*i + j is row i
-    of the j-th matrix); the transposed [K, 4*H] result is returned as float16.
-    """
-    left = weights['left_proj.weight']
-    left_gate = weights['left_gate.weight']
-    right = weights['right_proj.weight']
-    right_gate = weights['right_gate.weight']
-    n_rows, n_cols = left.shape
-    # Stacking on dim=1 yields a contiguous [H, 4, K] tensor directly.
-    interleaved = torch.stack([left, left_gate, right, right_gate], dim=1)
-    packed = interleaved.view(4 * n_rows, n_cols)
-    return packed.t().to(torch.float16)
-def get_w_og(weights):
-    """Return `weights['out_gate.weight']` transposed and cast to float16."""
-    out_gate_weight = weights['out_gate.weight']
-    transposed = out_gate_weight.t()
-    return transposed.to(torch.float16)
-# Module-level side effect: enable TF32 and reduced-precision fp16 reductions
-# for CUDA matmuls process-wide (trades some accuracy for speed).
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@torch.compile
-def compiledtrimul(
-    x: torch.Tensor,
-    mask: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    w_concat: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int
-) -> torch.Tensor:
-    """
-    Compiled, module-free TriMul forward pass built from plain tensor ops.
-    `w_concat` is multiplied once and chunked into five column blocks, in the
-    order left, right, left-gate, right-gate, out-gate. The result is viewed as
-    (bs, s1, s2, d), taken from `x.shape`.
-    """
-    bs, s1, s2, d = x.shape
-    n_rows = bs * s1 * s2
-    # Input LayerNorm over the last axis, flattened to rows and cast to fp16.
-    rows = F.layer_norm(x, (d,), norm_weight, norm_bias).view((n_rows, d)).to(torch.float16)
-    # One matmul for all five projections, then split by column block.
-    fused = torch.mm(rows, w_concat)
-    left, right, lg, rg, og = torch.chunk(fused, 5, dim=1)
-    # Broadcast the mask across the h channels; mask and gate the two operands.
-    mask_rows = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_rows * torch.sigmoid(lg)
-    right = right * mask_rows * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    # Channel-first layout so the contraction is a batched matmul per channel.
-    left_cf = left.view(bs, s1, s2, h).permute(0, 3, 1, 2).to(torch.float16)
-    right_cf = right.view(bs, s1, s2, h).permute(0, 3, 1, 2).to(torch.float16)
-    pair = torch.matmul(left_cf, right_cf.transpose(-1, -2))
-    pair_rows = pair.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)
-    # Output LayerNorm, gate, and final projection.
-    normed = F.layer_norm(pair_rows, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    projected = (normed * out_gate) @ to_out_weight.t()
-    return projected.view(bs, s1, s2, d)
-def small_kernel_pt_path(data):
-    """Run the pure-PyTorch compiled path on a (input, mask, weights, config) tuple.
-    Concatenates the five projection weights (left, right, left-gate,
-    right-gate, out-gate) into one transposed fp16 matrix and forwards
-    everything to `compiledtrimul`.
-    """
-    input_tensor, mask, weights, config = data
-    projection_keys = (
-        'left_proj.weight',
-        'right_proj.weight',
-        'left_gate.weight',
-        'right_gate.weight',
-        'out_gate.weight',
-    )
-    stacked = torch.cat([weights[key] for key in projection_keys], dim=0)
-    w_concat = stacked.t().contiguous().to(torch.float16)
-    # LayerNorm parameters go in as fp32; the output projection as fp16.
-    return compiledtrimul(
-        x=input_tensor.to(torch.float32),
-        mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float32),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float32),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"],
-    )
-def kernel_h100(data):
-    """Entry point: dispatch a (input, mask, weights, config) tuple by sequence length.
-    Inputs whose second dimension is at most 512 go through
-    `small_kernel_pt_path`; larger ones use the fused Triton path in
-    `compiledtrimul_fused_interleaved`.
-    """
-    input_tensor, mask, weights, config = data
-    bs, s1, s2, d = input_tensor.shape
-    if s1 <= 512:
-        return small_kernel_pt_path(data)
-    hidden = config["hidden_dim"]
-    packed_4way = pack_w_4way_efficient(weights)
-    out_gate_w = get_w_og(weights)
-    n_rows = bs * s1 * s2
-    # Materialise the mask as an (M, H) fp16 matrix for the kernel. #move into kernel possibly
-    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, hidden).reshape(n_rows, hidden).to(torch.float16)
-    fused_kwargs = dict(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=packed_4way,
-        W_og=out_gate_w,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=hidden,
-    )
-    return compiledtrimul_fused_interleaved(**fused_kwargs)

build/torch-xpu/__init__.py DELETED Viewed

@@ -1,7 +0,0 @@
-from .triton_a100 import kernel_a100
-from .triton_h100 import kernel_h100
-from .triton_b200 import kernel_b200
-from .trimul_mi300 import kernel_mi300
-from .trimul_global import kernel_global
-__all__ = ["kernel_a100", "kernel_h100", "kernel_b200", "kernel_mi300", "kernel_global"]

build/torch-xpu/_ops.py DELETED Viewed

@@ -1,8 +0,0 @@
-import torch
-ops = torch.ops._trimul_gpumode_8e6e60d
-def add_op_namespace_prefix(op_name: str):
-    """
-    Return `op_name` qualified with this build's op namespace (`<namespace>::<op_name>`).
-    """
-    qualified = "_trimul_gpumode_8e6e60d::{}".format(op_name)
-    return qualified

build/torch-xpu/metadata.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"python-depends":[]}

build/torch-xpu/task.py DELETED Viewed

@@ -1,20 +0,0 @@
-"""
-Type definitions for TriMul task.
-Input: Tuple of (input_tensor, mask, weights, config)
-  - input_tensor: Input tensor of shape [batch_size, seq_len, seq_len, dim]
-  - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
-  - weights: Dictionary containing model weights
-  - config: Dictionary containing model configuration parameters
-Output: Output tensor of shape [batch_size, seq_len, seq_len, dim]
-"""
-import torch
-from typing import Tuple, Dict, Any
-# Input type: (input_tensor, mask, weights, config)
-input_t = Tuple[torch.Tensor, torch.Tensor, Dict[str, torch.Tensor], Dict[str, Any]]
-# Output type: output tensor
-output_t = torch.Tensor

build/torch-xpu/trimul_global.py DELETED Viewed

@@ -1,971 +0,0 @@
-# from utils import make_match_reference, DisableCuDNNTF32
-from .task import input_t, output_t
-import torch
-from torch import nn, einsum
-import math
-import os
-import requests
-import triton
-import triton.language as tl
-# The flag below controls whether to allow TF32 on matmul. This flag defaults to False
-# in PyTorch 1.12 and later.
-torch.backends.cuda.matmul.allow_tf32 = True
-# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
-torch.backends.cudnn.allow_tf32 = True
-# Set allocator for TMA descriptors (required for on-device TMA)
-def alloc_fn(size: int, alignment: int, stream=None):
-    """Return an uninitialised int8 CUDA buffer of `size` elements.
-    `alignment` and `stream` are accepted to match the allocator callback
-    signature but are not used.
-    """
-    scratch = torch.empty(size, dtype=torch.int8, device="cuda")
-    return scratch
-triton.set_allocator(alloc_fn)
-# os.environ['TRITON_PRINT_AUTOTUNING'] = '1'
-# os.environ['MLIR_ENABLE_DIAGNOSTICS'] = 'warnings,remarks'
-# Reference code in PyTorch
-class TriMul(nn.Module):
-    """Reference PyTorch implementation of the TriMul block.
-    Based on https://github.com/lucidrains/triangle-multiplicative-module/blob/main/triangle_multiplicative_module/triangle_multiplicative_module.py
-    """
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-    ):
-        super().__init__()
-        self.norm = nn.LayerNorm(dim)
-        # Five bias-free dim -> hidden_dim projections, registered in this order.
-        for proj_name in ("left_proj", "right_proj", "left_gate", "right_gate", "out_gate"):
-            setattr(self, proj_name, nn.Linear(dim, hidden_dim, bias=False))
-        self.to_out_norm = nn.LayerNorm(hidden_dim)
-        self.to_out = nn.Linear(hidden_dim, dim, bias=False)
-    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
-        """
-        x: [bs, seq_len, seq_len, dim]
-        mask: [bs, seq_len, seq_len]
-        Returns:
-            output: [bs, seq_len, seq_len, dim]
-        """
-        # Unpacking doubles as a check that x is 4-D.
-        n_batch, n_seq, _, n_feat = x.shape
-        normed = self.norm(x)
-        pair_mask = mask.unsqueeze(-1)
-        # Masked projections, each scaled by its own sigmoid gate.
-        left = self.left_proj(normed) * pair_mask
-        right = self.right_proj(normed) * pair_mask
-        left = left * self.left_gate(normed).sigmoid()
-        right = right * self.right_gate(normed).sigmoid()
-        gate = self.out_gate(normed).sigmoid()
-        # out[..., i, j, :] = sum over k of left[..., i, k, :] * right[..., j, k, :]
-        pair = einsum('... i k d, ... j k d -> ... i j d', left, right)
-        return self.to_out(self.to_out_norm(pair) * gate)
-@triton.jit
-def triton_sigmoid(x):
-    """
-    Elementwise logistic function, evaluated as 1 / (1 + exp(-x)).
-    """
-    denom = 1.0 + tl.exp(-x)
-    return 1.0 / denom
-def two_mm_kernel_configs_wrapper():
-    """Return a zero-argument factory producing the `triton.Config` list for this GPU.
-    The factory is selected once from the current CUDA device's compute
-    capability: exactly (12, 0), major 9 (H100), major 8 (A100), or a generic
-    fallback for everything else (including major 10). Calling the returned
-    function builds a fresh list of configs; every config uses GROUP_SIZE_M=8.
-    NOTE: the per-shape lookup tables that used to be defined locally here were
-    never returned or called, and the major-10 branch was disabled with
-    `and False`; both were unreachable and have been dropped.
-    """
-    from itertools import product
-    def _config_grid(block_ms, block_ns, block_ks, stages, warps=(8,)):
-        # Cartesian product with BLOCK_M varying slowest and num_warps fastest.
-        return [
-            triton.Config({
-                'BLOCK_M': block_m,
-                'BLOCK_N': block_n,
-                'BLOCK_K': block_k,
-                'GROUP_SIZE_M': 8
-            }, num_stages=num_stages, num_warps=num_warps)
-            for block_m, block_n, block_k, num_stages, num_warps
-            in product(block_ms, block_ns, block_ks, stages, warps)
-        ]
-    # Query the device once instead of once per branch.
-    capability = torch.cuda.get_device_capability()
-    major = capability[0]
-    if capability == (12, 0):
-        def two_mm_kernel_configs():
-            return _config_grid([16, 32], [16, 32, 64], [16, 32, 64], [2, 3])
-    elif major == 9:
-        # H100
-        def two_mm_kernel_configs():
-            return [
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 128, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-                triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2, num_warps=8),
-            ]
-    elif major == 8:
-        # A100
-        def two_mm_kernel_configs():
-            return _config_grid([64], [64, 128], [16], [3, 4], warps=[4, 8])
-    else:
-        def two_mm_kernel_configs():
-            return _config_grid([64, 128], [64, 128], [64, 128], [2, 3])
-    return two_mm_kernel_configs
-def two_mm_kernel_wrapper():
-    if torch.cuda.get_device_capability()[0] == 8:
-        @triton.jit
-        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
-            # Persistent kernel using standard tl.load operations
-            start_pid = tl.program_id(axis=0)
-            num_pid_m = tl.cdiv(M, BLOCK_M)
-            num_pid_n = tl.cdiv(N, BLOCK_N)
-            k_tiles = tl.cdiv(K, BLOCK_K)
-            num_tiles = num_pid_m * num_pid_n
-            # tile_id_c is used in the epilogue to break the dependency between
-            # the prologue and the epilogue
-            tile_id_c = start_pid - NUM_SMS
-            num_pid_in_group = GROUP_SIZE_M * num_pid_n
-            # Persistent loop over tiles
-            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
-                # Calculate PID for this tile using improved swizzling
-                group_id = tile_id // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id % group_size_m)
-                pid_n = (tile_id % num_pid_in_group) // group_size_m
-                # Calculate block offsets
-                offs_am = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_bn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                offs_k = tl.arange(0, BLOCK_K)
-                # Initialize accumulators for all outputs
-                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                # Main computation loop over K dimension
-                for ki in range(k_tiles):
-                    k_start = ki * BLOCK_K
-                    k_offsets = k_start + offs_k
-                    # Create pointers for A matrix (2D flattened view)
-                    a_ptrs = a_ptr + offs_am[:, None] * stride_a2 + k_offsets[None, :] * stride_a3
-                    a_mask = (offs_am[:, None] < M) & (k_offsets[None, :] < K)
-                    # Create pointers for B matrices [N, K] layout
-                    b1_ptrs = b1_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b2_ptrs = b2_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b3_ptrs = b3_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b4_ptrs = b4_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b5_ptrs = b5_ptr + offs_bn[:, None] * stride_bn + k_offsets[None, :] * stride_bk
-                    b_mask = (offs_bn[:, None] < N) & (k_offsets[None, :] < K)
-                    # Load blocks from A and all weight matrices using standard tl.load
-                    a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-                    b1 = tl.load(b1_ptrs, mask=b_mask, other=0.0)
-                    b2 = tl.load(b2_ptrs, mask=b_mask, other=0.0)
-                    b3 = tl.load(b3_ptrs, mask=b_mask, other=0.0)
-                    b4 = tl.load(b4_ptrs, mask=b_mask, other=0.0)
-                    b5 = tl.load(b5_ptrs, mask=b_mask, other=0.0)
-                    # Perform matrix multiplications using TF32
-                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
-                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
-                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
-                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
-                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T
-                # Store results using separate tile_id_c for epilogue
-                tile_id_c += NUM_SMS
-                group_id = tile_id_c // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id_c % group_size_m)
-                pid_n = (tile_id_c % num_pid_in_group) // group_size_m
-                # Calculate output offsets and pointers
-                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                # Create masks for bounds checking
-                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-                # Calculate pointer addresses using 4D strides
-                stride_cm = stride_c2  # Stride to next element in flattened M dimension
-                stride_cn = stride_c3  # N is the innermost dimension
-                # For D tensor: use separate D strides
-                stride_dm = stride_d2  # Stride to next element in flattened M dimension
-                stride_dn = stride_d3  # N is the innermost dimension
-                off_c_batch = offs_cm // (seq_len * seq_len)
-                off_c_sl1 = (offs_cm // seq_len) % seq_len
-                off_c_sl2 = offs_cm % seq_len
-                off_c_dim = offs_cn
-                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
-                c_mask = d_mask
-                c1_ptrs = c1_ptr + c_offsets
-                c2_ptrs = c2_ptr + c_offsets
-                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]
-                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))
-                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
-                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
-                # Apply masking only to left_proj and right_proj results (C1, C2)
-                accumulator1 = tl.where(mask_2d, accumulator1, 0)
-                accumulator2 = tl.where(mask_2d, accumulator2, 0)
-                # Apply sigmoid to gate values
-                left_gate_sigmoid = triton_sigmoid(accumulator3)
-                right_gate_sigmoid = triton_sigmoid(accumulator4)
-                accumulator_d = triton_sigmoid(accumulator_d)
-                # Apply elementwise multiplication with gated values
-                # C1 = left * left_gate, C2 = right * right_gate
-                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
-                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate
-                # Convert to appropriate output dtype and store with normal tl.store
-                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
-                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
-                d = accumulator_d.to(d_ptr.dtype.element_ty)
-                tl.store(c1_ptrs, c1, mask=c_mask)
-                tl.store(c2_ptrs, c2, mask=c_mask)
-                tl.store(d_ptrs, d, mask=d_mask)
-    else:
-        @triton.jit
-        def two_mm_kernel(a_ptr, b1_ptr, b2_ptr, b3_ptr, b4_ptr, b5_ptr, c1_ptr, c2_ptr, d_ptr, mask_ptr, M, N, K, stride_a0, stride_a1, stride_a2, stride_a3, stride_bk, stride_bn, stride_c0, stride_c1, stride_c2, stride_c3, seq_len, stride_d0, stride_d1, stride_d2, stride_d3, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, NUM_SMS: tl.constexpr):
-            # Persistent kernel using on-device TMA descriptors
-            start_pid = tl.program_id(axis=0)
-            num_pid_m = tl.cdiv(M, BLOCK_M)
-            num_pid_n = tl.cdiv(N, BLOCK_N)
-            k_tiles = tl.cdiv(K, BLOCK_K)
-            num_tiles = num_pid_m * num_pid_n
-            # Create on-device TMA descriptors
-            a_desc = tl._experimental_make_tensor_descriptor(
-                a_ptr,
-                shape=[M, K],
-                strides=[stride_a2, stride_a3],
-                block_shape=[BLOCK_M, BLOCK_K],
-            )
-            b1_desc = tl._experimental_make_tensor_descriptor(
-                b1_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b2_desc = tl._experimental_make_tensor_descriptor(
-                b2_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b3_desc = tl._experimental_make_tensor_descriptor(
-                b3_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b4_desc = tl._experimental_make_tensor_descriptor(
-                b4_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            b5_desc = tl._experimental_make_tensor_descriptor(
-                b5_ptr,
-                shape=[N, K],
-                strides=[stride_bn, stride_bk],
-                block_shape=[BLOCK_N, BLOCK_K],
-            )
-            # tile_id_c is used in the epilogue to break the dependency between
-            # the prologue and the epilogue
-            tile_id_c = start_pid - NUM_SMS
-            num_pid_in_group = GROUP_SIZE_M * num_pid_n
-            # Persistent loop over tiles
-            for tile_id in tl.range(start_pid, num_tiles, NUM_SMS, flatten=False):
-                # Calculate PID for this tile using improved swizzling
-                group_id = tile_id // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id % group_size_m)
-                pid_n = (tile_id % num_pid_in_group) // group_size_m
-                # Calculate block offsets
-                offs_am = pid_m * BLOCK_M
-                offs_bn = pid_n * BLOCK_N
-                # Initialize accumulators for all outputs
-                accumulator1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator3 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator4 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                accumulator_d = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-                # Main computation loop over K dimension
-                for ki in range(k_tiles):
-                    offs_k = ki * BLOCK_K
-                    # Load blocks from A and all weight matrices using on-device TMA
-                    a = a_desc.load([offs_am, offs_k])
-                    b1 = b1_desc.load([offs_bn, offs_k])
-                    b2 = b2_desc.load([offs_bn, offs_k])
-                    b3 = b3_desc.load([offs_bn, offs_k])
-                    b4 = b4_desc.load([offs_bn, offs_k])
-                    b5 = b5_desc.load([offs_bn, offs_k])
-                    # Perform matrix multiplications using TF32
-                    accumulator1 = tl.dot(a, b1.T, accumulator1, allow_tf32=True)  # A @ B1.T
-                    accumulator2 = tl.dot(a, b2.T, accumulator2, allow_tf32=True)  # A @ B2.T
-                    accumulator3 = tl.dot(a, b3.T, accumulator3, allow_tf32=True)  # A @ B3.T
-                    accumulator4 = tl.dot(a, b4.T, accumulator4, allow_tf32=True)  # A @ B4.T
-                    accumulator_d = tl.dot(a, b5.T, accumulator_d, allow_tf32=True)  # A @ B5.T
-                # Store results using separate tile_id_c for epilogue
-                tile_id_c += NUM_SMS
-                group_id = tile_id_c // num_pid_in_group
-                first_pid_m = group_id * GROUP_SIZE_M
-                group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-                pid_m = first_pid_m + (tile_id_c % group_size_m)
-                pid_n = (tile_id_c % num_pid_in_group) // group_size_m
-                # Calculate output offsets and pointers
-                offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-                offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-                # Create masks for bounds checking
-                d_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-                # Calculate pointer addresses using 4D strides
-                # For C tensors: compute effective 2D strides from 4D strides
-                # Output tensor is [B, I, J, N], flattened to [M, N] where M = B*I*J
-                stride_cm = stride_c2  # Stride to next element in flattened M dimension
-                stride_cn = stride_c3  # N is the innermost dimension
-                # For D tensor: use separate D strides
-                stride_dm = stride_d2  # Stride to next element in flattened M dimension
-                stride_dn = stride_d3  # N is the innermost dimension
-                off_c_batch = offs_cm // (seq_len * seq_len)
-                off_c_sl1 = (offs_cm // seq_len) % seq_len
-                off_c_sl2 = offs_cm % seq_len
-                off_c_dim = offs_cn
-                # TODO update the mask_c so we don't IMA
-                c_offsets = (off_c_batch * stride_c0 + off_c_sl1 * stride_c1 + off_c_sl2 * stride_c2)[:, None] + off_c_dim[None, :] * stride_c3
-                # c_offsets = offs_cm[:, None] * stride_c2 + offs_cn[None, :] * stride_c3
-                c_mask = d_mask
-                c1_ptrs = c1_ptr + c_offsets
-                c2_ptrs = c2_ptr + c_offsets
-                # c1_ptrs = c1_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-                # c2_ptrs = c2_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-                d_ptrs = d_ptr + stride_dm * offs_cm[:, None] + stride_dn * offs_cn[None, :]
-                mask = tl.load(mask_ptr + offs_cm, mask=(offs_cm < M))
-                # Broadcast mask to match accumulator dimensions [BLOCK_M, BLOCK_N]
-                mask_2d = mask[:, None]  # Convert to [BLOCK_M, 1] then broadcast
-                # Apply masking only to left_proj and right_proj results (C1, C2)
-                accumulator1 = tl.where(mask_2d, accumulator1, 0)
-                accumulator2 = tl.where(mask_2d, accumulator2, 0)
-                # Apply sigmoid to gate values
-                left_gate_sigmoid = triton_sigmoid(accumulator3)
-                right_gate_sigmoid = triton_sigmoid(accumulator4)
-                accumulator_d = triton_sigmoid(accumulator_d)
-                # Apply elementwise multiplication with gated values
-                # C1 = left * left_gate, C2 = right * right_gate
-                accumulator1 = accumulator1 * left_gate_sigmoid  # left * left_gate
-                accumulator2 = accumulator2 * right_gate_sigmoid  # right * right_gate
-                # Convert to appropriate output dtype and store with normal tl.store
-                c1 = accumulator1.to(c1_ptr.dtype.element_ty)
-                c2 = accumulator2.to(c2_ptr.dtype.element_ty)
-                d = accumulator_d.to(d_ptr.dtype.element_ty)
-                tl.store(c1_ptrs, c1, mask=c_mask)
-                tl.store(c2_ptrs, c2, mask=c_mask)
-                tl.store(d_ptrs, d, mask=d_mask)
-    if torch.cuda.get_device_capability()[0] not in [9, 10.2]:
-        two_mm_kernel = triton.autotune(
-            (two_mm_kernel_configs_wrapper())(), key=["M", "N", "K"]
-        )(two_mm_kernel)
-    return two_mm_kernel
-def two_mm(A, left_proj, right_proj, left_gate, right_gate, out_gate, mask):
-    """
-    Fused projection + gating step: five GEMMs that share one activation tensor.
-    Launches the persistent kernel returned by ``two_mm_kernel_wrapper()``.
-    Args:
-        A: [B, seq_len, seq_len, K] activations. Must be 4D (the shape is unpacked
-            as such) and viewable as [M, K] without a copy (``A.view(-1, K)``).
-        left_proj: [N, K] matrix (will be transposed)
-        right_proj: [N, K] matrix (will be transposed)
-        left_gate: [N, K] left gate weight matrix
-        right_gate: [N, K] right gate weight matrix
-        out_gate: [N, K] output gate weight matrix
-        mask: mask tensor, forwarded to the kernel unchanged
-    Returns:
-        (C1, C2, D): float16 tensors of logical shape [B, seq_len, seq_len, N]
-            C1 = (A @ left_proj.T) * sigmoid(A @ left_gate.T) (masked)
-            C2 = (A @ right_proj.T) * sigmoid(A @ right_gate.T) (masked)
-            D = sigmoid(A @ out_gate.T) (unmasked)
-        C1 and C2 are permuted views of [B, N, seq_len, seq_len] buffers (so the
-        later batched matmul can view them without a copy); D is contiguous.
-    """
-    # Check constraints
-    assert A.shape[-1] == left_proj.shape[1] == right_proj.shape[1], "Incompatible K dimensions"
-    assert A.dtype == left_proj.dtype == right_proj.dtype, "Incompatible dtypes"
-    # A single (stride_bk, stride_bn) pair is passed for all five weights,
-    # so their memory layouts have to match exactly.
-    assert left_proj.stride() == right_proj.stride() == left_gate.stride() == right_gate.stride() == out_gate.stride(), \
-        "All weight matrices must have identical strides"
-    # Get dimensions
-    K = A.shape[-1]
-    N = left_proj.shape[0]
-    B, seq_len, _, _ = A.shape
-    # Flatten A to 2D for kernel processing
-    A_2d = A.view(-1, K)  # [M, K] where M is product of all leading dims
-    M = A_2d.shape[0]
-    # Persistent kernel: never launch more programs than there are SMs
-    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
-    # Outputs. C1/C2 are allocated channel-major and exposed channel-last.
-    output_shape = A.shape[:-1] + (N,)
-    C1 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
-    C2 = torch.empty(B, N, seq_len, seq_len, device=A.device, dtype=torch.float16).permute(0, 2, 3, 1)
-    D = torch.empty(output_shape, device=A.device, dtype=torch.float16)
-    # Positional arguments are identical for every launch path
-    kernel_args = (
-        A_2d, left_proj, right_proj, left_gate, right_gate, out_gate,
-        C1, C2, D, mask,
-        M, N, K,
-        *A.stride(),  # 4D strides for A
-        left_proj.stride(1), left_proj.stride(0),  # B matrices [N, K] shape strides
-        *C1.stride(),  # 4D strides for C
-        seq_len,
-        *D.stride(),  # 4D strides for D
-    )
-    if torch.cuda.get_device_capability()[0] in (9, 10):
-        # H100 / B200: use the hand-picked configuration instead of autotuning
-        BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps = (two_mm_kernel_configs_wrapper())(B, seq_len, K)
-        grid_size = min(NUM_SMS, triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N))
-        two_mm_kernel_wrapper()[(grid_size,)](
-            *kernel_args,
-            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K, GROUP_SIZE_M=8, NUM_SMS=NUM_SMS,
-            num_stages=num_stages, num_warps=num_warps
-        )
-    else:
-        # Other GPUs: block sizes come from the autotuner, so the grid is computed lazily
-        grid = lambda META: (min(NUM_SMS, triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"])),)
-        two_mm_kernel_wrapper()[grid](
-            *kernel_args,
-            NUM_SMS=NUM_SMS
-        )
-    return C1, C2, D
-def second_layernorm_mul(inp, hidden_dim, weight, bias, mul_operand):
-    """Layer-normalize ``inp`` over its last ``hidden_dim`` features, then scale by ``mul_operand``.
-
-    ``weight`` and ``bias`` are cast to ``inp``'s dtype before normalization.
-    """
-    target_dtype = inp.dtype
-    normalized = torch.nn.functional.layer_norm(
-        inp,
-        (hidden_dim,),
-        weight=weight.to(target_dtype),
-        bias=bias.to(target_dtype),
-        eps=1e-5,
-    )
-    return normalized * mul_operand
-# Disabled autotune decorator, kept as an inert string literal (never executed).
-# ROW_BLOCK_SIZE, num_warps and num_stages are passed explicitly by the launcher.
-'''
-@triton.autotune(
-    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=4, num_stages=3)],
-    key=["R", "C"]
-)
-'''
-@triton.jit
-def layernorm_kernel_first(
-    X,
-    Y,
-    Weight,
-    Bias,
-    R,
-    C,  # aka "dim"
-    eps,
-    ROW_BLOCK_SIZE: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
-):
-    """Row-wise LayerNorm: Y[r, :] = normalize(X[r, :]) * Weight + Bias.
-
-    X and Y are addressed as dense row-major [R, C] buffers (row pitch C).
-    Each program handles ROW_BLOCK_SIZE consecutive rows; BLOCK_SIZE is the
-    column tile width, with lanes at or beyond C masked off.
-    """
-    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
-    cols = tl.arange(0, BLOCK_SIZE)
-    # Guards for the ragged last row tile and for padded columns
-    mask_row = row < R
-    mask_col = cols < C
-    # Simple indexing for contiguous data
-    x = tl.load(
-        X + row[:, None] * C + cols[None, :],
-        mask=mask_row[:, None] & mask_col[None, :],
-        other=0.0
-    ).to(tl.float32)
-    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
-    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)
-    # Masked lanes were loaded as 0, so they do not perturb the sum; divide by the true width C
-    mean = tl.sum(x, axis=1) / C
-    # Re-mask after centering: padded lanes would otherwise contribute (-mean)^2 to the variance
-    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
-    var = tl.sum(diff * diff, axis=1) / C
-    rstd = 1 / tl.sqrt(var + eps)
-    y_hat = (x - mean[:, None]) * rstd[:, None]
-    y = y_hat * weight[None, :] + bias[None, :]
-    # y is float32 here; Y is written with the same row-major addressing as X
-    tl.store(
-        Y + row[:, None] * C + cols[None, :],
-        y,
-        mask=mask_row[:, None] & mask_col[None, :]
-    )
-def get_optimal_config_ln(dim):
-    """Return ``(ROW_BLOCK_SIZE, num_warps)`` for the first LayerNorm kernel.
-
-    On compute capability 9.x the warp count scales with ``dim`` (1 warp up to
-    256, 2 up to 512, 4 up to 1024). Every other case uses ``(16, 4)``.
-    """
-    fallback = (16, 4)
-    if torch.cuda.get_device_capability()[0] != 9:
-        return fallback
-    # (inclusive upper bound on dim, num_warps), checked in ascending order
-    for upper_bound, warps in ((256, 1), (512, 2), (1024, 4)):
-        if dim <= upper_bound:
-            return (16, warps)
-    return fallback
-def triton_layernorm_first(x, weight, bias, eps=1e-5, num_warps=None, ROW_BLOCK_SIZE=None):
-    """
-    LayerNorm over the last dimension of a [B, seq_len, seq_len, dim] tensor.
-    Args:
-        x: input of shape [B, seq_len, seq_len, dim]; copied to a dense layout if needed
-        weight: LayerNorm scale, indexed by feature
-        bias: LayerNorm shift, indexed by feature
-        eps: value added to the variance before the reciprocal square root
-        num_warps: optional launch override; falls back to get_optimal_config_ln(dim)
-        ROW_BLOCK_SIZE: optional rows-per-program override; same fallback
-    Returns:
-        Contiguous float16 tensor with the same shape as ``x``.
-    """
-    B, seq_len, seq_len2, dim = x.shape
-    assert(seq_len == seq_len2)
-    # The kernel addresses element (r, c) as r * C + c for both input and output,
-    # so the input must be dense row-major (no-op when it already is). This also
-    # makes empty_like below produce a contiguous output.
-    x = x.contiguous()
-    R = B * seq_len * seq_len
-    C = dim
-    out = torch.empty_like(x, dtype=torch.float16)
-    if not num_warps or not ROW_BLOCK_SIZE:
-        # Fill in only what the caller left unset, keeping any explicit override
-        default_rows, default_warps = get_optimal_config_ln(dim)
-        ROW_BLOCK_SIZE = ROW_BLOCK_SIZE or default_rows
-        num_warps = num_warps or default_warps
-    # A single column tile must cover the whole feature dimension
-    BLOCK_SIZE = triton.next_power_of_2(C)
-    assert(BLOCK_SIZE <= 1024)
-    def grid(meta):
-        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_first[grid](
-        x, out, weight, bias,
-        R, C, eps,
-        ROW_BLOCK_SIZE=ROW_BLOCK_SIZE,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-        num_stages=3
-    )
-    return out
-# Earlier variant of triton_layernorm_first, kept as an inert string literal
-# (never executed). It launches without ROW_BLOCK_SIZE/num_warps and returns an
-# output with the input's dtype rather than float16.
-'''
-def triton_layernorm_first(x, weight, bias, eps=1e-5):
-    B, seq_len, seq_len2, dim = x.shape
-    assert(seq_len == seq_len2)
-    R = B * seq_len * seq_len
-    C = dim
-    out = torch.empty_like(x)
-    BLOCK_SIZE = triton.next_power_of_2(C)
-    assert(BLOCK_SIZE <= 1024)
-    def grid(meta):
-        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_first[grid](
-        x, out, weight, bias,
-        R, C, eps,
-        BLOCK_SIZE=BLOCK_SIZE
-    )
-    return out
-'''
-# Single-config "autotune": effectively just pins ROW_BLOCK_SIZE, num_warps and num_stages.
-@triton.autotune(
-    [triton.Config({"ROW_BLOCK_SIZE": 16}, num_warps=1, num_stages=3)],
-    key=[]
-)
-@triton.jit
-def layernorm_kernel_eltwise(
-    X,
-    Y,
-    Weight,
-    Bias,
-    OutGate,
-    seq_len,
-    stride_batch,
-    stride_dim,
-    R,
-    C,  # aka "dim"
-    eps,
-    ROW_BLOCK_SIZE: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
-):
-    """Fused LayerNorm(X) * OutGate, normalizing over the C features of each row.
-
-    Y and OutGate are addressed as dense row-major [R, C] buffers. X is read
-    through (stride_batch, stride_dim): row r of batch b starts at
-    b * stride_batch + (r % seq_len**2), and feature c adds c * stride_dim.
-    """
-    row = tl.program_id(0) * ROW_BLOCK_SIZE + tl.arange(0, ROW_BLOCK_SIZE)
-    cols = tl.arange(0, BLOCK_SIZE)
-    # Calculate base pointer for this batch of rows
-    # `batch` below is computed once per program from the tile's first row, so a
-    # tile must never straddle two batches.
-    tl.device_assert(seq_len*seq_len % ROW_BLOCK_SIZE == 0)
-    # batch_offset = (row // (stride_seq1 // stride_dim)) * stride_batch
-    batch = tl.program_id(0) * ROW_BLOCK_SIZE // (seq_len * seq_len)
-    # The in-batch position (i * seq_len + j) is used directly as an element
-    # offset, i.e. X is read as if its two sequence dims were dense with unit
-    # stride on the inner one.
-    seqs_off = row % (seq_len * seq_len) # TODO is this going to prevent vectorization
-    off_r = batch * stride_batch + seqs_off
-    off_c = cols * stride_dim
-    mask_row = row < R
-    mask_col = cols < C
-    # No `other` here: masked lanes are unspecified but are never stored
-    out_gate = tl.load(
-        OutGate + row[:, None] * C + cols[None, :],
-        mask = mask_row[:, None] & mask_col[None, :],
-    )
-    x = tl.load(
-        X + off_r[:, None] + off_c[None, :],
-        mask=mask_row[:, None] & mask_col[None, :],
-        other=0.0
-    ).to(tl.float32)
-    weight = tl.load(Weight + cols, mask=mask_col, other=0.0).to(tl.float32)
-    bias = tl.load(Bias + cols, mask=mask_col, other=0.0).to(tl.float32)
-    # Masked lanes were loaded as 0, so the sum is exact; divide by the true width C
-    mean = tl.sum(x, axis=1) / C
-    # Re-mask after centering so padded lanes do not add (-mean)^2 to the variance
-    diff = tl.where(mask_row[:, None] & mask_col[None, :], x - mean[:, None], 0)
-    var = tl.sum(diff * diff, axis=1) / C
-    rstd = 1 / tl.sqrt(var + eps)
-    y_hat = (x - mean[:, None]) * rstd[:, None]
-    y = y_hat * weight[None, :] + bias[None, :]
-    # Apply the gate in the same pass and write the result row-major
-    tl.store(
-        Y + row[:, None] * C + cols[None, :],
-        y * out_gate,
-        mask=mask_row[:, None] & mask_col[None, :]
-    )
-def triton_layernorm_eltwise(x, weight, bias, out_gate, eps=1e-5):
-    """
-    Fused LayerNorm over the last dimension of ``x`` multiplied by ``out_gate``.
-    Args:
-        x: [B, seq_len, seq_len, dim] tensor stored channel-major, i.e. a permuted
-            view of a [B, dim, seq_len, seq_len] buffer (strides are checked below)
-        weight: LayerNorm scale, indexed by feature
-        bias: LayerNorm shift, indexed by feature
-        out_gate: contiguous gate tensor, read row-major as [B * seq_len**2, dim]
-        eps: value added to the variance before the reciprocal square root
-    Returns:
-        float32 tensor shaped like ``out_gate``.
-    NOTE(review): the kernel additionally requires seq_len**2 to be a multiple
-    of its ROW_BLOCK_SIZE; that is only checked by a device-side assert.
-    """
-    B, seq_len, seq_len2, dim = x.shape
-    assert(seq_len == seq_len2)
-    R = B * seq_len * seq_len
-    assert(x.stride(3) == seq_len*seq_len)
-    # The kernel uses i * seq_len + j as the in-batch element offset, so the two
-    # sequence dims must be dense too (strides of size-1 dims are irrelevant).
-    assert(seq_len == 1 or (x.stride(1) == seq_len and x.stride(2) == 1))
-    assert(out_gate.is_contiguous())
-    C = dim
-    out = torch.empty_like(out_gate, dtype=torch.float32)
-    BLOCK_SIZE = triton.next_power_of_2(C)
-    assert(BLOCK_SIZE == 128)
-    def grid(meta):
-        return (triton.cdiv(R, meta["ROW_BLOCK_SIZE"]),)
-    layernorm_kernel_eltwise[grid](
-        x, out, weight, bias, out_gate,
-        seq_len,
-        x.stride(0), x.stride(3),
-        R, C, eps,
-        BLOCK_SIZE=BLOCK_SIZE
-    )
-    return out
-def kernel_global(data: input_t) -> output_t:
-    """
-    TriMul forward pass built from the Triton kernels in this module.
-    Pipeline: input LayerNorm (float16 output) -> fused projections and gates
-    (two_mm) -> batched matmul over the shared sequence index (torch.bmm) ->
-    fused output LayerNorm * out_gate -> final ``to_out`` linear projection.
-    Args:
-        data: Tuple of (input: torch.Tensor, mask: torch.Tensor, weights: Dict[str, torch.Tensor], config: Dict)
-            - input: Input tensor of shape [batch_size, seq_len, seq_len, dim]
-            - mask: Mask tensor of shape [batch_size, seq_len, seq_len]
-            - weights: Dictionary containing model weights
-            - config: Dictionary containing model configuration parameters
-    Returns:
-        Output of ``linear(..., weights['to_out.weight'])`` applied to the gated,
-        normalized triangular product.
-    """
-    input_tensor, mask, weights, config = data
-    # two_mm asserts that activations and weights share a dtype, and the first
-    # LayerNorm emits float16, so the projection weights are cast to match.
-    left_proj_weight = weights["left_proj.weight"].to(torch.float16)
-    right_proj_weight = weights["right_proj.weight"].to(torch.float16)
-    left_gate_weight = weights["left_gate.weight"].to(torch.float16)
-    right_gate_weight = weights["right_gate.weight"].to(torch.float16)
-    out_gate_weight = weights["out_gate.weight"].to(torch.float16)
-    hidden_dim = config["hidden_dim"]
-    batch_size, seq_len, _, _ = input_tensor.shape
-    x = triton_layernorm_first(input_tensor, weights['norm.weight'], weights['norm.bias'])
-    # Masking, sigmoid gating and the gate multiplications all happen inside two_mm
-    left, right, out_gate = two_mm(x, left_proj_weight, right_proj_weight, left_gate_weight, right_gate_weight, out_gate_weight, mask)
-    # out = einsum('... i k d, ... j k d -> ... i j d', left, right), done as one
-    # bmm per (batch, channel): [i, k] @ [k, j]
-    left_bd = left.permute(0, 3, 1, 2).view(-1, left.shape[1], left.shape[2])
-    right_bd = right.permute(0, 3, 2, 1).view(-1, right.shape[2], right.shape[1])
-    out = torch.bmm(left_bd, right_bd)
-    # Back to a channel-last logical shape without moving data; the fused
-    # LayerNorm kernel reads this channel-major layout via strides.
-    out = out.view(batch_size, hidden_dim, seq_len, seq_len).permute(0, 2, 3, 1)
-    out = triton_layernorm_eltwise(out, weights['to_out_norm.weight'], weights['to_out_norm.bias'], out_gate)
-    return torch.nn.functional.linear(out, weights['to_out.weight'])

build/torch-xpu/trimul_gpumode/__init__.py DELETED Viewed

@@ -1,26 +0,0 @@
-import ctypes
-import sys
-import importlib
-from pathlib import Path
-from types import ModuleType
-def _import_from_path(file_path: Path) -> ModuleType:
-    """Load the Python file at ``file_path`` as a module and return it.
-
-    The module is registered in ``sys.modules`` under a name derived from the
-    hash of its absolute path rather than its real name.
-
-    Raises:
-        ImportError: if no import spec (or no loader) can be created for the file.
-    """
-    # `import importlib` alone does not guarantee that the `util` submodule is loaded
-    import importlib.util
-    # We cannot use the module name as-is, after adding it to `sys.modules`,
-    # it would also be used for other imports. So, we make a module name that
-    # depends on the path for it to be unique using the hex-encoded hash of
-    # the path.
-    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
-    module_name = path_hash
-    spec = importlib.util.spec_from_file_location(module_name, file_path)
-    if spec is None or spec.loader is None:
-        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
-    module = importlib.util.module_from_spec(spec)
-    # Register before executing so imports made while the module body runs can find it
-    sys.modules[module_name] = module
-    spec.loader.exec_module(module)
-    return module
-# Re-export everything defined by the parent directory's __init__.py from this module.
-globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch-xpu/trimul_mi300.py DELETED Viewed

@@ -1,524 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-# Import-time side effect: these are process-wide PyTorch settings, not local to
-# this module. They permit TF32 and reduced-precision FP16 reductions in CUDA matmuls.
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.autotune(
-    configs=[
-        # Every config keeps BLOCK_SIZE_N == 4 * H_CHUNK_SIZE; the epilogue reshape relies on it.
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16},  num_warps=4, num_stages=2),
-        # Configurations with larger block sizes for better data reuse
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=2),
-        # Configurations with deeper K dimension
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
-        # More extreme configurations to test the limits
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=2),
-        # Configurations with fewer warps
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=2),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
-    ],
-    # The kernel has no argument named N; the output width is determined by H.
-    key=['M', 'H', 'K'],
-)
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (from decorator and kwargs)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    """Fused LayerNorm + projections for one [BLOCK_SIZE_M, BLOCK_SIZE_N] tile.
-
-    Each program normalizes BLOCK_SIZE_M rows of X over K, multiplies them by a
-    column tile of the [K, 4*H] weight W_4way, and splits the result into
-    left/right projections and gates. Columns are read as groups of four per
-    hidden channel: (left_proj, left_gate, right_proj, right_gate).
-    Programs whose column tile starts below H also compute
-    sigmoid(LN(X) @ W_og) for the same columns and store it to OutOG.
-    Outputs: OutLeft and OutRight receive proj * sigmoid(gate) * mask, scattered
-    through their (batch, channel, s1, s2) strides.
-    """
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    # Grouped ordering: GROUP_SIZE_M row-tiles share each sweep over the column tiles
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm statistics (two passes over K, reused by both matmuls) ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        # Honour stride_x_k here exactly as the matmul loop below does
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :] * stride_x_k
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :] * stride_x_k
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        x_centered = x_chunk - mean[:, None]
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul loop: 4-way projections, plus the out gate on selected programs ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        # Normalize on the fly and feed the dot in float16
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Programs whose column tile starts below H also compute the out gate,
-        # reusing the already-normalized activation tile
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H)
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    # Split the N axis into (hidden channel, role); requires BLOCK_SIZE_N == 4 * H_CHUNK_SIZE
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    # Select one role by zeroing the other three and summing over the role axis
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    # Decompose the flat row index m into (batch, s1, s2) for the scattered stores
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1) # s2 offset uses s2 stride, s1 offset uses s1 stride
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-# Tile shapes / warp counts tried by the autotuner; the best one is cached per
-# distinct (s1, s2, H) combination (see `key` below).
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-    ],
-    key=['s1', 's2', 'H'],
-)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Batched matmul: for every (batch, channel) pair compute Left @ Right.
-
-    Grid axis 0 enumerates the (BLOCK_SIZE_M x BLOCK_SIZE_N) tiles of one
-    s1 x s1 output matrix; grid axis 1 enumerates the bs * H (batch, channel)
-    pairs. Left is addressed as [b, h, s1, s2] and Right as [b, h, s2, s1]
-    through the strides passed in, so the reduction runs over s2. The fp32
-    accumulator is written to Out addressed as [b, h, s1, s1].
-    """
-    # Grid and program IDs
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    # Grouped ordering: consecutive program ids first walk over up to
-    # GROUP_SIZE_M row tiles of the same column tile before advancing to the
-    # next column tile (same scheme as the Triton matmul tutorial, where it is
-    # motivated by cache reuse).
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    # The last group may contain fewer than GROUP_SIZE_M row tiles.
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Axis 1 is a flattened (batch, channel) index.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    # Base pointers of this program's (batch, channel) matrices.
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    # Reduction over s2 in chunks of BLOCK_SIZE_K.
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        # Out-of-range rows/columns load as 0.0 so they add nothing to the dot.
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    # --- Coalesced Write ---
-    # Write to a standard (bs, H, s1, s1) layout
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    # Both output dimensions are bounded by s1.
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-    ],
-    key=['H', 'D'],
-)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1, # M_gate = bs*s1*s2
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Fused LayerNorm over H, elementwise gating and projection H -> D.
-
-    Each program handles BLOCK_SIZE_M flattened rows and BLOCK_SIZE_N output
-    columns. A flattened row index m is decomposed as (b, r, c) with
-    m = (b * s1 + r) * s1 + c and used to address In as [b, h, r, c] and Out
-    as [b, r, c, d]; Gate is addressed directly as [m, h]. The row
-    decomposition therefore assumes M == bs * s1 * s1.
-
-    Stage 1 accumulates sum and sum-of-squares over H to get mean / rstd.
-    Stage 2 re-reads the row, normalizes it, applies NormW / NormB, multiplies
-    by the gate and accumulates the dot product with ProjW (addressed as
-    [d, h] via stride_proj_d / stride_proj_h).
-    """
-    # --- Grid and PID setup (grouped ordering over row tiles) ---
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    # The last group may hold fewer than GROUP_SIZE_M row tiles.
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    n_mask = offs_n < D
-    # Decompose M back to (b, r, c) for reordering lookups
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    # --- Stage 1: LayerNorm statistics in a single pass over H ---
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        # Masked elements load as 0 and so contribute nothing to either sum.
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    mean = sum_x / H
-    # E[x^2] - E[x]^2 can round to a small negative number when the true
-    # variance is tiny relative to mean^2; clamp so rsqrt never sees a
-    # negative argument (which would yield NaN or a huge rstd).
-    var = tl.maximum((sum_x2 / H) - (mean * mean), 0.0)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # --- Stage 2: normalize, gate and project ---
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        mk_mask = m_mask[:, None] & k_mask[None, :]
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=mk_mask, other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        # Affine parameters load as 0 beyond H, and the gate loads as 0 for
-        # masked positions, so padded columns end up exactly 0 after gating.
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        gate = tl.load(gate_ptrs, mask=mk_mask, other=0.0)
-        a_gated = a_norm * gate
-        # ProjW tile is read transposed: rows follow k (H), columns follow n (D).
-        proj_ptrs = ProjW_ptr + \
-                    offs_n[None, :] * stride_proj_d + \
-                    offs_k[:, None] * stride_proj_h
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0)
-        # Cast the activations to the weight dtype so tl.dot sees matching operands.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    # --- Store Final Output ---
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_n[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & n_mask[None, :])
-def compiledtrimul_fused_interleaved(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor, # Use the new weight matrices
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    """Run the three-kernel Triton TriMul pipeline and return the fp16 result.
-
-    Kernel 1 (`fused_ln_dual_matmul_kernel`) writes the gated/masked left
-    operand as (bs, h, s1, s2), the right operand transposed as
-    (bs, h, s2, s1) and the sigmoid out-gate as (bs*s1*s2, h).
-    Kernel 2 (`bmm_coalesced_kernel`) multiplies them into (bs, h, s1, s1).
-    Kernel 3 (`fused_final_kernel`) applies LayerNorm, the out-gate and the
-    final projection, producing (bs, s1, s1, d).
-
-    Args:
-        x: 4-D input (bs, s1, s2, d).
-        mask_mh: mask addressed by the kernel as [row, h] via its two strides.
-        norm_weight, norm_bias: affine parameters of the input LayerNorm.
-        W_4way: packed projection matrix (see `pack_w_4way_efficient`).
-        W_og: out-gate matrix (see `get_w_og`).
-        to_out_norm_weight, to_out_norm_bias: affine parameters of the output LayerNorm.
-        to_out_weight: final projection weight, passed with its two strides.
-        h: hidden dimension H.
-
-    Raises:
-        ValueError: if s1 != s2. The final kernel indexes the (bs, s1, s1, .)
-            buffers with bs*s1*s2 flattened rows, which is only in bounds and
-            complete for square inputs.
-    """
-    bs, s1, s2, d = x.shape
-    if s1 != s2:
-        raise ValueError(
-            f"TriMul fused path requires a square pair dimension, got s1={s1}, s2={s2}"
-        )
-    M, K, H = bs * s1 * s2, d, h
-    # reshape (not view) so a non-contiguous input is accepted; it stays a
-    # zero-copy view whenever the layout allows it.
-    x_flat = x.reshape(M, K)
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # --- Kernel 1: fused LayerNorm + projections ---
-    # The grid is launched for the larger 4*H problem
-    N_4way = 4 * H
-    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid](
-        # Pointers (9)
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        # Metadata (5) - M, H, K, s1, s2
-        M, H, K, s1, s2,
-        # Strides (18)
-        x_flat.stride(0), x_flat.stride(1),
-        W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1),
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        # Constexpr (1)
-        LN_EPS=1e-5
-    )
-    # --- Kernel 2: batched matmul over (batch, channel) pairs ---
-    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
-    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
-    bmm_coalesced_kernel[grid_bmm](
-        left_final, right_final_t, bmm_out_tmp,
-        bs, s1, s2, H,
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-    )
-    # --- Kernel 3: Fully Fused Final Stage ---
-    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
-    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
-    fused_final_kernel[grid_final](
-        # Pointers
-        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
-        # Metadata
-        M, H, d, s1,
-        # Strides
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        to_out_weight.stride(0), to_out_weight.stride(1), # Use strides of the corrected tensor
-        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
-        # Constants
-        LN_EPS=1e-5,
-    )
-    return final_out
-def pack_w_4way_efficient(weights):
-    """Interleave the four projection/gate weights into one fp16 [K, 4*H] matrix.
-
-    Column 4*i + r of the result holds row i of the r-th weight, with r
-    ordered as left_proj, left_gate, right_proj, right_gate.
-    """
-    role_keys = (
-        'left_proj.weight',
-        'left_gate.weight',
-        'right_proj.weight',
-        'right_gate.weight',
-    )
-    per_role = [weights[key] for key in role_keys]
-    hidden, in_features = per_role[0].shape
-    # Stacking along dim=1 produces the contiguous [H, 4, K] layout directly,
-    # so the four roles of one hidden unit end up in adjacent rows.
-    interleaved = torch.stack(per_role, dim=1).view(4 * hidden, in_features)
-    return interleaved.t().to(torch.float16)
-def get_w_og(weights):
-    """Return the out-gate weight transposed to [K, H] and cast to fp16."""
-    return weights['out_gate.weight'].t().to(torch.float16)
-def compiledtrimul(
-    x: torch.Tensor,
-    mask: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    w_concat: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int
-) -> torch.Tensor:
-    """
-    A barebones plain-PyTorch implementation of the TriMul logic.
-
-    Despite the name, this variant carries no `torch.compile` decorator.
-
-    Args:
-        x: input of shape (bs, s1, s2, d).
-        mask: mask expandable to (bs, s1, s2, h) along its last dimension.
-        norm_weight, norm_bias: affine parameters of the input LayerNorm over d.
-        w_concat: [d, 5*h] matrix whose column blocks are, in order,
-            left, right, left-gate, right-gate, out-gate.
-        to_out_norm_weight, to_out_norm_bias: affine parameters of the LayerNorm over h.
-        to_out_weight: [d, h] final projection weight (used transposed).
-        h: hidden dimension.
-
-    Returns:
-        Tensor of shape (bs, s1, s1, d). The out-gate has bs*s1*s2 rows and is
-        multiplied with the bs*s1*s1 rows of the pairwise product, so the
-        function is meant for s1 == s2.
-    """
-    bs, s1, s2, d = x.shape
-    # Initial LayerNorm
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
-    # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
-    all_projections = torch.mm(x_norm, w_concat)
-    # Split back into individual projections
-    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
-    # Apply mask and gates
-    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_expanded * torch.sigmoid(lg)
-    right = right * mask_expanded * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    # Move h in front so the matmul batches over (bs, h): [bs, h, s1, s2]
-    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
-    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
-    # [bs, h, s1, s2] @ [bs, h, s2, s1] -> [bs, h, s1, s1]
-    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
-    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    # Apply layer norm and final gating
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * out_gate
-    # Final projection
-    final_out_flat = gated @ to_out_weight.t()
-    # The flat result has bs*s1*s1 rows, so both pair dimensions are s1.
-    final_out = final_out_flat.view(bs, s1, s1, d)
-    return final_out
-def small_kernel_pt_path(data):
-    """Run the plain-PyTorch TriMul path on a (input, mask, weights, config) tuple."""
-    input_tensor, mask, weights, config = data
-    # Row-block order must match the chunk(5) unpacking in `compiledtrimul`:
-    # left, right, left-gate, right-gate, out-gate.
-    projection_keys = (
-        'left_proj.weight',
-        'right_proj.weight',
-        'left_gate.weight',
-        'right_gate.weight',
-        'out_gate.weight',
-    )
-    stacked = torch.cat([weights[key] for key in projection_keys], dim=0)
-    # Transpose to [d, 5h] and materialize it so the matmul sees a dense fp16 operand.
-    w_concat = stacked.t().contiguous().to(torch.float16)
-    # The input LayerNorm runs in fp32; everything after it uses fp16 parameters.
-    return compiledtrimul(
-        x=input_tensor.to(torch.float32),
-        mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"],
-    )
-def kernel_mi300(data):
-    """Entry point: pick the PyTorch path for small inputs, else the Triton pipeline.
-
-    `data` is a (input_tensor, mask, weights, config) tuple; inputs whose
-    second dimension is below 100 go through `small_kernel_pt_path`.
-    """
-    input_tensor, mask, weights, config = data
-    batch, rows, cols, _ = input_tensor.shape
-    if rows < 100:
-        return small_kernel_pt_path(data)
-    hidden = config["hidden_dim"]
-    packed_4way = pack_w_4way_efficient(weights)
-    out_gate_w = get_w_og(weights)
-    flat_rows = batch * rows * cols
-    # Broadcast the mask over the hidden dimension and flatten to [M, H].
-    # (Original note: this could possibly move into the kernel.)
-    mask_mh = (
-        mask.unsqueeze(-1)
-        .expand(-1, -1, -1, hidden)
-        .reshape(flat_rows, hidden)
-        .to(torch.float16)
-    )
-    return compiledtrimul_fused_interleaved(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=packed_4way,
-        W_og=out_gate_w,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=hidden,
-    )

build/torch-xpu/triton_a100.py DELETED Viewed

@@ -1,405 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-# Set PyTorch flags for performance. These are process-wide settings applied
-# as a side effect of importing this module: they opt CUDA matmuls in to TF32
-# and to reduced-precision fp16 reductions (faster, slightly less accurate).
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@triton.jit
-def fused_ln_dual_matmul_kernel(
-    # Pointers (9)
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (now passed as arguments from the host)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    """Fused input LayerNorm + packed 4-way projection + out-gate projection.
-
-    Each program handles BLOCK_SIZE_M rows of X (addressed as [m, k]) and
-    BLOCK_SIZE_N columns of the packed [K, 4*H] weight, whose column
-    4*h + role holds role 0..3 = left_proj, left_gate, right_proj, right_gate
-    for hidden unit h. The accumulator tile is reshaped to
-    (BLOCK_SIZE_M, H_CHUNK_SIZE, 4), so BLOCK_SIZE_N must equal
-    4 * H_CHUNK_SIZE.
-
-    Outputs:
-      * OutLeft  [b, h, s1, s2]  = left_proj  * sigmoid(left_gate)  * mask
-      * OutRight [b, h, s2, s1]  = right_proj * sigmoid(right_gate) * mask
-        (same value, written with the two spatial strides swapped)
-      * OutOG    [m, h]          = sigmoid(x_norm @ W_og), written only by
-        programs whose column tile starts below H.
-    """
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    # The last group may hold fewer than GROUP_SIZE_M row tiles.
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- LayerNorm statistics: computed once per program and shared by the
-    # 4-way and the out-gate matmuls ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_idx = k_offset + offs_k
-        k_mask = k_idx < K
-        # Apply stride_x_k here as well so the statistics read the same
-        # elements as the matmul loop below for any column stride.
-        x_ptrs = x_rows_base_ptr + k_idx[None, :] * stride_x_k
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):
-        k_idx = k_offset + offs_k
-        k_mask = k_idx < K
-        x_ptrs = x_rows_base_ptr + k_idx[None, :] * stride_x_k
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        # Masked tail columns load as 0; without this guard each of them
-        # would add mean^2 to the variance when K % BLOCK_SIZE_K != 0.
-        x_centered = tl.where(k_mask[None, :], x_chunk - mean[:, None], 0.0)
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul loop: 4-way projections (+ out-gate for the first tiles) ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_idx = k * BLOCK_SIZE_K + offs_k
-        k_mask = k_idx < K
-        x_ptrs = x_rows_base_ptr + k_idx[None, :] * stride_x_k
-        x_tile = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        # Affine parameters load as 0 past K, which zeroes the padded columns
-        # of the normalized tile.
-        nw = tl.load(Norm_Weight_ptr + k_idx, mask=k_mask, other=0.0)
-        nb = tl.load(Norm_Bias_ptr + k_idx, mask=k_mask, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)
-        w4_ptrs = w_4way_ptrs_base + k_idx[:, None] * stride_w4_k
-        w4_mask = k_mask[:, None] & (offs_n_4way[None, :] < N_4way)
-        w4_tile = tl.load(w4_ptrs, mask=w4_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w4_tile)
-        # Only the column tiles that start below H also cover out-gate columns.
-        if pid_n * BLOCK_SIZE_N < H:
-            wog_ptrs = w_og_ptrs_base + k_idx[:, None] * stride_wog_k
-            wog_mask = k_mask[:, None] & (offs_n_og[None, :] < H)
-            wog_tile = tl.load(wog_ptrs, mask=wog_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, wog_tile)
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    # De-interleave the packed columns: last axis selects the role.
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)
-    # Hidden units covered by this column tile.
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    # Decompose the flat row index m = (b * s1 + i) * s2 + j.
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    # The right operand is stored transposed: the s2 offset uses the s2 stride
-    # and the s1 offset the s1 stride of the [b, h, s2, s1] buffer.
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-@triton.jit
-def bmm_coalesced_kernel(
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Batched matmul: for every (batch, channel) pair compute Left @ Right.
-
-    Grid axis 0 enumerates the (BLOCK_SIZE_M x BLOCK_SIZE_N) tiles of one
-    s1 x s1 output matrix; grid axis 1 enumerates the bs * H (batch, channel)
-    pairs. Left is addressed as [b, h, s1, s2], Right as [b, h, s2, s1] and
-    Out as [b, h, s1, s1] through the strides passed in; the reduction runs
-    over s2.
-    """
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)
-    # Grouped ordering: consecutive program ids first walk over up to
-    # GROUP_SIZE_M row tiles of one column tile before moving to the next
-    # column tile.
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    # The last group may contain fewer than GROUP_SIZE_M row tiles.
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # Axis 1 is a flattened (batch, channel) index.
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    # Base pointers of this program's (batch, channel) matrices.
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    # Reduction over s2 in chunks of BLOCK_SIZE_K.
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        # Out-of-range rows/columns load as 0.0 so they add nothing to the dot.
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    # Store the fp32 accumulator tile; both output dimensions are bounded by s1.
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1,
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Fused LayerNorm over H, elementwise gating and projection H -> D.
-
-    Each program handles BLOCK_SIZE_M flattened rows and BLOCK_SIZE_N output
-    columns. A flattened row index m is decomposed as (b, r, c) with
-    m = (b * s1 + r) * s1 + c and used to address In as [b, h, r, c] and Out
-    as [b, r, c, d]; Gate is addressed directly as [m, h]. The row
-    decomposition therefore assumes M == bs * s1 * s1.
-
-    Stage 1 accumulates sum and sum-of-squares over H to get mean / rstd.
-    Stage 2 re-reads the row, normalizes it, applies NormW / NormB, multiplies
-    by the gate and accumulates the dot product with ProjW (addressed as
-    [d, h] via stride_proj_d / stride_proj_h).
-    """
-    # --- Grid and PID setup (grouped ordering over row tiles) ---
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    # The last group may hold fewer than GROUP_SIZE_M row tiles.
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    n_mask = offs_n < D
-    # Decompose the flat row index into (batch, row, column).
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    # --- Stage 1: LayerNorm statistics in a single pass over H ---
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        # Masked elements load as 0 and so contribute nothing to either sum.
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    mean = sum_x / H
-    # E[x^2] - E[x]^2 can round to a small negative number when the true
-    # variance is tiny relative to mean^2; clamp so rsqrt never sees a
-    # negative argument (which would yield NaN or a huge rstd).
-    var = tl.maximum((sum_x2 / H) - (mean * mean), 0.0)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # --- Stage 2: normalize, gate and project ---
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        mk_mask = m_mask[:, None] & k_mask[None, :]
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=mk_mask, other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        # Affine parameters load as 0 beyond H, and the gate loads as 0 for
-        # masked positions, so padded columns end up exactly 0 after gating.
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        gate = tl.load(gate_ptrs, mask=mk_mask, other=0.0)
-        a_gated = a_norm * gate
-        # ProjW tile is read transposed: rows follow k (H), columns follow n (D).
-        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0)
-        # Cast the activations to the weight dtype so tl.dot sees matching operands.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    # --- Store the output tile ---
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_n[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & n_mask[None, :])
-def compiledtrimul_fused_interleaved_final(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor,
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    """Run the three-kernel Triton TriMul pipeline with fixed launch configs.
-
-    Kernel 1 (`fused_ln_dual_matmul_kernel`) writes the gated/masked left
-    operand as (bs, h, s1, s2), the right operand transposed as
-    (bs, h, s2, s1) and the sigmoid out-gate as (bs*s1*s2, h).
-    Kernel 2 (`bmm_coalesced_kernel`) multiplies them into (bs, h, s1, s1).
-    Kernel 3 (`fused_final_kernel`) applies LayerNorm, the out-gate and the
-    final projection, producing the fp16 result of shape (bs, s1, s1, d).
-
-    Args:
-        x: 4-D input (bs, s1, s2, d).
-        mask_mh: mask addressed by the kernel as [row, h] via its two strides.
-        norm_weight, norm_bias: affine parameters of the input LayerNorm.
-        W_4way: packed projection matrix (see `pack_w_4way_efficient`).
-        W_og: out-gate matrix (see `get_w_og`).
-        to_out_norm_weight, to_out_norm_bias: affine parameters of the output LayerNorm.
-        to_out_weight: final projection weight, passed with its two strides.
-        h: hidden dimension H.
-
-    Raises:
-        ValueError: if s1 != s2. The final kernel indexes the (bs, s1, s1, .)
-            buffers with bs*s1*s2 flattened rows, which is only in bounds and
-            complete for square inputs.
-    """
-    bs, s1, s2, d = x.shape
-    if s1 != s2:
-        raise ValueError(
-            f"TriMul fused path requires a square pair dimension, got s1={s1}, s2={s2}"
-        )
-    M, K, H = bs * s1 * s2, d, h
-    # reshape (not view) so a non-contiguous input is accepted; it stays a
-    # zero-copy view whenever the layout allows it.
-    x_flat = x.reshape(M, K)
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # --- Kernel 1: Fused LN + Dual Matmul ---
-    N_4way = 4 * H
-    # Hardcoded A100 best config: M128-N128-K32-GM8-HC32-W8-S2
-    # (BLOCK_SIZE_N must stay equal to 4 * H_CHUNK_SIZE for the kernel's reshape.)
-    config_k1 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}
-    grid_k1 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid_k1](
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        M, H, K, s1, s2,
-        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
-        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
-        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        LN_EPS=1e-5, **config_k1, num_warps=8, num_stages=2
-    )
-    # --- Kernel 2: Batched Matrix Multiplication ---
-    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
-    # Hardcoded A100 best config: M128-N64-K32-GM8-W4-S3
-    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    grid_k2 = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)
-    bmm_coalesced_kernel[grid_k2](
-        left_final, right_final_t, bmm_out_tmp,
-        bs, s1, s2, H,
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        **config_k2, num_warps=4, num_stages=3
-    )
-    # --- Kernel 3: Fully Fused Final Stage ---
-    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
-    # Hardcoded A100 best config: M32-N128-K32-GM8-W4-S3
-    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    grid_k3 = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)
-    fused_final_kernel[grid_k3](
-        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
-        M, H, d, s1,
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
-        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
-        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
-    )
-    return final_out
-def pack_w_4way_efficient(weights):
-    """Interleave the four projection/gate weights into one fp16 [K, 4*H] matrix.
-
-    Column 4*i + r of the result holds row i of the r-th weight, with r
-    ordered as left_proj, left_gate, right_proj, right_gate.
-    """
-    left_w = weights['left_proj.weight']
-    left_gate_w = weights['left_gate.weight']
-    right_w = weights['right_proj.weight']
-    right_gate_w = weights['right_gate.weight']
-    hidden, in_features = left_w.shape
-    # Stacking along dim=1 produces the contiguous [H, 4, K] layout directly,
-    # so the four roles of one hidden unit end up in adjacent rows.
-    grouped = torch.stack([left_w, left_gate_w, right_w, right_gate_w], dim=1)
-    packed_rows = grouped.view(4 * hidden, in_features)
-    return packed_rows.t().to(torch.float16)
-def get_w_og(weights):
-    """Return the out-gate weight transposed to [K, H] and cast to fp16."""
-    out_gate_weight = weights['out_gate.weight']
-    transposed = out_gate_weight.t()
-    return transposed.to(torch.float16)
-@torch.compile()
-def compiledtrimul(
-    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
-    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor, h: int
-) -> torch.Tensor:
-    """TriMul forward pass in plain PyTorch, wrapped in `torch.compile`.
-
-    `w_concat` is a [d, 5*h] matrix whose column blocks are, in order, left,
-    right, left-gate, right-gate and out-gate. Returns a tensor of shape
-    (bs, s1, s1, d).
-    """
-    bs, s1, s2, d = x.shape
-    half = torch.float16
-    # Input LayerNorm, flattened to [M, d] and cast for the fp16 matmul.
-    normed_in = F.layer_norm(x, (d,), norm_weight, norm_bias)
-    normed_in = normed_in.view((bs * s1 * s2, d)).to(half)
-    # One matmul for all five projections: [M, d] @ [d, 5h].
-    fused = torch.mm(normed_in, w_concat)
-    left, right, left_gate, right_gate, out_gate_logits = fused.chunk(5, dim=1)
-    flat_mask = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * flat_mask * torch.sigmoid(left_gate)
-    right = right * flat_mask * torch.sigmoid(right_gate)
-    out_gate = torch.sigmoid(out_gate_logits)
-    # Move h in front so the matmul batches over (bs, h).
-    left_bh = left.view(bs, s1, s2, h).permute(0,3,1,2).to(half)
-    right_bh = right.view(bs, s1, s2, h).permute(0,3,1,2).to(half)
-    pair = torch.matmul(left_bh, right_bh.transpose(-1, -2))
-    pair_flat = pair.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    # Output LayerNorm over h, then the sigmoid out-gate.
-    pair_normed = F.layer_norm(pair_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(half)
-    gated = pair_normed * out_gate
-    projected = gated @ to_out_weight.t()
-    return projected.view(bs, s1, s1, d)
-def small_kernel_pt_path(data):
-    """Run the compiled PyTorch TriMul path on a (input, mask, weights, config) tuple."""
-    input_tensor, mask, weights, config = data
-    # Row-block order must match the chunk(5) unpacking in `compiledtrimul`:
-    # left, right, left-gate, right-gate, out-gate.
-    projection_keys = (
-        'left_proj.weight',
-        'right_proj.weight',
-        'left_gate.weight',
-        'right_gate.weight',
-        'out_gate.weight',
-    )
-    stacked = torch.cat([weights[key] for key in projection_keys], dim=0)
-    # Transpose to [d, 5h] and materialize it so the matmul sees a dense fp16 operand.
-    w_concat = stacked.t().contiguous().to(torch.float16)
-    # The input LayerNorm runs in fp32; everything after it uses fp16 parameters.
-    return compiledtrimul(
-        x=input_tensor.to(torch.float32),
-        mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"],
-    )
-def kernel_a100(data):
-    """Entry point: pick the PyTorch path for small inputs, else the Triton pipeline.
-
-    `data` is a (input_tensor, mask, weights, config) tuple; inputs whose
-    second dimension is below 512 go through `small_kernel_pt_path`.
-    """
-    input_tensor, mask, weights, config = data
-    batch, rows, cols, _ = input_tensor.shape
-    # Threshold adjusted based on observed BMM configs.
-    if rows < 512:
-        return small_kernel_pt_path(data)
-    hidden = config["hidden_dim"]
-    packed_4way = pack_w_4way_efficient(weights)
-    out_gate_w = get_w_og(weights)
-    flat_rows = batch * rows * cols
-    # Broadcast the mask over the hidden dimension and flatten to [M, H].
-    mask_mh = (
-        mask.unsqueeze(-1)
-        .expand(-1, -1, -1, hidden)
-        .reshape(flat_rows, hidden)
-        .to(torch.float16)
-    )
-    return compiledtrimul_fused_interleaved_final(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=packed_4way,
-        W_og=out_gate_w,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=hidden,
-    )

build/torch-xpu/triton_b200.py DELETED Viewed

@@ -1,411 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-torch.backends.cuda.matmul.allow_tf32 = True  # import-time side effect: sets a process-wide PyTorch flag permitting TF32 in CUDA matmuls
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True  # import-time side effect: permits reduced-precision reductions in fp16 matmuls
-@triton.jit
-def fused_ln_dual_matmul_kernel(  # LayerNorm over K, then the packed 4-way projection and out-gate matmuls, then gating/masking and stores
-    # Pointers (9): six inputs followed by three outputs
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18; the original comment said 16, but 18 stride arguments are declared below)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (now passed as arguments from the host)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n  # programs are numbered in groups spanning GROUP_SIZE_M tile-rows
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)  # the last group may hold fewer tile-rows
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm calculation (done only ONCE) ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M  # guards rows of a partial last tile
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):  # first pass over K: per-row sum
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]  # NOTE(review): column offset is not scaled by stride_x_k here (the matmul loop below does scale it) - presumably relies on unit column stride; confirm
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):  # second pass over K: per-row sum of squared deviations
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]  # same unscaled column offset as in the first pass
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        x_centered = x_chunk - mean[:, None]  # masked-out columns were loaded as 0.0, so they become -mean before squaring
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul Loop 1: For the 4-Way Projections ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)  # same column offsets as offs_n_4way, bounded by H (not 4*H) when used
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K;
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)  # normalised tile is cast to float16 before the dot products
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Only programs whose first column index is below H also accumulate the out_gate product
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)  # reuses the normalised x tile computed above
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))  # element counts must match, i.e. BLOCK_SIZE_N == 4 * H_CHUNK_SIZE
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)  # each where+sum keeps exactly one slot of the last axis
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)  # slot order 0..3 matches the [WL, WLG, WR, WRG] stacking in pack_w_4way_efficient
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)  # hidden indices covered by this column tile
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2  # decompose the flat row index into (batch, s1 index, s2 index)
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1)  # right output is addressed with its own (bs, h, s2, s1) strides
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-@triton.jit
-def bmm_coalesced_kernel(  # Tiled matmul of Left by Right for one (batch, hidden) pair per program along grid axis 1
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Grid and program IDs (axis 0: output tile, axis 1: combined batch/hidden index)
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)  # both output dimensions are bounded by s1
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)  # the last group may hold fewer tile-rows
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H  # split the combined index into batch and hidden indices
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):  # the reduction runs over s2
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # out-of-range elements load as 0.0 and add nothing to the dot product
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)  # float32 accumulator is written through Out_ptr
-@triton.jit
-def fused_final_kernel(  # LayerNorm over H of the strided input, multiply by the gate, then project with ProjW into D output columns
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1,
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)  # the last group may hold fewer tile-rows
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    s1s1 = s1 * s1
-    b = offs_m // s1s1  # decompose the flat row index into (batch, row, col) over an s1 x s1 grid
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):  # single pass over H accumulating sum and sum of squares
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    mean = sum_x / H
-    var = (sum_x2 / H) - (mean * mean)  # E[x^2] - mean^2 form; NOTE(review): rounding can make this slightly negative, only LN_EPS guards the rsqrt
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):  # second pass over H: normalise, gate and accumulate the projection
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        proj_ptrs = ProjW_ptr + offs_n[None, :] * stride_proj_d + offs_k[:, None] * stride_proj_h  # ProjW is read as a (k, n) tile through its strides
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h  # gate is indexed by the flat row index offs_m
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)  # masked columns load gate 0.0, so they contribute nothing after gating
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)  # gated activations are cast to the weight dtype for the dot
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)  # same values as offs_n
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved_final(  # Host driver: allocates intermediates and launches the three Triton kernels in sequence, returning final_out
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor,
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    bs, s1, s2, d = x.shape  # x must be 4-D
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    x_flat = x.view(M, K)  # view (not reshape): requires x to be viewable as (M, K) without a copy
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)  # allocated with s2 and s1 swapped relative to left_final
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # --- Kernel 1: Fused LN + Dual Matmul ---
-    # The grid is launched for the larger 4*H problem
-    N_4way = 4 * H
-    # Hardcoded best config from logs: M64-N128-K64-GM8-HC32-W4-S2
-    config_k1 = {'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}  # BLOCK_SIZE_N equals 4 * H_CHUNK_SIZE, as the kernel's reshape requires
-    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid](
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        M, H, K, s1, s2,
-        x_flat.stride(0), x_flat.stride(1), W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1), left_final.stride(0), left_final.stride(1),
-        left_final.stride(2), left_final.stride(3), right_final_t.stride(0), right_final_t.stride(1),
-        right_final_t.stride(2), right_final_t.stride(3), og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        LN_EPS=1e-5, **config_k1, num_warps=4, num_stages=2
-    )
-    # --- Kernel 2: Batched Matrix Multiplication ---
-    bmm_out_tmp = torch.empty((bs, H, s1, s1), device=x.device, dtype=torch.float16)
-    # Hardcoded best config from logs: M128-N128-K32-GM8-W8-S3
-    config_k2 = {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    grid_bmm = lambda meta: (triton.cdiv(s1, meta['BLOCK_SIZE_M']) * triton.cdiv(s1, meta['BLOCK_SIZE_N']), bs * H)  # axis 1 enumerates every (batch, hidden) pair
-    bmm_coalesced_kernel[grid_bmm](
-        left_final, right_final_t, bmm_out_tmp,
-        bs, s1, s2, H,
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        **config_k2, num_warps=8, num_stages=3
-    )
-    # --- Kernel 3: Fully Fused Final Stage ---
-    final_out = torch.empty((bs, s1, s1, d), device=x.device, dtype=torch.float16)
-    # Hardcoded best config from logs: M32-N128-K32-GM8-W4-S3
-    config_k3 = {'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}
-    grid_final = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(d, meta['BLOCK_SIZE_N']),)  # NOTE(review): M is bs*s1*s2 while final_out has bs*s1*s1 rows - equal only when s1 == s2; confirm
-    fused_final_kernel[grid_final](
-        bmm_out_tmp, og_mh, to_out_norm_weight, to_out_norm_bias, to_out_weight, final_out,
-        M, H, d, s1,
-        bmm_out_tmp.stride(0), bmm_out_tmp.stride(1), bmm_out_tmp.stride(2), bmm_out_tmp.stride(3),
-        og_mh.stride(0), og_mh.stride(1), to_out_weight.stride(0), to_out_weight.stride(1),
-        final_out.stride(0), final_out.stride(1), final_out.stride(2), final_out.stride(3),
-        LN_EPS=1e-5, **config_k3, num_warps=4, num_stages=3
-    )
-    return final_out
-def pack_w_4way_efficient(weights):  # weights is indexed by the four '<name>.weight' keys listed below
-    """ Packs L, LG, R, RG into a tight [K, 4*H] matrix. """
-    WL, WLG, WR, WRG = (weights[k] for k in ['left_proj.weight', 'left_gate.weight', 'right_proj.weight', 'right_gate.weight'])
-    H, K = WL.shape  # WL must be 2-D; the view below needs all four matrices to total 4*H*K elements
-    ws = torch.stack([WL, WLG, WR, WRG], dim=0).permute(1, 0, 2).contiguous().view(4 * H, K)  # (4, H, K) -> (H, 4, K) -> (4*H, K): row index is 4*h + role, so roles are interleaved per h
-    return ws.t().to(torch.float16)  # transposed to (K, 4*H); .t() alone does not make the result contiguous
-def get_w_og(weights):  # weights is indexed by the key 'out_gate.weight'
-    """ Gets the transposed [K, H] out_gate weight matrix. """
-    return weights['out_gate.weight'].t().to(torch.float16)  # transposed and cast to float16
-@torch.compile()
-def compiledtrimul(  # Pure-PyTorch version of the whole computation: LayerNorm, five projections, gated batched matmul, LayerNorm, gate, output projection
-    x: torch.Tensor, mask: torch.Tensor, norm_weight: torch.Tensor, norm_bias: torch.Tensor,
-    w_concat: torch.Tensor, to_out_norm_weight: torch.Tensor, to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor, h: int
-) -> torch.Tensor:
-    bs, s1, s2, d = x.shape  # x must be 4-D
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)  # normalise over the last dim, flatten to 2-D, cast to float16
-    all_projections = torch.mm(x_norm, w_concat)  # one matmul produces all five projections side by side
-    left, right, lg, rg, og = all_projections.chunk(5, dim=1)  # column-block order: left, right, left gate, right gate, out gate
-    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)  # mask must be 4-D here and expandable to h along its last dim
-    left = left * mask_expanded * torch.sigmoid(lg)
-    right = right * mask_expanded * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    left = left.view(bs, s1, s2, h).permute(0,3,1,2)  # to (bs, h, s1, s2)
-    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
-    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))  # per (batch, h): (s1, s2) @ (s2, s1) -> (s1, s1)
-    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * out_gate  # out_gate has bs*s1*s2 rows while normed has bs*s1*s1 rows
-    final_out_flat = gated @ to_out_weight.t()
-    return final_out_flat.view(bs, s1, s1, d)  # the view needs to_out_weight to have d rows
-def small_kernel_pt_path(data):  # Small-input path: packs the five projection weights into one matrix and delegates to compiledtrimul.
-    input_tensor, mask, weights, config = data  # data unpacks into exactly these four items
-    w_concat = torch.cat([
-        weights['left_proj.weight'], weights['right_proj.weight'], weights['left_gate.weight'],
-        weights['right_gate.weight'], weights['out_gate.weight']
-    ], dim=0).t().contiguous().to(torch.float16)  # order matches the chunk(5) unpacking in compiledtrimul: left, right, left gate, right gate, out gate
-    return compiledtrimul(
-        x=input_tensor.to(torch.float32), mask=mask.unsqueeze(-1),  # input is cast to float32; mask gains a trailing singleton dim
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32), w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"]
-    )
-def kernel_b200(data):  # Entry point: dispatches to small_kernel_pt_path or to the fused Triton pipeline depending on s1.
-    input_tensor, mask, weights, config = data  # data unpacks into exactly these four items
-    bs, s1, s2, d = input_tensor.shape  # input_tensor must be 4-D; d is not used below
-    if s1 < 800:  # inputs with s1 below 800 take the torch.compile path
-        return small_kernel_pt_path(data)
-    H = config["hidden_dim"]
-    W_4way = pack_w_4way_efficient(weights)
-    W_og = get_w_og(weights)
-    M = bs * s1 * s2  # row count after flattening the first three dims of the input
-    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, H).reshape(M, H).to(torch.float16)  # mask broadcast across H, reshaped to (M, H) and cast to float16
-    return compiledtrimul_fused_interleaved_final(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=W_4way,
-        W_og=W_og,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=H,
-    )

build/torch-xpu/triton_h100.py DELETED Viewed

@@ -1,509 +0,0 @@
-import torch
-import torch.nn.functional as F
-import triton
-import triton.language as tl
-torch.backends.cuda.matmul.allow_tf32 = True  # import-time side effect: sets a process-wide PyTorch flag permitting TF32 in CUDA matmuls
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True  # import-time side effect: permits reduced-precision reductions in fp16 matmuls
-@triton.autotune(
-    configs=[  # every config below keeps BLOCK_SIZE_N == 4 * H_CHUNK_SIZE, which the kernel's reshape requires
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16},  num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 16}, num_warps=4, num_stages=5),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 64}, num_warps=4, num_stages=5),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64,  'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'H_CHUNK_SIZE': 32}, num_warps=2, num_stages=4),
-    ],
-    key=['M', 'N', 'K'],  # NOTE(review): 'N' is not a parameter of this kernel (its size arguments are M, H, K, s1, s2) - confirm whether 'H' was intended
-)
-@triton.jit
-def fused_ln_dual_matmul_kernel(  # LayerNorm over K, then the packed 4-way projection and out-gate matmuls, then gating/masking and stores
-    # Pointers (9): six inputs followed by three outputs
-    X_ptr, W_4way_ptr, W_og_ptr, Mask_ptr, Norm_Weight_ptr, Norm_Bias_ptr,
-    OutLeft_ptr, OutRight_ptr, OutOG_ptr,
-    # Metadata (5)
-    M, H, K, s1, s2,
-    # Strides (18; the original comment said 16, but 18 stride arguments are declared below)
-    stride_x_m, stride_x_k,
-    stride_w4_k, stride_w4_n,
-    stride_wog_k, stride_wog_n,
-    stride_ol_bs, stride_ol_h, stride_ol_s1, stride_ol_s2,
-    stride_or_t_bs, stride_or_t_h, stride_or_t_s2, stride_or_t_s1,
-    stride_og_m, stride_og_h,
-    stride_mask_m, stride_mask_h,
-    # Constexpr (from decorator and kwargs)
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr, H_CHUNK_SIZE: tl.constexpr,
-):
-    # --- PID Mapping: Based on the LARGER 4*H problem ---
-    pid = tl.program_id(axis=0)
-    N_4way = 4 * H
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N_4way, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n  # programs are numbered in groups spanning GROUP_SIZE_M tile-rows
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)  # the last group may hold fewer tile-rows
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    # --- SHARED LayerNorm calculation (done only ONCE) ---
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    m_mask = offs_m < M  # guards rows of a partial last tile
-    x_rows_base_ptr = X_ptr + offs_m[:, None] * stride_x_m
-    mean = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):  # first pass over K: per-row sum
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]  # NOTE(review): column offset is not scaled by stride_x_k here (the matmul loop below does scale it) - presumably relies on unit column stride; confirm
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        mean += tl.sum(x_chunk, axis=1)
-    mean /= K
-    var = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    for k_offset in range(0, K, BLOCK_SIZE_K):  # second pass over K: per-row sum of squared deviations
-        k_chunk_offs = tl.arange(0, BLOCK_SIZE_K)
-        x_ptrs = x_rows_base_ptr + (k_offset + k_chunk_offs)[None, :]  # same unscaled column offset as in the first pass
-        k_mask = (k_offset + k_chunk_offs) < K
-        x_chunk = tl.load(x_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        x_centered = x_chunk - mean[:, None]  # masked-out columns were loaded as 0.0, so they become -mean before squaring
-        var += tl.sum(x_centered * x_centered, axis=1)
-    var /= K
-    rstd = 1.0 / tl.sqrt(var + LN_EPS)
-    # --- Matmul Loop 1: For the 4-Way Projections ---
-    offs_n_4way = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    w_4way_ptrs_base = W_4way_ptr + (offs_n_4way[None, :] * stride_w4_n)
-    accumulator_4way = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator_og = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    offs_n_og = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)  # same column offsets as offs_n_4way, bounded by H (not 4*H) when used
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
-        k_block_start = k * BLOCK_SIZE_K;
-        x_ptrs = x_rows_base_ptr + (k_block_start + offs_k)[None, :] * stride_x_k
-        w_ptrs = w_4way_ptrs_base + (k_block_start + offs_k)[:, None] * stride_w4_k
-        x_mask = (offs_m[:, None] < M) & ((k_block_start + offs_k)[None, :] < K)
-        w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_4way[None, :] < N_4way)
-        x_tile = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float32)
-        norm_w_ptrs = Norm_Weight_ptr + k_block_start + offs_k
-        norm_b_ptrs = Norm_Bias_ptr + k_block_start + offs_k
-        nw = tl.load(norm_w_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        nb = tl.load(norm_b_ptrs, mask=(k_block_start + offs_k) < K, other=0.0)
-        x_norm_tile = (x_tile - mean[:, None]) * rstd[:, None]
-        x_norm_tile = (x_norm_tile * nw[None, :] + nb[None, :]).to(tl.float16)  # normalised tile is cast to float16 before the dot products
-        w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-        accumulator_4way += tl.dot(x_norm_tile, w_tile)
-        # Only programs whose first column index is below H also accumulate the out_gate product
-        if pid_n * BLOCK_SIZE_N < H:
-            w_og_ptrs_base = W_og_ptr + (offs_n_og[None, :] * stride_wog_n)
-            w_ptrs = w_og_ptrs_base + (k_block_start + offs_k)[:, None] * stride_wog_k
-            w_mask = ((k_block_start + offs_k)[:, None] < K) & (offs_n_og[None, :] < H);
-            w_tile = tl.load(w_ptrs, mask=w_mask, other=0.0)
-            accumulator_og += tl.dot(x_norm_tile, w_tile)  # reuses the normalised x tile computed above
-    if pid_n * BLOCK_SIZE_N < H:
-        og_out = tl.sigmoid(accumulator_og)
-        outg_ptrs = OutOG_ptr + offs_m[:, None] * stride_og_m + offs_n_og[None, :] * stride_og_h
-        og_mask = m_mask[:, None] & (offs_n_og[None, :] < H)
-        tl.store(outg_ptrs, og_out, mask=og_mask)
-    # --- Fusion Logic for 4-Way Part ---
-    acc_reshaped = tl.reshape(accumulator_4way, (BLOCK_SIZE_M, H_CHUNK_SIZE, 4))  # element counts must match, i.e. BLOCK_SIZE_N == 4 * H_CHUNK_SIZE
-    role_idx = tl.arange(0, 4)[None, None, :]
-    left_proj  = tl.sum(tl.where(role_idx == 0, acc_reshaped, 0.0), axis=2)  # each where+sum keeps exactly one slot of the last axis
-    left_gate  = tl.sum(tl.where(role_idx == 1, acc_reshaped, 0.0), axis=2)
-    right_proj = tl.sum(tl.where(role_idx == 2, acc_reshaped, 0.0), axis=2)
-    right_gate = tl.sum(tl.where(role_idx == 3, acc_reshaped, 0.0), axis=2)  # presumably W_4way interleaves the four roles per hidden index in this slot order - verify against the packing helper
-    offs_h_chunk = (pid_n * H_CHUNK_SIZE) + tl.arange(0, H_CHUNK_SIZE)  # hidden indices covered by this column tile
-    mask_ptrs = Mask_ptr + offs_m[:, None] * stride_mask_m + offs_h_chunk[None, :] * stride_mask_h
-    m_mask_h = m_mask[:, None] & (offs_h_chunk[None, :] < H)
-    mask_tile = tl.load(mask_ptrs, mask=m_mask_h, other=0.0)
-    left_out = left_proj * tl.sigmoid(left_gate) * mask_tile
-    right_out = right_proj * tl.sigmoid(right_gate) * mask_tile
-    s1s2 = s1 * s2
-    offs_b  = offs_m // s1s2  # decompose the flat row index into (batch, s1 index, s2 index)
-    offs_s1 = (offs_m % s1s2) // s2
-    offs_s2 = offs_m % s2
-    offs_b_2d  = tl.reshape(offs_b,  (BLOCK_SIZE_M, 1))
-    offs_h_2d  = tl.reshape(offs_h_chunk, (1, H_CHUNK_SIZE))
-    offs_s1_2d = tl.reshape(offs_s1, (BLOCK_SIZE_M, 1))
-    offs_s2_2d = tl.reshape(offs_s2, (BLOCK_SIZE_M, 1))
-    outl_ptrs = OutLeft_ptr + (offs_b_2d * stride_ol_bs + offs_h_2d * stride_ol_h +
-                                     offs_s1_2d * stride_ol_s1 + offs_s2_2d * stride_ol_s2)
-    outr_ptrs_t = OutRight_ptr + (offs_b_2d * stride_or_t_bs + offs_h_2d * stride_or_t_h +
-                                          offs_s2_2d * stride_or_t_s2 + offs_s1_2d * stride_or_t_s1) # s2 offset uses s2 stride, s1 offset uses s1 stride
-    tl.store(outl_ptrs, left_out, mask=m_mask_h)
-    tl.store(outr_ptrs_t, right_out, mask=m_mask_h)
-@triton.autotune(
-    configs=[  # candidate tile shapes, warp counts and pipeline stages tried by the autotuner
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-    ],
-    key=['s1', 's2', 'H'],  # all three names are parameters of the kernel below
-)
-@triton.jit
-def bmm_coalesced_kernel(  # Tiled matmul of Left by Right for one (batch, hidden) pair per program along grid axis 1
-    # Pointers
-    Left_ptr, Right_ptr, Out_ptr,
-    # Dimensions
-    bs, s1, s2, H,
-    # Strides
-    stride_l_bs, stride_l_h, stride_l_s1, stride_l_s2,
-    stride_r_bs, stride_r_h, stride_r_s2, stride_r_s1,
-    stride_o_bs, stride_o_h, stride_o_s1, stride_o_s2,
-    # Kernel parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    # Grid and program IDs (axis 0: output tile, axis 1: combined batch/hidden index)
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(s1, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(s1, BLOCK_SIZE_N)  # both output dimensions are bounded by s1
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)  # the last group may hold fewer tile-rows
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    pid_bh = tl.program_id(axis=1)
-    pid_b = pid_bh // H  # split the combined index into batch and hidden indices
-    pid_h = pid_bh % H
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    left_ptrs_base = Left_ptr + pid_b * stride_l_bs + pid_h * stride_l_h
-    right_ptrs_base = Right_ptr + pid_b * stride_r_bs + pid_h * stride_r_h
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(s2, BLOCK_SIZE_K)):  # the reduction runs over s2
-        k_start = k * BLOCK_SIZE_K
-        a_ptrs = left_ptrs_base + (offs_m[:, None] * stride_l_s1 + (k_start + offs_k[None, :]) * stride_l_s2)
-        b_ptrs = right_ptrs_base + ((k_start + offs_k[:, None]) * stride_r_s2 + offs_n[None, :] * stride_r_s1)
-        a_mask = (offs_m[:, None] < s1) & ((k_start + offs_k[None, :]) < s2)
-        b_mask = ((k_start + offs_k[:, None]) < s2) & (offs_n[None, :] < s1)
-        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # out-of-range elements load as 0.0 and add nothing to the dot product
-        b = tl.load(b_ptrs, mask=b_mask, other=0.0)
-        accumulator += tl.dot(a, b)
-    # --- Coalesced Write ---
-    # Write to a standard (bs, H, s1, s1) layout
-    out_ptrs = Out_ptr + pid_b * stride_o_bs + pid_h * stride_o_h + \
-               offs_m[:, None] * stride_o_s1 + offs_n[None, :] * stride_o_s2
-    c_mask = (offs_m[:, None] < s1) & (offs_n[None, :] < s1)
-    tl.store(out_ptrs, accumulator, mask=c_mask)  # float32 accumulator is written through Out_ptr
-@torch.compile
-def torch_pt2(left_final, right_final_t, bs, s1, s2, d, h, to_out_norm_weight, to_out_norm_bias, og_mh, to_out_weight):  # PyTorch final stage: batched matmul, LayerNorm, gating and output projection
-    bmm_out = torch.matmul(left_final, right_final_t)  # presumably (bs, h, s1, s2) @ (bs, h, s2, s1) - confirm against the caller
-    out_einsum_flat = bmm_out.permute(0, 2, 3, 1).reshape(bs * s1 * s1, h)  # move dim 1 last and flatten to bs*s1*s1 rows of length h
-    # Apply layer norm over the last dim (size h), cast to float16, then apply the gate
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * og_mh  # og_mh must broadcast against the (bs*s1*s1, h) tensor
-    # Final projection
-    final_out_flat = gated @ to_out_weight.t()
-    final_out = final_out_flat.view(bs, s1, s2, d)  # NOTE(review): rows were flattened as bs*s1*s1 above, so this view only succeeds when s2 == s1; confirm intent
-    return final_out
-# Autotuning candidates: each config fixes the tile sizes, the M-grouping factor,
-# the warp count and the number of pipeline stages. `key` names the kernel
-# arguments (H and D) listed as the autotuning key.
-@triton.autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=3),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_warps=8, num_stages=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_warps=4, num_stages=4),
-    ],
-    key=['H', 'D'],
-)
-@triton.jit
-def fused_final_kernel(
-    # Pointers
-    In_ptr, Gate_ptr, NormW_ptr, NormB_ptr, ProjW_ptr, Out_ptr,
-    # Metadata
-    M, H, D, s1, # original note: M_gate = bs*s1*s2; rows are decoded below with s1*s1 - NOTE(review): confirm s1 == s2
-    # Strides
-    stride_in_bs, stride_in_h, stride_in_s1_row, stride_in_s1_col,
-    stride_gate_m, stride_gate_h,
-    stride_proj_d, stride_proj_h,
-    stride_out_bs, stride_out_s1_row, stride_out_s1_col, stride_out_d,
-    # Constants
-    LN_EPS: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    """Fused LayerNorm -> gate -> projection, tiled as an (M x D) matmul reducing over H.
-
-    Each program computes one BLOCK_SIZE_M x BLOCK_SIZE_N tile of the output.
-    A flat row index m is decoded into (b, r, c) using s1*s1 and s1; those
-    indices address In_ptr and Out_ptr through the supplied strides, while
-    Gate_ptr is addressed with the flat row index directly.
-
-    Pass 1 accumulates per-row sum and sum of squares over H in float32 to get
-    mean and reciprocal standard deviation. Pass 2 reloads the same values,
-    normalizes them, applies NormW/NormB, multiplies by the gate, and
-    accumulates tl.dot against the matching ProjW tile. The float32
-    accumulator is stored to Out_ptr under the row and column masks.
-    """
-    # --- Grid and PID Setup for Matmul ---
-    # 1D launch grid; programs are grouped GROUP_SIZE_M row-tiles at a time before
-    # being mapped to a (pid_m, pid_n) tile coordinate.
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(D, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    # The last group may hold fewer than GROUP_SIZE_M row-tiles.
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    m_mask = offs_m < M
-    # Decompose M back to (b, r, c) for reordering lookups
-    s1s1 = s1 * s1
-    b = offs_m // s1s1
-    r = (offs_m % s1s1) // s1
-    c = offs_m % s1
-    # --- Pass 1: LayerNorm statistics over H ---
-    sum_x = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    sum_x2 = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
-    # Per-row base address; the H offset is added inside the loops.
-    in_ptr_base = In_ptr + b * stride_in_bs + r * stride_in_s1_row + c * stride_in_s1_col
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        # Masked-out lanes load 0.0, so they contribute nothing to either sum.
-        in_chunk = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0).to(tl.float32)
-        # Accumulate sum and sum of squares in one pass
-        sum_x += tl.sum(in_chunk, axis=1)
-        sum_x2 += tl.sum(in_chunk * in_chunk, axis=1)
-    # Finalize statistics
-    mean = sum_x / H
-    # NOTE(review): E[x^2] - mean^2 can round slightly negative; only LN_EPS
-    # protects the rsqrt below - consider clamping var at 0.
-    var = (sum_x2 / H) - (mean * mean)
-    rstd = tl.math.rsqrt(var + LN_EPS)
-    # --- Pass 2: Fused Normalize, Gating and Matmul ---
-    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for k_offset in range(0, H, BLOCK_SIZE_K):
-        offs_k = k_offset + tl.arange(0, BLOCK_SIZE_K)
-        k_mask = offs_k < H
-        in_ptrs = in_ptr_base[:, None] + offs_k[None, :] * stride_in_h
-        a = tl.load(in_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_norm = (a - mean[:, None]) * rstd[:, None]
-        norm_w = tl.load(NormW_ptr + offs_k, mask=k_mask, other=0.0)
-        norm_b = tl.load(NormB_ptr + offs_k, mask=k_mask, other=0.0)
-        a_norm = a_norm * norm_w[None, :] + norm_b[None, :]
-        # ProjW tile is read as (BLOCK_SIZE_K, BLOCK_SIZE_N): k along rows, n along columns.
-        proj_ptrs = ProjW_ptr + \
-                    offs_n[None, :] * stride_proj_d + \
-                    offs_k[:, None] * stride_proj_h
-        # The gate is indexed by the flat row index, not by the decoded (b, r, c).
-        gate_ptrs = Gate_ptr + offs_m[:, None] * stride_gate_m + offs_k[None, :] * stride_gate_h
-        # Gate loads 0.0 for lanes beyond H, which zeroes their contribution to the dot product.
-        gate = tl.load(gate_ptrs, mask=m_mask[:, None] & k_mask[None, :], other=0.0)
-        a_gated = a_norm * gate
-        b_w = tl.load(proj_ptrs, mask=k_mask[:, None] & (offs_n[None, :] < D), other=0.0)
-        # Cast the gated activations to the weight dtype so both dot operands match.
-        acc += tl.dot(a_gated.to(b_w.dtype), b_w)
-    # --- Store Final Output ---
-    offs_d = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    out_ptr_base = Out_ptr + b*stride_out_bs + r*stride_out_s1_row + c*stride_out_s1_col
-    out_ptrs = out_ptr_base[:, None] + offs_d[None, :] * stride_out_d
-    tl.store(out_ptrs, acc, mask=m_mask[:, None] & (offs_d[None, :] < D))
-def compiledtrimul_fused_interleaved(
-    x: torch.Tensor,
-    mask_mh: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    W_4way: torch.Tensor, # Use the new weight matrices
-    W_og: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int,
-):
-    bs, s1, s2, d = x.shape
-    M, K, H = bs * s1 * s2, x.shape[-1], h
-    x_flat = x.view(M, K)
-    left_final  = torch.empty((bs, H, s1, s2), device=x.device, dtype=torch.float16)
-    right_final_t = torch.empty((bs, H, s2, s1), device=x.device, dtype=torch.float16)
-    og_mh = torch.empty((M, H), device=x.device, dtype=torch.float16)
-    # The grid is launched for the larger 4*H problem
-    N_4way = 4 * H
-    grid = lambda meta: (triton.cdiv(M, meta['BLOCK_SIZE_M']) * triton.cdiv(N_4way, meta['BLOCK_SIZE_N']),)
-    fused_ln_dual_matmul_kernel[grid](
-        # Pointers (9)
-        x_flat, W_4way, W_og, mask_mh, norm_weight, norm_bias,
-        left_final, right_final_t, og_mh,
-        # Metadata (5) - M, H, K, s1, s2
-        M, H, K, s1, s2,
-        # Strides (16)
-        x_flat.stride(0), x_flat.stride(1),
-        W_4way.stride(0), W_4way.stride(1),
-        W_og.stride(0), W_og.stride(1),
-        left_final.stride(0), left_final.stride(1), left_final.stride(2), left_final.stride(3),
-        right_final_t.stride(0), right_final_t.stride(1), right_final_t.stride(2), right_final_t.stride(3),
-        og_mh.stride(0), og_mh.stride(1),
-        mask_mh.stride(0), mask_mh.stride(1),
-        # Constexpr (1)
-        LN_EPS=1e-5
-    )
-    return torch_pt2(
-        left_final, right_final_t,
-        bs=bs,
-        s1=s1,
-        s2=s2,
-        d=d,
-        h=h,
-        to_out_norm_weight=to_out_norm_weight,
-        to_out_norm_bias=to_out_norm_bias,
-        og_mh=og_mh,
-        to_out_weight=to_out_weight
-    )
-def pack_w_4way_efficient(weights):
-    """ Packs L, LG, R, RG into a tight [K, 4*H] matrix. """
-    WL = weights['left_proj.weight']
-    WLG = weights['left_gate.weight']
-    WR = weights['right_proj.weight']
-    WRG = weights['right_gate.weight']
-    H, K = WL.shape
-    ws = torch.stack([WL, WLG, WR, WRG], dim=0).permute(1, 0, 2)
-    ws = ws.contiguous().view(4 * H, K)
-    return ws.t().to(torch.float16)
-def get_w_og(weights):
-    """ Gets the transposed [K, H] out_gate weight matrix. """
-    WOG = weights['out_gate.weight']
-    return WOG.t().to(torch.float16)
-# Module-level side effect: both CUDA matmul backend flags are switched on when
-# this module is imported.
-# NOTE(review): these are global PyTorch settings, so they also affect matmuls
-# issued by any other code running in the same process - confirm that is intended.
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
-@torch.compile
-def compiledtrimul(
-    x: torch.Tensor,
-    mask: torch.Tensor,
-    norm_weight: torch.Tensor,
-    norm_bias: torch.Tensor,
-    w_concat: torch.Tensor,
-    to_out_norm_weight: torch.Tensor,
-    to_out_norm_bias: torch.Tensor,
-    to_out_weight: torch.Tensor,
-    h: int
-) -> torch.Tensor:
-    """
-    A barebones, compiled PyTorch function for the TriMul logic.
-    """
-    bs, s1, s2, d = x.shape
-    # Initial LayerNorm
-    x_norm = F.layer_norm(x, (d,), norm_weight, norm_bias).view((bs * s1 * s2, d)).to(torch.float16)
-    # Single large matmul: [M, d] @ [d, 5h] = [M, 5h]
-    all_projections = torch.mm(x_norm, w_concat)
-    # Split back into individual projections
-    left, right, lg, rg, og = all_projections.chunk(5, dim=1)
-    # Apply mask and gates
-    mask_expanded = mask.expand(-1, -1, -1, h).reshape(-1, h)
-    left = left * mask_expanded * torch.sigmoid(lg)
-    right = right * mask_expanded * torch.sigmoid(rg)
-    out_gate = torch.sigmoid(og)
-    # Reshape for einsum
-    left = left.view(bs, s1, s2, h).permute(0,3,1,2)
-    right = right.view(bs, s1, s2, h).permute(0,3,1,2)
-    out_p = torch.matmul(left.to(torch.float16), right.to(torch.float16).transpose(-1, -2))
-    out_einsum_flat = out_p.permute(0,2,3,1).reshape(bs * s1 * s1, h)
-    # Apply layer norm and final gating
-    normed = F.layer_norm(out_einsum_flat, (h,), to_out_norm_weight, to_out_norm_bias).to(torch.float16)
-    gated = normed * out_gate
-    # Final projection
-    final_out_flat = gated @ to_out_weight.t()
-    final_out = final_out_flat.view(bs, s1, s2, d)
-    return final_out
-def small_kernel_pt_path(data):
-    input_tensor, mask, weights, config = data
-    w_concat = torch.cat([
-        weights['left_proj.weight'],
-        weights['right_proj.weight'],
-        weights['left_gate.weight'],
-        weights['right_gate.weight'],
-        weights['out_gate.weight']
-    ], dim=0).t().contiguous().to(torch.float16)
-    # Call the compiled function with prepared weights
-    output = compiledtrimul(
-        x=input_tensor.to(torch.float32),
-        mask=mask.unsqueeze(-1),
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        w_concat=w_concat,
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float32),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float32),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=config["hidden_dim"]
-    )
-    return output
-def kernel_h100(data):
-    input_tensor, mask, weights, config = data
-    bs, s1, s2, d = input_tensor.shape
-    if s1 <= 512:
-        return small_kernel_pt_path(data)
-    H = config["hidden_dim"]
-    W_4way = pack_w_4way_efficient(weights)
-    W_og = get_w_og(weights)
-    M = bs * s1 * s2
-    mask_mh = mask.unsqueeze(-1).expand(-1, -1, -1, H).reshape(M, H).to(torch.float16) #move into kernel possibly
-    return compiledtrimul_fused_interleaved(
-        x=input_tensor.to(torch.float32),
-        mask_mh=mask_mh,
-        norm_weight=weights['norm.weight'].to(torch.float32),
-        norm_bias=weights['norm.bias'].to(torch.float32),
-        W_4way=W_4way, # Pass the new 4-way matrix
-        W_og=W_og,     # Pass the new out_gate matrix
-        to_out_norm_weight=weights['to_out_norm.weight'].to(torch.float16),
-        to_out_norm_bias=weights['to_out_norm.bias'].to(torch.float16),
-        to_out_weight=weights['to_out.weight'].to(torch.float16),
-        h=H,
-    )