Kernels:

kernels-community
/

liger-kernels

Trusted publisher

Kernel card Files Files and versions

xet

Community

kernels-bot committed 9 days ago

Commit

0c953b9

verified ·

1 Parent(s): 741aabc

Uploaded using `kernel-builder`.

Browse files

Files changed (18) hide show

build/torch-cuda/__init__.py +2 -1
build/torch-cuda/_ops.py +1 -1
build/torch-cuda/cross_entropy.py +163 -65
build/torch-cuda/dyt.py +121 -182
build/torch-cuda/fused_linear_cross_entropy.py +152 -35
build/torch-cuda/geglu.py +7 -5
build/torch-cuda/group_norm.py +22 -16
build/torch-cuda/jsd.py +1 -1
build/torch-cuda/kl_div.py +9 -12
build/torch-cuda/layer_norm.py +139 -84
build/torch-cuda/layers.py +457 -33
build/torch-cuda/metadata.json +1 -1
build/torch-cuda/qwen2vl_mrope.py +1 -1
build/torch-cuda/rms_norm.py +390 -101
build/torch-cuda/rope.py +2 -2
build/torch-cuda/swiglu.py +75 -15
build/torch-cuda/tvd.py +18 -7
build/torch-cuda/utils.py +42 -1

build/torch-cuda/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from . import layers
-__all__ = ["layers"]

 from . import layers
+from .layers import CrossEntropyOutput, LigerForCausalLMLoss
+__all__ = ["layers", "LigerForCausalLMLoss", "CrossEntropyOutput"]

build/torch-cuda/_ops.py CHANGED Viewed

@@ -22,7 +22,7 @@ def get_backend() -> str:
 def _find_ops_name() -> str:
     kernel_name = "liger_kernels"
-    unique_id = "e29f7ec"
     backend = get_backend()
     return f"_{kernel_name}_{backend}_{unique_id}"

 def _find_ops_name() -> str:
     kernel_name = "liger_kernels"
+    unique_id = "08b4d53"
     backend = get_backend()
     return f"_{kernel_name}_{backend}_{unique_id}"

build/torch-cuda/cross_entropy.py CHANGED Viewed

@@ -10,8 +10,9 @@ from .utils import compare_version
 from .utils import element_mul_kernel
 from .utils import is_hip
 from .utils import infer_device
-if compare_version("triton", operator.ge, "3.0.0"):
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
@@ -32,6 +33,10 @@ def liger_cross_entropy_kernel(
     loss_ptr,
     z_loss_ptr,
     loss_stride,
     n_cols,
     n_non_ignore,
     sum_non_ignore_weight,
@@ -42,9 +47,12 @@ def liger_cross_entropy_kernel(
     reduction: tl.constexpr,  # set it as constexpr since reduction is always known at compile time
     softcap,
     RETURN_Z_LOSS: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
     HAS_WEIGHT: tl.constexpr,
     HAS_SOFTCAPPING: tl.constexpr,
 ):
     """
     This kernel computes both cross entropy loss and the gradient of the input.
@@ -59,6 +67,8 @@ def liger_cross_entropy_kernel(
     loss_ptr: Pointer to tensor to store the loss.
     z_loss_ptr: Pointer to tensor to store the z loss. No operation if RETURN_Z_LOSS is 0.
     loss_stride (int): The stride of the loss tensor.
     n_cols (int): The number of columns in the input tensor.
     n_non_ignore (float): The number of non-ignored elements in the batch.
     sum_non_ignore_weight (float): The sum of non-ignored target's weights in the batch.
@@ -68,10 +78,12 @@ def liger_cross_entropy_kernel(
     lse_square_scale (float): The scale factor applied to (logsumexp(_input)) ^ 2, which is added to the loss to stabilize training.
     reduction (str): The string for the reduction to apply
     softcap (float): The upper threshold for scaling logits to the range (-softcap, +softcap).
-    RETURN_Z_LOSS (int): The boolean value to decide whether storing z loss to z_loss_ptr or not. It must be 0 or 1.
     BLOCK_SIZE (int): The block size for Triton operations.
     HAS_WEIGHT (bool): The boolean value to determine whether to assign a weight to each of the classes.
     HAS_SOFTCAPPING (bool): The boolean value to determine whether to apply soft-capping or not.
     """
     # https://github.com/triton-lang/triton/issues/1058
@@ -90,11 +102,22 @@ def liger_cross_entropy_kernel(
         for i in range(0, n_cols, BLOCK_SIZE):
             X_offsets = i + tl.arange(0, BLOCK_SIZE)
             tl.store(X_ptr + X_offsets, 0.0, mask=X_offsets < n_cols)
         return
     loss_ptr += program_id * loss_stride
     if RETURN_Z_LOSS:
         z_loss_ptr += program_id * loss_stride
     if HAS_WEIGHT:
         weight_y = tl.load(weight_ptr + y).cast(tl.float32)
@@ -105,6 +128,7 @@ def liger_cross_entropy_kernel(
     # 3. [Online softmax] first pass: find max + sum
     m = float("-inf")  # m is the max value. use the notation from the paper
     d = 0.0  # d is the sum. use the notation from the paper
     ori_X_y = tl.load(X_ptr + y).cast(tl.float32)  # we need to store the original value of X_y for the loss calculation
     if HAS_SOFTCAPPING:
         ori_X_y = softcap * tanh(ori_X_y / softcap)
@@ -125,6 +149,19 @@ def liger_cross_entropy_kernel(
         if HAS_SOFTCAPPING:
             X_block = softcap * tanh(X_block / softcap)
         block_max = tl.max(X_block)
         if label_smoothing > 0:
             # scale X beforehand to avoid overflow
             if HAS_WEIGHT:
@@ -155,58 +192,58 @@ def liger_cross_entropy_kernel(
     # For 'sum' reduction, no normalization is applied:
     # dx_y = softmax(x_y) - 1
     # dx_i = softmax(x_i), for i ≠ y
-    for i in range(0, n_cols, BLOCK_SIZE):
-        X_offsets = i + tl.arange(0, BLOCK_SIZE)
-        X_block = tl.load(
-            X_ptr + X_offsets,
-            mask=X_offsets < n_cols,
-            other=float("-inf"),
-            # Ensure float32 precision for softmax calculation
-        ).cast(tl.float32)
-        if HAS_SOFTCAPPING:
-            intermediate = tanh(X_block / softcap)
-            X_block = softcap * intermediate
-        if not HAS_WEIGHT:
-            # softmax(x_i)
-            X_block = tl.exp(X_block - m) / d
-            # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
-            X_block += 2 * lse_square_scale * lse * X_block
-            # smoothing term
-            X_block += -eps
-            # special handle dx_y
-            X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
-            # reduction scale
-            if reduction == "mean":
-                X_block = X_block / n_non_ignore
-        else:
-            weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
-            softmax_X = tl.exp(X_block - m) / d
-            # derivative of original_loss
-            dloss_ori = (1 - label_smoothing) * softmax_X
-            # specially handle dx_y
-            dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing))
-            dloss_ori = dloss_ori * weight_y
-            # derivative of smooth_loss
-            dloss_smooth = eps * (-weight_block + softmax_X * weight_sum)
-            # derivative of z-loss
-            dz_loss = 2 * lse_square_scale * lse * softmax_X
-            # reduction scale
-            if reduction == "mean":
-                dloss_ori = dloss_ori / sum_non_ignore_weight
-                dloss_smooth = dloss_smooth / sum_non_ignore_weight
-                # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
-                dz_loss = dz_loss / n_non_ignore
-            # derivative of total_loss
-            X_block = dloss_ori + dloss_smooth + dz_loss
-        # chain rule softcapping
-        # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap))
-        if HAS_SOFTCAPPING:
-            X_block = X_block * (1 - intermediate * intermediate)
-        tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)
     # We need tl.debug_barrier() to ensure the new result of X_ptr is written as mentioned in
     # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
@@ -254,12 +291,24 @@ def liger_cross_entropy_kernel(
     tl.store(loss_ptr, loss)
     if RETURN_Z_LOSS:
         tl.store(z_loss_ptr, z_loss)
 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
 # The optimal maximum block size depends on your hardware, your kernel, and your dtype
-MAX_FUSED_SIZE = 4096 if infer_device() == "xpu" else 65536 // 2  # the best size we found by manually tuning
 def cross_entropy_forward(
@@ -272,8 +321,16 @@ def cross_entropy_forward(
     reduction,
     softcap,
     return_z_loss,
 ):
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
     BT, V = _input.shape
     n_rows = BT
@@ -283,6 +340,12 @@ def cross_entropy_forward(
     # unreduced loss
     loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device)
     z_loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device) if return_z_loss else None
     target_mask = target != ignore_index
     n_non_ignore = target_mask.sum().item()
@@ -319,6 +382,14 @@ def cross_entropy_forward(
         loss_ptr=loss_1d,
         z_loss_ptr=z_loss_1d,
         loss_stride=loss_1d.stride(-1),  # always 1
         n_cols=V,
         n_non_ignore=n_non_ignore,
         sum_non_ignore_weight=sum_non_ignore_weight,
@@ -329,9 +400,12 @@ def cross_entropy_forward(
         reduction=reduction,
         softcap=softcap,
         RETURN_Z_LOSS=return_z_loss,
         BLOCK_SIZE=BLOCK_SIZE,
         HAS_WEIGHT=True if weight is not None else False,
         HAS_SOFTCAPPING=True if softcap is not None else False,
         # TODO: 32 seems to give the best performance
         # Performance is quite sensitive to num_warps
         num_warps=32 if not is_hip() else 16,
@@ -340,11 +414,16 @@ def cross_entropy_forward(
     if reduction == "none":
         loss = loss_1d
         z_loss = z_loss_1d if return_z_loss else None
     else:
         loss = torch.sum(loss_1d)
         z_loss = torch.sum(z_loss_1d) if return_z_loss else None
-    return loss, z_loss, _input
 def cross_entropy_backward(_input, grad_output):
@@ -392,6 +471,8 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         reduction: str = "mean",
         softcap: Optional[float] = None,
         return_z_loss: bool = False,
     ):
         """
         The forward pass of the Liger Cross Entropy loss.
@@ -406,12 +487,16 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         reduction (str): The reduction to apply to the output: "none" | "mean" | "sum".
         softcap (Optional[float]): The upper threshold for scaling logits to the range (-softcap, +softcap).
-        return_z_loss (bool): When `return_z_loss` is `True`, returns (loss, z_loss) instead of (loss, None). Default: `False`
         Returns:
-        tuple: A tuple with the compouted losses with respect to loss and z loss. The elements are tensors or None.
         """
-        loss, z_loss, _input = cross_entropy_forward(
             _input,
             target,
             weight,
@@ -421,29 +506,40 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
             reduction,
             softcap,
             return_z_loss,
         )
         # TODO: investigate
         # If we don't detach the _input tensor, memory usage doubles.
         # The cause is unclear; it seems that at some point both the gradient and the value exist, but in different locations.
-        ctx.save_for_backward(_input.detach())
         ctx.return_z_loss = return_z_loss
-        return loss, z_loss
     @staticmethod
-    def backward(ctx, grad_output, grad_ouput2):
         """
         The backward pass of the Liger Cross Entropy loss.
         Parameters:
         ctx : The context object with saved tensors.
         grad_output (tensor): The tensor containing the gradient of the loss with respect to the output.
-        grad_output2 (tenosr): No use.
         Returns:
         tuple: A tuple with the gradients with respect to the inputs. The elements are tensors or None.
         """
         if ctx.return_z_loss:
-            del grad_ouput2  # z_loss is only for logging
         (_input,) = ctx.saved_tensors
         _input = cross_entropy_backward(_input, grad_output)
@@ -457,4 +553,6 @@ class LigerCrossEntropyFunction(torch.autograd.Function):
             None,
             None,
             None,
-        )

 from .utils import element_mul_kernel
 from .utils import is_hip
 from .utils import infer_device
+from .utils import is_npu_available
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
     loss_ptr,
     z_loss_ptr,
     loss_stride,
+    token_accuracy_ptr,
+    token_accuracy_stride,
+    predicted_tokens_ptr,
+    predicted_tokens_stride,
     n_cols,
     n_non_ignore,
     sum_non_ignore_weight,
     reduction: tl.constexpr,  # set it as constexpr since reduction is always known at compile time
     softcap,
     RETURN_Z_LOSS: tl.constexpr,
+    RETURN_TOKEN_ACCURACY: tl.constexpr,
+    RETURN_PREDICTED_TOKENS: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
     HAS_WEIGHT: tl.constexpr,
     HAS_SOFTCAPPING: tl.constexpr,
+    HAS_GRADIENTS: tl.constexpr,
 ):
     """
     This kernel computes both cross entropy loss and the gradient of the input.
     loss_ptr: Pointer to tensor to store the loss.
     z_loss_ptr: Pointer to tensor to store the z loss. No operation if RETURN_Z_LOSS is 0.
     loss_stride (int): The stride of the loss tensor.
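+    token_accuracy_ptr: Pointer to tensor to store the per-token accuracy. No operation if RETURN_TOKEN_ACCURACY is 0.
+    token_accuracy_stride (int): The stride of the token accuracy tensor.
+    predicted_tokens_ptr: Pointer to tensor to store the per-token predicted class index (argmax); -1 is stored for ignored tokens. No operation if RETURN_PREDICTED_TOKENS is 0.
+    predicted_tokens_stride (int): The stride of the predicted tokens tensor.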
     n_cols (int): The number of columns in the input tensor.
     n_non_ignore (float): The number of non-ignored elements in the batch.
     sum_non_ignore_weight (float): The sum of non-ignored target's weights in the batch.
     lse_square_scale (float): The scale factor applied to (logsumexp(_input)) ^ 2, which is added to the loss to stabilize training.
     reduction (str): The string for the reduction to apply
     softcap (float): The upper threshold for scaling logits to the range (-softcap, +softcap).
+    RETURN_Z_LOSS (int): The boolean value to decide whether to store z loss to z_loss_ptr or not. It must be 0 or 1.
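+    RETURN_TOKEN_ACCURACY (int): The boolean value to decide whether to store per-token accuracy to token_accuracy_ptr or not. It must be 0 or 1.
+    RETURN_PREDICTED_TOKENS (int): The boolean value to decide whether to store the per-token predicted class index to predicted_tokens_ptr or not. It must be 0 or 1.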
     BLOCK_SIZE (int): The block size for Triton operations.
     HAS_WEIGHT (bool): The boolean value to determine whether to assign a weight to each of the classes.
     HAS_SOFTCAPPING (bool): The boolean value to determine whether to apply soft-capping or not.
+    HAS_GRADIENTS (bool): The boolean value to determine whether to calculate gradients in the forward pass.
     """
     # https://github.com/triton-lang/triton/issues/1058
         for i in range(0, n_cols, BLOCK_SIZE):
             X_offsets = i + tl.arange(0, BLOCK_SIZE)
             tl.store(X_ptr + X_offsets, 0.0, mask=X_offsets < n_cols)
+        # For ignored tokens, set token accuracy to 0
+        if RETURN_TOKEN_ACCURACY:
+            token_accuracy_ptr += program_id * token_accuracy_stride
+            tl.store(token_accuracy_ptr, 0.0)
+        if RETURN_PREDICTED_TOKENS:
+            predicted_tokens_ptr += program_id * predicted_tokens_stride
+            tl.store(predicted_tokens_ptr, -1)
         return
     loss_ptr += program_id * loss_stride
     if RETURN_Z_LOSS:
         z_loss_ptr += program_id * loss_stride
+    if RETURN_TOKEN_ACCURACY:
+        token_accuracy_ptr += program_id * token_accuracy_stride
+    if RETURN_PREDICTED_TOKENS:
+        predicted_tokens_ptr += program_id * predicted_tokens_stride
     if HAS_WEIGHT:
         weight_y = tl.load(weight_ptr + y).cast(tl.float32)
     # 3. [Online softmax] first pass: find max + sum
     m = float("-inf")  # m is the max value. use the notation from the paper
     d = 0.0  # d is the sum. use the notation from the paper
+    argmax_idx = 0  # Track the index of the maximum value for token accuracy / predicted tokens computation
     ori_X_y = tl.load(X_ptr + y).cast(tl.float32)  # we need to store the original value of X_y for the loss calculation
     if HAS_SOFTCAPPING:
         ori_X_y = softcap * tanh(ori_X_y / softcap)
         if HAS_SOFTCAPPING:
             X_block = softcap * tanh(X_block / softcap)
         block_max = tl.max(X_block)
+        # Track argmax for accuracy / predicted tokens computation
+        if RETURN_TOKEN_ACCURACY or RETURN_PREDICTED_TOKENS:
+            # Find the index of the maximum value in this block
+            is_max_mask = X_block == block_max
+            # Mask out invalid indices with a value larger than n_cols
+            masked_offsets = tl.where(is_max_mask, X_offsets, n_cols)
+            # Get the first (smallest) index where max occurs
+            current_block_argmax_idx = tl.min(masked_offsets)
+            is_new_max = block_max > m
+            argmax_idx = tl.where(is_new_max, current_block_argmax_idx, argmax_idx)
         if label_smoothing > 0:
             # scale X beforehand to avoid overflow
             if HAS_WEIGHT:
     # For 'sum' reduction, no normalization is applied:
     # dx_y = softmax(x_y) - 1
     # dx_i = softmax(x_i), for i ≠ y
+    if HAS_GRADIENTS:
+        for i in range(0, n_cols, BLOCK_SIZE):
+            X_offsets = i + tl.arange(0, BLOCK_SIZE)
+            X_block = tl.load(
+                X_ptr + X_offsets,
+                mask=X_offsets < n_cols,
+                other=float("-inf"),
+                # Ensure float32 precision for softmax calculation
+            ).cast(tl.float32)
+            if HAS_SOFTCAPPING:
+                intermediate = tanh(X_block / softcap)
+                X_block = softcap * intermediate
+            if not HAS_WEIGHT:
+                # softmax(x_i)
+                X_block = tl.exp(X_block - m) / d
+                # derivative of z-loss: 2 * lse_square_scale * lse * softmax(x_i)
+                X_block += 2 * lse_square_scale * lse * X_block
+                # smoothing term
+                X_block += -eps
+                # special handle dx_y
+                X_block = tl.where(X_offsets != y, X_block, X_block - (1 - label_smoothing))
+                # reduction scale
+                if reduction == "mean":
+                    X_block = X_block / n_non_ignore
+            else:
+                weight_block = tl.load(weight_ptr + X_offsets, mask=X_offsets < n_cols)
+                softmax_X = tl.exp(X_block - m) / d
+                # derivative of original_loss
+                dloss_ori = (1 - label_smoothing) * softmax_X
+                # specially handle dx_y
+                dloss_ori = tl.where(X_offsets != y, dloss_ori, dloss_ori - (1 - label_smoothing))
+                dloss_ori = dloss_ori * weight_y
+                # derivative of smooth_loss
+                dloss_smooth = eps * (-weight_block + softmax_X * weight_sum)
+                # derivative of z-loss
+                dz_loss = 2 * lse_square_scale * lse * softmax_X
+                # reduction scale
+                if reduction == "mean":
+                    dloss_ori = dloss_ori / sum_non_ignore_weight
+                    dloss_smooth = dloss_smooth / sum_non_ignore_weight
+                    # TODO: Implement weighted z_loss. Currently, z_loss is not scaled by weight.
+                    dz_loss = dz_loss / n_non_ignore
+                # derivative of total_loss
+                X_block = dloss_ori + dloss_smooth + dz_loss
+            # chain rule softcapping
+            # d(softcap * tanh(x / softcap)) = (1 - tanh^2(x / softcap))
+            if HAS_SOFTCAPPING:
+                X_block = X_block * (1 - intermediate * intermediate)
+            tl.store(X_ptr + X_offsets, X_block, mask=X_offsets < n_cols)
     # We need tl.debug_barrier() to ensure the new result of X_ptr is written as mentioned in
     # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
     tl.store(loss_ptr, loss)
     if RETURN_Z_LOSS:
         tl.store(z_loss_ptr, z_loss)
+    if RETURN_TOKEN_ACCURACY:
+        # Store 1.0 if prediction is correct, 0.0 otherwise
+        is_correct = 1.0 if argmax_idx == y else 0.0
+        tl.store(token_accuracy_ptr, is_correct)
+    if RETURN_PREDICTED_TOKENS:
+        tl.store(predicted_tokens_ptr, argmax_idx)
 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
 # The optimal maximum block size depends on your hardware, your kernel, and your dtype
+# the best size we found by manually tuning on xpu and npu.
+if infer_device() == "xpu":
+    MAX_FUSED_SIZE = 4096
+elif infer_device() == "npu":
+    MAX_FUSED_SIZE = 2048
+else:
+    MAX_FUSED_SIZE = 65536 // 2
 def cross_entropy_forward(
     reduction,
     softcap,
     return_z_loss,
+    return_token_accuracy=False,
+    return_predicted_tokens=False,
 ):
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
+    assert isinstance(return_token_accuracy, bool), (
+        f"return_token_accuracy must be True or False. Got: {return_token_accuracy}"
+    )
+    assert isinstance(return_predicted_tokens, bool), (
+        f"return_predicted_tokens must be True or False. Got: {return_predicted_tokens}"
+    )
     BT, V = _input.shape
     n_rows = BT
     # unreduced loss
     loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device)
     z_loss_1d = torch.zeros(n_rows, dtype=_input.dtype, device=_input.device) if return_z_loss else None
+    token_accuracy_1d = (
+        torch.zeros(n_rows, dtype=torch.float32, device=_input.device) if return_token_accuracy else None
+    )
+    predicted_tokens_1d = (
+        torch.full((n_rows,), -1, dtype=torch.int64, device=_input.device) if return_predicted_tokens else None
+    )
     target_mask = target != ignore_index
     n_non_ignore = target_mask.sum().item()
         loss_ptr=loss_1d,
         z_loss_ptr=z_loss_1d,
         loss_stride=loss_1d.stride(-1),  # always 1
+        token_accuracy_ptr=token_accuracy_1d,
+        token_accuracy_stride=token_accuracy_1d.stride(-1)
+        if return_token_accuracy
+        else 0,  # always 1 if accuracy is enabled
+        predicted_tokens_ptr=predicted_tokens_1d,
+        predicted_tokens_stride=predicted_tokens_1d.stride(-1)
+        if return_predicted_tokens
+        else 0,  # always 1 if predicted tokens is enabled
         n_cols=V,
         n_non_ignore=n_non_ignore,
         sum_non_ignore_weight=sum_non_ignore_weight,
         reduction=reduction,
         softcap=softcap,
         RETURN_Z_LOSS=return_z_loss,
+        RETURN_TOKEN_ACCURACY=return_token_accuracy,
+        RETURN_PREDICTED_TOKENS=return_predicted_tokens,
         BLOCK_SIZE=BLOCK_SIZE,
         HAS_WEIGHT=True if weight is not None else False,
         HAS_SOFTCAPPING=True if softcap is not None else False,
+        HAS_GRADIENTS=_input.requires_grad,
         # TODO: 32 seems to give the best performance
         # Performance is quite sensitive to num_warps
         num_warps=32 if not is_hip() else 16,
     if reduction == "none":
         loss = loss_1d
         z_loss = z_loss_1d if return_z_loss else None
+        token_accuracy = token_accuracy_1d if return_token_accuracy else None
     else:
         loss = torch.sum(loss_1d)
         z_loss = torch.sum(z_loss_1d) if return_z_loss else None
+        # For accuracy, we compute the mean across all non-ignored tokens
+        token_accuracy = torch.sum(token_accuracy_1d) / n_non_ignore if return_token_accuracy else None
+    predicted_tokens = predicted_tokens_1d if return_predicted_tokens else None
+    return loss, z_loss, token_accuracy, predicted_tokens, _input
 def cross_entropy_backward(_input, grad_output):
         reduction: str = "mean",
         softcap: Optional[float] = None,
         return_z_loss: bool = False,
+        return_token_accuracy: bool = False,
+        return_predicted_tokens: bool = False,
     ):
         """
         The forward pass of the Liger Cross Entropy loss.
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         reduction (str): The reduction to apply to the output: "none" | "mean" | "sum".
         softcap (Optional[float]): The upper threshold for scaling logits to the range (-softcap, +softcap).
+        return_z_loss (bool): When `return_z_loss` is `True`, the `z_loss` element of the returned tuple (loss, z_loss, token_accuracy, predicted_tokens) holds the z loss instead of `None`. Default: `False`
+        return_token_accuracy (bool): When `return_token_accuracy` is `True`, computes and returns token accuracy without materializing logits: per-token values when `reduction` is "none", otherwise the mean over non-ignored tokens. Default: `False`
+        return_predicted_tokens (bool): When `return_predicted_tokens` is `True`, returns per-token predicted class indices (argmax) without materializing logits. Default: `False`
         Returns:
+        tuple: A tuple with the computed losses, accuracy, and predicted tokens: (loss, z_loss, token_accuracy, predicted_tokens). z_loss, token_accuracy, and predicted_tokens are None if not requested.
         """
+        input_requires_grad = _input.requires_grad
+        loss, z_loss, token_accuracy, predicted_tokens, _input = cross_entropy_forward(
             _input,
             target,
             weight,
             reduction,
             softcap,
             return_z_loss,
+            return_token_accuracy,
+            return_predicted_tokens,
         )
         # TODO: investigate
         # If we don't detach the _input tensor, memory usage doubles.
         # The cause is unclear; it seems that at some point both the gradient and the value exist, but in different locations.
+        if input_requires_grad:
+            ctx.save_for_backward(_input.detach())
         ctx.return_z_loss = return_z_loss
+        ctx.return_token_accuracy = return_token_accuracy
+        ctx.return_predicted_tokens = return_predicted_tokens
+        return loss, z_loss, token_accuracy, predicted_tokens
     @staticmethod
+    def backward(ctx, grad_output, grad_output2, grad_output3, grad_output4):
         """
         The backward pass of the Liger Cross Entropy loss.
         Parameters:
         ctx : The context object with saved tensors.
         grad_output (tensor): The tensor containing the gradient of the loss with respect to the output.
+        grad_output2 (tensor): No use. Gradient for z_loss (not used as z_loss is only for logging).
+        grad_output3 (tensor): No use. Gradient for token_accuracy (not used as token_accuracy is only for metrics).
+        grad_output4 (tensor): No use. Gradient for predicted_tokens (not used as predicted_tokens is only for metrics).
         Returns:
         tuple: A tuple with the gradients with respect to the inputs. The elements are tensors or None.
         """
         if ctx.return_z_loss:
+            del grad_output2  # z_loss is only for logging
+        if ctx.return_token_accuracy:
+            del grad_output3  # token_accuracy is only for metrics
+        if ctx.return_predicted_tokens:
+            del grad_output4  # predicted_tokens is only for metrics
         (_input,) = ctx.saved_tensors
         _input = cross_entropy_backward(_input, grad_output)
             None,
             None,
             None,
+            None,
+            None,
+        )

build/torch-cuda/dyt.py CHANGED Viewed

@@ -4,12 +4,13 @@ import torch
 import triton
 import triton.language as tl
-from .utils import calculate_settings
 from .utils import compare_version
 from .utils import ensure_contiguous
 from .utils import infer_device
-if compare_version("triton", operator.ge, "3.0.0"):
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
@@ -20,187 +21,131 @@ else:
     from triton.language.math import tanh
 @triton.jit
-def _dyt_fwd_kernel(
-    x_ptr,
-    x_row_stride,
-    alpha_ptr,
-    gamma_ptr,
-    beta_ptr,
-    y_ptr,
-    y_row_stride,
-    n_cols,
-    BLOCK_SIZE: tl.constexpr,
-):
-    """
-    Reference:
-    https://arxiv.org/abs/2503.10622
-    Shapes:
-        - x: (BT, C)
-        - alpha: (1)
-        - gamma: (C)
-        - beta: (C)
-    """
-    row_idx = tl.program_id(0)
-    offsets = tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_cols
-    x_ptr += row_idx * x_row_stride
-    y_ptr += row_idx * y_row_stride
-    alpha = tl.load(alpha_ptr)
-    gamma = tl.load(gamma_ptr + offsets, mask=mask)
-    beta = tl.load(beta_ptr + offsets, mask=mask)
-    x = tl.load(x_ptr + offsets, mask=mask)
-    y = gamma * tanh((alpha * x).cast(tl.float32)) + beta
-    tl.store(y_ptr + offsets, y, mask=mask)
 @triton.jit
 def _dyt_bwd_kernel(
-    x_ptr,
-    x_row_stride,
-    dy_ptr,
-    dy_row_stride,
-    dx_ptr,
-    dx_row_stride,
-    alpha_ptr,
-    dalpha_ptr,
-    gamma_ptr,
-    dgamma_ptr,
-    dgamma_row_stride,
-    n_cols,
-    n_rows,
-    ROWS_PER_PROGRAM: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
 ):
-    """
-    Reference:
-    https://arxiv.org/abs/2503.10622
-    Shapes:
-        - x: (BT, C)
-        - alpha: (1)
-        - gamma: (C)
-        - dx: (BT, C)
-        - dy: (BT, C)
-        - dgamma: (sm_count, C)
-        - dalpha: (sm_count,)
-    """
-    # d(gamma * tanh(alpha * x) + beta) / dx
-    # = gamma * (1 - tanh^2(alpha * x)) * alpha
-    # d(gamma * tanh(alpha * x) + beta) / dalpha
-    # = gamma * (1 - tanh^2(alpha * x)) * x
-    # d(gamma * tanh(alpha * x) + beta) / dgamma
-    # = tanh(alpha * x)
-    # d(gamma * tanh(alpha * x)) / dbeta = 1
-    pid = tl.program_id(0)
-    row_start = pid * ROWS_PER_PROGRAM
-    row_end = min((pid + 1) * ROWS_PER_PROGRAM, n_rows)
-    offsets = tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_cols
-    dalpha = 0.0
-    dgamma = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
-    x_ptr += row_start * x_row_stride
-    dx_ptr += row_start * dx_row_stride
-    dy_ptr += row_start * dy_row_stride
-    alpha = tl.load(alpha_ptr)
-    gamma = tl.load(gamma_ptr + offsets, mask=mask, other=0.0)
-    for _ in tl.range(row_start, row_end):
-        dy = tl.load(dy_ptr + offsets, mask=mask, other=0.0)
-        x = tl.load(x_ptr + offsets, mask=mask, other=0.0)
-        tanh_ax = tanh((alpha * x).cast(tl.float32))
-        sech2_ax = 1 - tanh_ax * tanh_ax
-        dx = dy * gamma * sech2_ax * alpha
-        dalpha += tl.sum(dy * gamma * sech2_ax * x)
-        dgamma += dy * tanh_ax
-        tl.store(dx_ptr + offsets, dx, mask=mask)
-        dy_ptr += dy_row_stride
-        x_ptr += x_row_stride
-        dx_ptr += dx_row_stride
-    tl.store(dgamma_ptr + pid * dgamma_row_stride + offsets, dgamma, mask=mask)
-    tl.store(dalpha_ptr + pid, dalpha)
-    pass
 def liger_dyt_fwd(x, alpha, gamma, beta):
-    shape = x.shape
-    dim = shape[-1]
-    x = x.view(-1, dim)
-    n_rows, n_cols = x.shape
     y = torch.empty_like(x)
-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    _dyt_fwd_kernel[(n_rows,)](
-        x_ptr=x,
-        alpha_ptr=alpha,
-        gamma_ptr=gamma,
-        beta_ptr=beta,
-        y_ptr=y,
-        x_row_stride=x.stride(0),
-        y_row_stride=y.stride(0),
-        n_cols=n_cols,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
     )
-    return y.view(*shape)
-def liger_dyt_bwd(dy, x, alpha, gamma):
-    shape = dy.shape
-    dtype = x.dtype
-    dim = shape[-1]
-    dy = dy.view(-1, dim)
-    x = x.view(-1, dim)
-    n_rows, n_cols = dy.shape
-    BLOCK_SIZE, num_warps = calculate_settings(n_cols)
-    sm_count = 1
     device = infer_device()
     if device == "cuda":
-        sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
     elif device == "xpu":
-        sm_count = torch.xpu.get_device_properties(x.device).gpu_subslice_count
-    if n_cols > BLOCK_SIZE:
-        raise RuntimeError(
-            f"Feature dimension {dim} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
-        )
-    dx = torch.empty_like(x, dtype=torch.float32)
-    _dalpha = torch.empty((sm_count,), dtype=torch.float32, device=x.device)
-    _dgamma = torch.empty((sm_count, n_cols), dtype=torch.float32, device=x.device)
-    grid = (sm_count,)
-    rows_per_program = triton.cdiv(n_rows, sm_count)
-    _dyt_bwd_kernel[grid](
-        x_ptr=x,
-        x_row_stride=x.stride(0),
-        dy_ptr=dy,
-        dy_row_stride=dy.stride(0),
-        dx_ptr=dx,
-        dx_row_stride=dx.stride(0),
-        alpha_ptr=alpha,
-        dalpha_ptr=_dalpha,
-        gamma_ptr=gamma,
-        dgamma_ptr=_dgamma,
-        dgamma_row_stride=_dgamma.stride(0),
-        n_cols=n_cols,
-        n_rows=n_rows,
-        ROWS_PER_PROGRAM=rows_per_program,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-    )
-    dalpha = _dalpha.sum(dim=0, keepdim=True).to(dtype)
-    dgamma = _dgamma.sum(dim=0).to(dtype)
-    dbeta = dy.sum(dim=0).to(dtype)
-    return dx.view(*shape), dalpha, dgamma, dbeta
 class LigerDyTFunction(torch.autograd.Function):
@@ -208,18 +153,12 @@ class LigerDyTFunction(torch.autograd.Function):
     @ensure_contiguous
     def forward(ctx, x, alpha, gamma, beta):
         y = liger_dyt_fwd(x, alpha, gamma, beta)
-        ctx.save_for_backward(x, alpha, gamma)
         return y
     @staticmethod
     @ensure_contiguous
-    def backward(ctx, grad_output):
-        x, alpha, gamma = ctx.saved_tensors
-        dx, dalpha, dgamma, dbeta = liger_dyt_bwd(
-            grad_output,
-            x,
-            alpha,
-            gamma,
-        )
-        return (dx, dalpha, dgamma, dbeta)

 import triton
 import triton.language as tl
 from .utils import compare_version
 from .utils import ensure_contiguous
+from .utils import get_npu_core_count
 from .utils import infer_device
+from .utils import is_npu_available
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
     from triton.language.math import tanh
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_N": bn}, num_stages=ns, num_warps=nw)
+        for bn in [1024, 2048, 4096]
+        for ns in [1, 2]
+        for nw in [4, 8, 16]
+    ],
+    key=["N"],
+)
 @triton.jit
+def _dyt_fwd_kernel(X, Y, Alpha, Gamma, Beta, HAVE_BETA: tl.constexpr, N: tl.constexpr, BLOCK_N: tl.constexpr):
     # DyT forward: Y = tanh(Alpha * X) * Gamma, plus Beta when HAVE_BETA is set.
     # The launch grid is (column tiles, rows): program_id(0) picks a BLOCK_N-wide
     # column tile and program_id(1) picks the row this program handles.
     # Rows of X and Y are addressed with a row stride of N, so both buffers are
     # treated as densely packed (rows, N) arrays.
+    col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
     # Columns at or beyond N are masked: they load 0.0 and are never stored.
+    mask = col < N
+    row_id = tl.cast(tl.program_id(1), tl.int64)
+    X += row_id * N
+    Y += row_id * N
     # Alpha is read as a single scalar; Gamma (and Beta) are indexed per column.
     # All operands are upcast to float32 before the arithmetic.
+    alpha = tl.load(Alpha).to(tl.float32)
+    gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32)
+    x = tl.load(X + col, mask=mask, other=0.0).to(tl.float32)
+    tanh_x = tanh(alpha * x)
+    y = tanh_x * gamma
     # Beta is only dereferenced when HAVE_BETA is true.
+    if HAVE_BETA:
+        beta = tl.load(Beta + col, mask=mask, other=0.0).to(tl.float32)
+        y += beta
+    tl.store(Y + col, y, mask=mask)
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_N": bn}, num_stages=ns, num_warps=nw)
+        for bn in [1024, 2048, 4096]
+        for ns in [1, 2]
+        for nw in [4, 8, 16]
+    ],
+    key=["N"],
+    # DA is indexed by program_id(0), so different BLOCK_N configs write to
+    # different slot counts per SM. Autotune trials don't zero outputs between
+    # runs, so stale slots from a prior trial would leak into da.sum(). Reset
+    # DA between trials to isolate each config's writes.
+    reset_to_zero=["DA"],
+)
 @triton.jit
 def _dyt_bwd_kernel(
+    DY, DX, DA, DG, DB, X, Alpha, Gamma, HAVE_BETA: tl.constexpr, M, N: tl.constexpr, BLOCK_N: tl.constexpr
 ):
     # DyT backward for y = tanh(alpha * x) * gamma (+ beta).
     # The launch grid is (column tiles, worker programs). Each program along
     # axis 1 starts at row program_id(1) and steps by num_programs(1) up to M,
     # producing partial results that the host reduces afterwards:
     #   DX[row, col]        = alpha * (1 - tanh^2) * dy * gamma  (written per row)
     #   DG[start_row, col]  = sum over this program's rows of dy * tanh(alpha * x)
     #   DB[start_row, col]  = sum over this program's rows of dy (only if HAVE_BETA)
     #   DA[start_row, tile] = sum over this program's rows and columns of
     #                         x * (1 - tanh^2) * dy * gamma
     # X, DY and DX are addressed with a row stride of N.
+    col = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask = col < N
+    start_row_id = tl.cast(tl.program_id(1), tl.int64)
+    alpha = tl.load(Alpha).to(tl.float32)
     # Scalar accumulator for the alpha gradient of this program.
+    da = 0.0
+    gamma = tl.load(Gamma + col, mask=mask, other=0.0).to(tl.float32)
+    dg = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    if HAVE_BETA:
+        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
     # Masked-out columns load 0.0 for x and dy, so they add nothing to the sums.
+    for row_id in range(start_row_id, M, tl.num_programs(1)):
+        x = tl.load(X + row_id * N + col, mask=mask, other=0.0).to(tl.float32)
+        dy = tl.load(DY + row_id * N + col, mask=mask, other=0.0).to(tl.float32)
+        tanh_x = tanh(alpha * x)
+        if HAVE_BETA:
+            db += dy
+        dg += dy * tanh_x
         # tmp is dy * gamma * d/dz tanh(z) at z = alpha * x; it is shared by the
         # alpha gradient (times x) and the input gradient (times alpha).
+        tmp = (1 - tanh_x * tanh_x) * dy * gamma
+        da += tl.sum(x * tmp, 0)
+        dx = alpha * tmp
+        tl.store(DX + row_id * N + col, dx, mask=mask)
     # Partials are stored even when the row loop did not execute (they are zero
     # then), so every row of DG/DB indexed by a launched program gets written.
+    tl.store(DG + start_row_id * N + col, dg, mask=mask)
+    if HAVE_BETA:
+        tl.store(DB + start_row_id * N + col, db, mask=mask)
     # DA rows hold cdiv(N, 512) slots, matching the host-side allocation. The
     # smallest autotuned BLOCK_N is 1024, so program_id(0) always fits in a row.
+    tl.store(DA + start_row_id * tl.cdiv(N, 512) + tl.program_id(0), da)
 def liger_dyt_fwd(x, alpha, gamma, beta):
+    assert x.is_contiguous()
+    HAVE_BETA = True if beta is not None else False
+    input_shape = x.shape
+    x = x.view(-1, input_shape[-1])
+    M, N = x.shape
     y = torch.empty_like(x)
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), M)
+    _dyt_fwd_kernel[grid](
+        x,
+        y,
+        alpha,
+        gamma,
+        beta,
+        HAVE_BETA,
+        N,
     )
+    return y.view(input_shape)
+def liger_dyt_bwd(dy, x, alpha, gamma, beta):
+    assert dy.is_contiguous()
+    input_shape = x.shape
+    x = x.view(-1, input_shape[-1])
+    M, N = x.shape
+    HAVE_BETA = True if beta is not None else False
     device = infer_device()
     if device == "cuda":
+        NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count
     elif device == "xpu":
+        NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count
+    elif device == "npu":
+        NUM_SMS = get_npu_core_count()
+    da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
+    dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
+    db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None
+    dx = torch.empty_like(dy)
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), NUM_SMS)
+    _dyt_bwd_kernel[grid](dy, dx, da, dg, db, x, alpha, gamma, HAVE_BETA, M, N)
+    if HAVE_BETA:
+        db = db.sum(0).to(x.dtype)
+    dg = dg.sum(0).to(gamma.dtype)
+    da = da.sum().to(x.dtype).unsqueeze(0)
+    return dx.view(input_shape), da, dg, db
 class LigerDyTFunction(torch.autograd.Function):
     @ensure_contiguous
     def forward(ctx, x, alpha, gamma, beta):
         y = liger_dyt_fwd(x, alpha, gamma, beta)
         # beta is saved as well: liger_dyt_bwd uses it to decide whether a
         # bias gradient has to be computed.
+        ctx.save_for_backward(x, alpha, gamma, beta)
         return y
     @staticmethod
     @ensure_contiguous
+    def backward(ctx, dy):
+        x, alpha, gamma, beta = ctx.saved_tensors
+        dx, dalpha, dgamma, dbeta = liger_dyt_bwd(dy, x, alpha, gamma, beta)
         # One gradient per forward input, in the order (x, alpha, gamma, beta).
+        return dx, dalpha, dgamma, dbeta

build/torch-cuda/fused_linear_cross_entropy.py CHANGED Viewed

@@ -6,11 +6,12 @@ from .utils import amp_custom_bwd
 from .utils import amp_custom_fwd
 from .utils import element_mul_kernel
 from .utils import is_hip
 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
 # The optimal maximum block size depends on your hardware, your kernel, and your dtype
-MAX_FUSED_SIZE = 65536 // 2
 def fused_linear_cross_entropy_forward(
@@ -25,10 +26,22 @@ def fused_linear_cross_entropy_forward(
     reduction="mean",
     softcap=None,
     return_z_loss=False,
 ):
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
     device = _input.device
     # inputs have shape: BT x H
     # materialized activations will have shape: BT x V
     # the increase in memory = BT x V
@@ -44,12 +57,24 @@ def fused_linear_cross_entropy_forward(
     chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor))  # (BT + inc_factor - 1) // inc_factor
     num_chunks = triton.cdiv(BT, chunk_size)  # (BT + chunk_size - 1) // chunk_size
-    grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
     grad_input = torch.zeros_like(_input, device=device)
-    grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
-    # we use fp32 for loss accumulator
     loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
     z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
     # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
     target_mask = target != ignore_index
@@ -82,9 +107,41 @@ def fused_linear_cross_entropy_forward(
         n_rows = logits_chunk.shape[0]
         # unreduced loss
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
         z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
         # ensure _input and target are contiguous
         logits_chunk = logits_chunk.contiguous()
@@ -100,6 +157,14 @@ def fused_linear_cross_entropy_forward(
             loss_ptr=loss_1d_slice,
             z_loss_ptr=z_loss_1d_slice,
             loss_stride=loss_1d_slice.stride(-1),  # always 1
             n_cols=V,
             n_non_ignore=total_n_non_ignore,
             sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
@@ -110,35 +175,46 @@ def fused_linear_cross_entropy_forward(
             reduction=reduction,
             softcap=softcap,
             RETURN_Z_LOSS=return_z_loss,
             HAS_WEIGHT=True if ce_weight is not None else False,
             HAS_SOFTCAPPING=True if softcap is not None else False,
             BLOCK_SIZE=BLOCK_SIZE,
             num_warps=32 if not is_hip() else 16,
         )
         loss_1d[start_idx:end_idx] = loss_1d_slice
         if return_z_loss:
             z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
         grad_logits_chunk = logits_chunk  # chunk_size x V
-        grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
-        if grad_weight is not None:
-            torch.addmm(
-                input=grad_weight,
-                mat1=logits_chunk.t().to(
-                    _input_chunk.dtype
-                ),  # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
-                mat2=_input_chunk,
-                out=grad_weight,
-                alpha=1.0,
-                beta=1.0,
-            )
-        if bias is not None:
             torch.add(
                 input=grad_bias,
-                other=logits_chunk.sum(dim=0),
                 out=grad_bias,
                 alpha=1.0,
             )
@@ -148,10 +224,24 @@ def fused_linear_cross_entropy_forward(
     #     loss = loss_1d
     #     z_loss = z_loss_1d if return_z_loss else None
     else:
         loss = torch.sum(loss_1d)
         z_loss = torch.sum(z_loss_1d) if return_z_loss else None
-    return loss, z_loss, grad_input, grad_weight, grad_bias
 def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
@@ -217,6 +307,10 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         reduction="mean",
         softcap=None,
         return_z_loss: bool = False,
     ):
         """
         Fusing the last linear layer with cross-entropy loss
@@ -235,35 +329,54 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
         ignore_index: the index to ignore in the target
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         reduction: reduction to apply
         """
-        loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
-            _input=_input,
-            weight=weight,
-            target=target,
-            bias=bias,
-            ce_weight=ce_weight,
-            ignore_index=ignore_index,
-            lse_square_scale=lse_square_scale,
-            label_smoothing=label_smoothing,
-            reduction=reduction,
-            softcap=softcap,
-            return_z_loss=return_z_loss,
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
             grad_input.detach(),
             grad_weight.detach() if grad_weight is not None else None,
-            grad_bias.detach() if bias is not None else None,
         )
         ctx.return_z_loss = return_z_loss
-        return loss, z_loss
     @staticmethod
     @amp_custom_bwd
-    def backward(ctx, grad_output, grad_output2):
         if ctx.return_z_loss:
             del grad_output2  # z_loss is only for logging
         (grad_input, grad_weight, grad_bias) = ctx.saved_tensors
         grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
             grad_output, grad_input, grad_weight, grad_bias
@@ -280,4 +393,8 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
             None,
             None,
             None,
-        )

 from .utils import amp_custom_fwd
 from .utils import element_mul_kernel
 from .utils import is_hip
+from .utils import infer_device
 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
 # The optimal maximum block size depends on your hardware, your kernel, and your dtype
+MAX_FUSED_SIZE = 2048 if infer_device() == "npu" else 65536 // 2
 def fused_linear_cross_entropy_forward(
     reduction="mean",
     softcap=None,
     return_z_loss=False,
+    accum_dtype=None,
+    use_token_scaling=False,
+    return_token_accuracy=False,
+    return_predicted_tokens=False,
 ):
     assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
+    assert isinstance(return_token_accuracy, bool), (
+        f"return_token_accuracy must be True or False. Got: {return_token_accuracy}"
+    )
+    assert isinstance(return_predicted_tokens, bool), (
+        f"return_predicted_tokens must be True or False. Got: {return_predicted_tokens}"
+    )
     device = _input.device
+    input_requires_grad = _input.requires_grad
     # inputs have shape: BT x H
     # materialized activations will have shape: BT x V
     # the increase in memory = BT x V
     chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor))  # (BT + inc_factor - 1) // inc_factor
     num_chunks = triton.cdiv(BT, chunk_size)  # (BT + chunk_size - 1) // chunk_size
     grad_input = torch.zeros_like(_input, device=device)
+    # loss is accumulated in fp32; grad_weight/grad_bias accumulators use accum_dtype when given, otherwise the parameter dtype
+    if input_requires_grad:
+        if accum_dtype is None:
+            grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
+            grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
+        else:
+            grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
+            grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
+    else:
+        grad_weight = None
+        grad_bias = None
     loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
     z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
+    token_accuracy_1d = torch.zeros(BT, dtype=torch.float32, device=device) if return_token_accuracy else None
+    predicted_tokens_1d = torch.full((BT,), -1, dtype=torch.int64, device=device) if return_predicted_tokens else None
     # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
     target_mask = target != ignore_index
         n_rows = logits_chunk.shape[0]
+        # Compute predicted probabilities for token scaling if needed
+        if use_token_scaling:
+            # Compute softmax probabilities for scaling
+            # We need to compute this before the cross entropy kernel modifies logits_chunk
+            logits_for_softmax = logits_chunk.detach().clone()  # Detach to avoid gradient flow
+            if softcap is not None:
+                logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
+            # Compute softmax to get predicted probabilities
+            probs = torch.softmax(logits_for_softmax, dim=-1)
+            # Get predicted probabilities for token scaling, handling ignored targets
+            valid_target_mask = target_chunk != ignore_index
+            valid_targets = target_chunk[valid_target_mask]
+            if len(valid_targets) > 0:
+                # Gather probabilities only for valid targets
+                valid_probs = probs[valid_target_mask]
+                pred_probs_valid = torch.gather(valid_probs, -1, valid_targets.unsqueeze(-1)).squeeze(-1)
+                # Create full tensor with zeros for ignored targets
+                pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
+                pred_probs[valid_target_mask] = pred_probs_valid
+            else:
+                # All targets are ignored
+                pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
+            # Store the scaling factors
+            scaling_factors = pred_probs.detach()  # Detach to ensure no gradient flow
         # unreduced loss
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
         z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
+        token_accuracy_1d_slice = token_accuracy_1d[start_idx:end_idx] if return_token_accuracy else None
+        predicted_tokens_1d_slice = predicted_tokens_1d[start_idx:end_idx] if return_predicted_tokens else None
         # ensure _input and target are contiguous
         logits_chunk = logits_chunk.contiguous()
             loss_ptr=loss_1d_slice,
             z_loss_ptr=z_loss_1d_slice,
             loss_stride=loss_1d_slice.stride(-1),  # always 1
+            token_accuracy_ptr=token_accuracy_1d_slice,
+            token_accuracy_stride=token_accuracy_1d_slice.stride(-1)
+            if return_token_accuracy
+            else 0,  # always 1 if accuracy is enabled
+            predicted_tokens_ptr=predicted_tokens_1d_slice,
+            predicted_tokens_stride=predicted_tokens_1d_slice.stride(-1)
+            if return_predicted_tokens
+            else 0,  # always 1 if predicted tokens is enabled
             n_cols=V,
             n_non_ignore=total_n_non_ignore,
             sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
             reduction=reduction,
             softcap=softcap,
             RETURN_Z_LOSS=return_z_loss,
+            RETURN_TOKEN_ACCURACY=return_token_accuracy,
+            RETURN_PREDICTED_TOKENS=return_predicted_tokens,
             HAS_WEIGHT=True if ce_weight is not None else False,
             HAS_SOFTCAPPING=True if softcap is not None else False,
+            HAS_GRADIENTS=input_requires_grad,
             BLOCK_SIZE=BLOCK_SIZE,
             num_warps=32 if not is_hip() else 16,
         )
+        # Apply token scaling if requested
+        if use_token_scaling:
+            loss_1d_slice = loss_1d_slice * scaling_factors
+            if return_z_loss:
+                z_loss_1d_slice = z_loss_1d_slice * scaling_factors
         loss_1d[start_idx:end_idx] = loss_1d_slice
         if return_z_loss:
             z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
+        if return_token_accuracy:
+            token_accuracy_1d[start_idx:end_idx] = token_accuracy_1d_slice
+        if return_predicted_tokens:
+            predicted_tokens_1d[start_idx:end_idx] = predicted_tokens_1d_slice
         grad_logits_chunk = logits_chunk  # chunk_size x V
+        # Apply token scaling to gradients if requested
+        if use_token_scaling:
+            # Expand scaling factors to match gradient dimensions
+            scaling_factors_expanded = scaling_factors.unsqueeze(-1)  # chunk_size x 1
+            grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
+        if input_requires_grad:
+            grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
+        if grad_weight is not None and input_requires_grad:
+            grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
+        if bias is not None and input_requires_grad:
             torch.add(
                 input=grad_bias,
+                other=grad_logits_chunk.sum(dim=0),
                 out=grad_bias,
                 alpha=1.0,
             )
     #     loss = loss_1d
     #     z_loss = z_loss_1d if return_z_loss else None
+    if reduction == "none":
+        # Return per-token losses
+        loss = loss_1d
+        z_loss = z_loss_1d if return_z_loss else None
+        token_accuracy = token_accuracy_1d if return_token_accuracy else None
     else:
         loss = torch.sum(loss_1d)
         z_loss = torch.sum(z_loss_1d) if return_z_loss else None
+        # For accuracy, we compute the mean across all non-ignored tokens
+        token_accuracy = torch.sum(token_accuracy_1d) / total_n_non_ignore if return_token_accuracy else None
+    predicted_tokens = predicted_tokens_1d if return_predicted_tokens else None
+    # Cast back to original dtype
+    grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
+    grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
+    return loss, z_loss, token_accuracy, predicted_tokens, grad_input, grad_weight, grad_bias
 def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
         reduction="mean",
         softcap=None,
         return_z_loss: bool = False,
+        accum_dtype=None,
+        use_token_scaling: bool = False,
+        return_token_accuracy: bool = False,
+        return_predicted_tokens: bool = False,
     ):
         """
         Fusing the last linear layer with cross-entropy loss
         ignore_index: the index to ignore in the target
         label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
         reduction: reduction to apply
+        accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
+            Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
+        use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
+            When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
+            Default: False.
+        return_token_accuracy (bool): When `return_token_accuracy` is `True`, computes and returns per-token accuracy without materializing logits. Default: `False`
+        return_predicted_tokens (bool): When `return_predicted_tokens` is `True`, returns per-token predicted class indices (argmax) without materializing logits. Default: `False`
         """
+        loss, z_loss, token_accuracy, predicted_tokens, grad_input, grad_weight, grad_bias = (
+            fused_linear_cross_entropy_forward(
+                _input=_input,
+                weight=weight,
+                target=target,
+                bias=bias,
+                ce_weight=ce_weight,
+                ignore_index=ignore_index,
+                lse_square_scale=lse_square_scale,
+                label_smoothing=label_smoothing,
+                reduction=reduction,
+                softcap=softcap,
+                return_z_loss=return_z_loss,
+                accum_dtype=accum_dtype,
+                use_token_scaling=use_token_scaling,
+                return_token_accuracy=return_token_accuracy,
+                return_predicted_tokens=return_predicted_tokens,
+            )
         )
         # downcast to dtype and store for backward
         ctx.save_for_backward(
             grad_input.detach(),
             grad_weight.detach() if grad_weight is not None else None,
+            grad_bias.detach() if grad_bias is not None else None,
         )
         ctx.return_z_loss = return_z_loss
+        ctx.return_token_accuracy = return_token_accuracy
+        ctx.return_predicted_tokens = return_predicted_tokens
+        return loss, z_loss, token_accuracy, predicted_tokens
     @staticmethod
     @amp_custom_bwd
+    def backward(ctx, grad_output, grad_output2, grad_output3, grad_output4):
         if ctx.return_z_loss:
             del grad_output2  # z_loss is only for logging
+        if ctx.return_token_accuracy:
+            del grad_output3  # token_accuracy is only for metrics
+        if ctx.return_predicted_tokens:
+            del grad_output4  # predicted_tokens is only for metrics
         (grad_input, grad_weight, grad_bias) = ctx.saved_tensors
         grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
             grad_output, grad_input, grad_weight, grad_bias
             None,
             None,
             None,
+            None,
+            None,  # use_token_scaling
+            None,  # return_token_accuracy
+            None,  # return_predicted_tokens
+        )

build/torch-cuda/geglu.py CHANGED Viewed

@@ -7,8 +7,9 @@ import triton.language as tl
 from .utils import calculate_settings
 from .utils import compare_version
 from .utils import ensure_contiguous
-if compare_version("triton", operator.ge, "3.0.0"):
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
@@ -40,7 +41,7 @@ def _geglu_tanh_forward_kernel(a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE
     tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed)
     tanh_result = tanh(tanh_arg)
     geglu_a = 0.5 * a_row * (1 + tanh_result)
-    c_row = geglu_a * b_row
     tl.store(c + col_offsets, c_row, mask=mask)
@@ -66,8 +67,9 @@ def _geglu_tanh_backward_kernel(dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SI
     tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed)
     tanh_result = tanh(tanh_arg)
     geglu_a = 0.5 * a_row * (1 + tanh_result)
-    db_row = dc_row * geglu_a
     # Gradient w.r.t. a can be computed with:
     # b * (0.5 * (1 + tanh(z)) + 0.5 * a * (1 - tanh(z)^2) * (sqrt(2/pi) * (1 + 3 * 0.044715 * a^2)))
@@ -78,7 +80,7 @@ def _geglu_tanh_backward_kernel(dc, a, b, stride, n_cols: tl.constexpr, BLOCK_SI
     da_row = dc_row * b_row * (term1 + term2)
     tl.store(a + col_offsets, da_row, mask=mask)
-    tl.store(b + col_offsets, db_row, mask=mask)
 def geglu_forward(a, b):
@@ -138,4 +140,4 @@ class LigerGELUMulFunction(torch.autograd.Function):
     def backward(ctx, dc):
         a, b = ctx.saved_tensors
         a, b = geglu_backward(a, b, dc)
-        return a, b

 from .utils import calculate_settings
 from .utils import compare_version
 from .utils import ensure_contiguous
+from .utils import is_npu_available
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import tanh
     tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed)
     tanh_result = tanh(tanh_arg)
     geglu_a = 0.5 * a_row * (1 + tanh_result)
+    c_row = geglu_a.cast(b_row.dtype) * b_row
     tl.store(c + col_offsets, c_row, mask=mask)
     tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed)
     tanh_result = tanh(tanh_arg)
     geglu_a = 0.5 * a_row * (1 + tanh_result)
+    geglu_a = geglu_a.to(dc_row.dtype).to(tl.float32)
+    db_row = dc_row.cast(tl.float32) * geglu_a
     # Gradient w.r.t. a can be computed with:
     # b * (0.5 * (1 + tanh(z)) + 0.5 * a * (1 - tanh(z)^2) * (sqrt(2/pi) * (1 + 3 * 0.044715 * a^2)))
     da_row = dc_row * b_row * (term1 + term2)
     tl.store(a + col_offsets, da_row, mask=mask)
+    tl.store(b + col_offsets, db_row.to(dc_row.dtype), mask=mask)
 def geglu_forward(a, b):
     def backward(ctx, dc):
         a, b = ctx.saved_tensors
         a, b = geglu_backward(a, b, dc)
+        return a, b

build/torch-cuda/group_norm.py CHANGED Viewed

@@ -6,8 +6,10 @@ import triton.language as tl
 from .utils import compare_version
 from .utils import ensure_contiguous
-if compare_version("triton", operator.ge, "3.0.0"):
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
@@ -17,7 +19,10 @@ if compare_version("triton", operator.ge, "3.0.0"):
 else:
     from triton.language.math import rsqrt
-MAX_FUSED_SIZE = 65536
 @triton.jit
@@ -72,20 +77,21 @@ def _group_norm_forward_kernel(
     # 1/std
     rstd = rsqrt(variance + eps)
-    # Normalize
     hidden_size_per_channel = hidden_size // channels_per_group
-    for channel_idx in tl.range(group_idx * channels_per_group, (group_idx + 1) * channels_per_group):
-        W = tl.load(W_ptr + channel_idx)
-        B = tl.load(B_ptr + channel_idx)
-        for i in range(0, hidden_size_per_channel, BLOCK_SIZE):
-            hidden_size_offsets = i + block_range
-            mask = hidden_size_offsets < hidden_size_per_channel
-            X = tl.load(X_ptr + hidden_size_offsets, mask=mask, other=m)
-            Y = (X - m) * rstd * W + B
-            tl.store(Y_ptr + hidden_size_offsets, Y, mask=mask)
-        X_ptr += hidden_size_per_channel
-        Y_ptr += hidden_size_per_channel
     tl.store(Mean_ptr + batch_idx * Mean_row_stride + group_idx * Mean_col_stride, m)
     tl.store(RSTD_ptr + batch_idx * RSTD_row_stride + group_idx * RSTD_col_stride, rstd)
@@ -302,4 +308,4 @@ class LigerGroupNormFunction(torch.autograd.Function):
     def backward(ctx, dY):
         X, W, B, Mean, RSTD = ctx.saved_tensors
         DX, DW, DB = group_norm_backward(dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups)
-        return DX, DW, DB, None, None, None

 from .utils import compare_version
 from .utils import ensure_contiguous
+from .utils import infer_device
+from .utils import is_npu_available
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
 else:
     from triton.language.math import rsqrt
+if infer_device() == "npu":
+    MAX_FUSED_SIZE = 16384  # 8192
+else:
+    MAX_FUSED_SIZE = 65536
 @triton.jit
     # 1/std
     rstd = rsqrt(variance + eps)
+    # Normalize — flat loop over full hidden_size (not per-channel)
+    # This avoids the nested channel × per_channel_hidden loop where
+    # BLOCK_SIZE >> hidden_size_per_channel causes massive padding waste.
     hidden_size_per_channel = hidden_size // channels_per_group
+    for i in tl.range(0, hidden_size, BLOCK_SIZE):
+        hidden_size_offsets = i + block_range
+        mask = hidden_size_offsets < hidden_size
+        X = tl.load(X_ptr + hidden_size_offsets, mask=mask, other=m)
+        # Determine which channel each element belongs to, then load W/B
+        local_channel = hidden_size_offsets // hidden_size_per_channel
+        global_channel = group_idx * channels_per_group + local_channel
+        W = tl.load(W_ptr + global_channel, mask=mask)
+        B = tl.load(B_ptr + global_channel, mask=mask)
+        Y = (X - m) * rstd * W + B
+        tl.store(Y_ptr + hidden_size_offsets, Y, mask=mask)
     tl.store(Mean_ptr + batch_idx * Mean_row_stride + group_idx * Mean_col_stride, m)
     tl.store(RSTD_ptr + batch_idx * RSTD_row_stride + group_idx * RSTD_col_stride, rstd)
     def backward(ctx, dY):
         X, W, B, Mean, RSTD = ctx.saved_tensors
         DX, DW, DB = group_norm_backward(dY, X, W, B, Mean, RSTD, ctx.num_channels, ctx.num_groups)
+        return DX, DW, DB, None, None, None

build/torch-cuda/jsd.py CHANGED Viewed

@@ -198,4 +198,4 @@ class LigerJSDFunction(torch.autograd.Function):
             None,
             None,
             None,
-        )

             None,
             None,
             None,
+        )

build/torch-cuda/kl_div.py CHANGED Viewed

@@ -21,7 +21,12 @@ def get_num_warps(BLOCK_SIZE):
     return num_warps
-MAX_FUSED_SIZE = 65536 // 4  # 65536 // 4 or 8 works the best
 REDUCTION_LITERAL = Literal["none", "sum", "mean", "batchmean"]
@@ -116,11 +121,7 @@ def _kldiv_kernel_backward(
 def kldiv_forward_triton(y_pred, y_true, log_target, reduction, eps):  # [BT, V]
     BT, V = y_pred.shape
-    BLOCK_SIZE = (
-        min(8192, triton.next_power_of_2(V))
-        if infer_device() == "xpu"
-        else min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
-    )
     num_warps = 32 if infer_device() == "xpu" else get_num_warps(BLOCK_SIZE)
     grid = (BT,)
@@ -159,11 +160,7 @@ def kldiv_forward_triton(y_pred, y_true, log_target, reduction, eps):  # [BT, V]
 def kldiv_backward_triton(target, grad_output, new_grads, log_target):
     BT, V = target.shape
-    BLOCK_SIZE = (
-        min(8192, triton.next_power_of_2(V))
-        if infer_device() == "xpu"
-        else min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
-    )
     num_warps = 32 if infer_device() == "xpu" else get_num_warps(BLOCK_SIZE)
     grid = (BT,)
@@ -259,4 +256,4 @@ class LigerKLDivLossFunction(torch.autograd.Function):
             None,
             None,
             None,
-        )

     return num_warps
+# Upper bound for BLOCK_SIZE in the KL-div launchers in this module.
+# XPU and NPU share the smaller cap; every other backend uses 65536 // 4
+# (65536 // 4 or 8 works the best).
+MAX_FUSED_SIZE = 8192 if infer_device() in ("xpu", "npu") else 65536 // 4
 REDUCTION_LITERAL = Literal["none", "sum", "mean", "batchmean"]
 def kldiv_forward_triton(y_pred, y_true, log_target, reduction, eps):  # [BT, V]
     BT, V = y_pred.shape
+    BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
     num_warps = 32 if infer_device() == "xpu" else get_num_warps(BLOCK_SIZE)
     grid = (BT,)
 def kldiv_backward_triton(target, grad_output, new_grads, log_target):
     BT, V = target.shape
+    BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
     num_warps = 32 if infer_device() == "xpu" else get_num_warps(BLOCK_SIZE)
     grid = (BT,)
             None,
             None,
             None,
+        )

build/torch-cuda/layer_norm.py CHANGED Viewed

@@ -8,8 +8,11 @@ import triton.language as tl
 from .utils import calculate_settings
 from .utils import compare_version
 from .utils import ensure_contiguous
-if compare_version("triton", operator.ge, "3.0.0"):
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
@@ -43,111 +46,151 @@ def _layer_norm_forward_kernel(
     https://arxiv.org/abs/1607.06450
     https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
     """
-    row_idx = tl.program_id(0)
     col_offsets = tl.arange(0, BLOCK_SIZE)
     mask = col_offsets < n_cols
-    Y_ptr += row_idx * Y_row_stride
-    X_ptr += row_idx * X_row_stride
-    Mean_ptr += row_idx * Mean_row_stride
-    RSTD_ptr += row_idx * RSTD_row_stride
-    X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
-    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)
-    B_row = tl.load(B_ptr + col_offsets, mask=mask, other=0)
-    mean = tl.sum(X_row, axis=0) / n_cols
-    Xmm = tl.where(mask, X_row - mean, 0)
-    var = tl.sum(Xmm * Xmm, axis=0) / n_cols
     rstd = rsqrt(var + eps)
-    tl.store(Mean_ptr, mean)
-    tl.store(RSTD_ptr, rstd)
-    Y_row = Xmm * rstd * W_row + B_row
-    tl.store(Y_ptr + col_offsets, Y_row, mask=mask)
 @triton.jit
 def _layer_norm_backward_kernel(
     X_ptr,  # pointer to input, shape (n_rows, n_cols)
     W_ptr,  # pointer to weights, shape (n_cols,)
     Mean_ptr,  # pointer to mean, shape (n_rows,)
     RSTD_ptr,  # pointer to rstd, shape (n_rows,)
     DX_ptr,  # pointer to input grad, shape (n_rows, n_cols)
-    DW_ptr,  # pointer to weights grad, shape (n_cols,)
-    DB_ptr,  # pointer to bias grad, shape (n_cols,)
-    DY_ptr,  # pointer to output grad, shape (n_rows, n_cols)
-    stride_x,  # stride of each row in input
     stride_dx,  # stride of each row in input grad
     stride_dw,  # stride of each row in weights grad
     stride_db,  # stride of each row in bias grad
     stride_dy,  # stride of each row in output grad
     n_rows,
     n_cols,
     rows_per_program: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
-    dtype: tl.constexpr,
 ):
     """
     References:
     https://arxiv.org/abs/1607.06450
     https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
-    https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
-    https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/triton/layer_norm.py
     """
-    row_block_id = tl.program_id(0)
     row_start = row_block_id * rows_per_program
     row_end = min((row_block_id + 1) * rows_per_program, n_rows)
     cols = tl.arange(0, BLOCK_SIZE)
     mask = cols < n_cols
-    dw_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
     db_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
-    X_ptr += row_start * stride_x
-    Mean_ptr += row_start
-    RSTD_ptr += row_start
-    DX_ptr += row_start * stride_dx
-    DY_ptr += row_start * stride_dy
-    for _ in range(row_start, row_end):
-        x = tl.load(X_ptr + cols, mask=mask, other=0.0)
-        w = tl.load(W_ptr + cols, mask=mask, other=0.0)
-        dy = tl.load(DY_ptr + cols, mask=mask, other=0.0)
-        mean = tl.load(Mean_ptr)
-        rstd = tl.load(RSTD_ptr)
-        x_hat = (x - mean) * rstd
-        wdy = w * dy
         c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
         c2 = tl.sum(wdy, axis=0) / n_cols
-        dx = (wdy - (x_hat * c1 + c2)) * rstd
-        tl.store(DX_ptr + cols, dx.to(dtype), mask=mask)
-        dw_row += dy * x_hat
-        db_row += dy
-        X_ptr += stride_x
-        Mean_ptr += 1
-        RSTD_ptr += 1
-        DX_ptr += stride_dx
-        DY_ptr += stride_dy
-    tl.store(DW_ptr + row_block_id * stride_dw + cols, dw_row.to(dtype), mask=mask)
-    tl.store(DB_ptr + row_block_id * stride_db + cols, db_row.to(dtype), mask=mask)
 def layer_norm_forward(X, W, B, eps):
     shape = X.shape
     dim = shape[-1]
     X = X.view(-1, dim)
     n_rows, n_cols = X.shape
     BLOCK_SIZE, num_warps = calculate_settings(n_cols)
     Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
     Mean = torch.empty(n_rows, dtype=X.dtype, device=X.device)
     RSTD = torch.empty(n_rows, dtype=X.dtype, device=X.device)
     if X.shape[1] != W.shape[0]:
         raise ValueError(
             f"Incompatible dimensions: input feature size (X.shape[1]={X.shape[1]}) "
@@ -157,9 +200,11 @@ def layer_norm_forward(X, W, B, eps):
     # XPU-specific optimization
     kernel_args = {}
     if X.device.type == "xpu":
-        kernel_args["grf_mode"] = "large"
-    _layer_norm_forward_kernel[(n_rows,)](
         Y,
         Y.stride(0),
         X,
@@ -176,12 +221,25 @@ def layer_norm_forward(X, W, B, eps):
         eps,
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
-        **kernel_args,  # XPU-specific optimization
     )
     return Y.view(*shape), X, Mean, RSTD, BLOCK_SIZE, num_warps
 def layer_norm_backward(dY, X, W, B, Mean, RSTD):
     shape = dY.shape
     dim = shape[-1]
     dY = dY.view(-1, dim)
@@ -192,60 +250,57 @@ def layer_norm_backward(dY, X, W, B, Mean, RSTD):
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
-    DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
-    _DW = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)
-    _DB = torch.empty((sm_count, n_cols), dtype=W.dtype, device=W.device)
     BLOCK_SIZE, num_warps = calculate_settings(n_cols)
     if n_cols > BLOCK_SIZE:
-        raise RuntimeError(
-            f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}. Consider using a smaller feature dimension."
-        )
     rows_per_program = math.ceil(n_rows / sm_count)
     grid = (sm_count,)
-    triton_dtype = (
-        tl.float32
-        if X.dtype == torch.float32
-        else tl.bfloat16
-        if X.dtype == torch.bfloat16
-        else tl.float16
-        if X.dtype == torch.float16
-        else tl.float32  # fallback to float32 for other types
-    )
     # XPU-specific optimization
-    kernel_args = {}
     if X.device.type == "xpu":
-        kernel_args.update({"grf_mode": "large", "num_warps": 32, "num_stages": 4})
     _layer_norm_backward_kernel[grid](
         X,
         W,
         Mean,
         RSTD,
         DX,
-        _DW,
-        _DB,
-        dY,
-        X.stride(0),
         DX.stride(0),
         _DW.stride(0),
         _DB.stride(0),
         dY.stride(0),
         n_rows,
         n_cols,
-        rows_per_program,
         BLOCK_SIZE=BLOCK_SIZE,
-        dtype=triton_dtype,
-        **kernel_args,  # XPU-specific optimization
     )
     DW = _DW.sum(dim=0).to(W.dtype)
-    DB = _DB.sum(dim=0).to(W.dtype)
-    DX = DX.view(*shape)
     return DX, DW, DB
@@ -262,4 +317,4 @@ class LigerLayerNormFunction(torch.autograd.Function):
     def backward(ctx, dY):
         X, W, B, Mean, RSTD = ctx.saved_tensors
         DX, DW, DB = layer_norm_backward(dY, X, W, B, Mean, RSTD)
-        return DX, DW, DB, None

 from .utils import calculate_settings
 from .utils import compare_version
 from .utils import ensure_contiguous
+from .utils import get_npu_core_count
+from .utils import set_large_grf_mode
+from .utils import is_npu_available
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
     https://arxiv.org/abs/1607.06450
     https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
     """
+    row_idx = tl.program_id(0).to(tl.int64)
     col_offsets = tl.arange(0, BLOCK_SIZE)
     mask = col_offsets < n_cols
+    # Pre-load weights and bias in fp32 to avoid repeated conversions
+    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
+    B_row = tl.load(B_ptr + col_offsets, mask=mask, other=0.0)
+    W_f32 = W_row.to(tl.float32)
+    B_f32 = B_row.to(tl.float32)
+    # Calculate pointers for this row
+    row_X_ptr = X_ptr + row_idx * X_row_stride
+    row_Y_ptr = Y_ptr + row_idx * Y_row_stride
+    row_Mean_ptr = Mean_ptr + row_idx * Mean_row_stride
+    row_RSTD_ptr = RSTD_ptr + row_idx * RSTD_row_stride
+    # Load input data and convert to fp32 for numerical stability
+    X_row = tl.load(row_X_ptr + col_offsets, mask=mask, other=0.0)
+    X_f32 = X_row.to(tl.float32)
+    # Compute statistics in fp32 for numerical stability
+    mean = tl.sum(X_f32, axis=0) / n_cols
+    X_centered = X_f32 - mean
+    # Apply mask to variance calculation to exclude contributions from masked elements
+    X_centered_masked = tl.where(mask, X_centered, 0.0)
+    var = tl.sum(X_centered_masked * X_centered_masked, axis=0) / n_cols
     rstd = rsqrt(var + eps)
+    # Store statistics (convert back to original dtype only once)
+    tl.store(row_Mean_ptr, mean.to(X_row.dtype))
+    tl.store(row_RSTD_ptr, rstd.to(X_row.dtype))
+    # Fused normalization and affine transformation
+    # Y = (X - mean) * rstd * W + B = X_centered * rstd * W + B
+    Y_f32 = X_centered * rstd * W_f32 + B_f32
+    # Store output (single conversion back to original dtype)
+    tl.store(row_Y_ptr + col_offsets, Y_f32.to(X_row.dtype), mask=mask)
 @triton.jit
 def _layer_norm_backward_kernel(
     X_ptr,  # pointer to input, shape (n_rows, n_cols)
+    stride_x,  # stride of each row in input
     W_ptr,  # pointer to weights, shape (n_cols,)
     Mean_ptr,  # pointer to mean, shape (n_rows,)
+    stride_mean,  # stride of each row in mean
     RSTD_ptr,  # pointer to rstd, shape (n_rows,)
+    stride_rstd,  # stride of each row in rstd
     DX_ptr,  # pointer to input grad, shape (n_rows, n_cols)
     stride_dx,  # stride of each row in input grad
+    DW_ptr,  # pointer to partial weights grad; each program writes n_cols values at its own row
     stride_dw,  # stride of each row in weights grad
+    DB_ptr,  # pointer to partial bias grad; each program writes n_cols values at its own row
     stride_db,  # stride of each row in bias grad
+    DY_ptr,  # pointer to output grad, shape (n_rows, n_cols)
     stride_dy,  # stride of each row in output grad
     n_rows,
     n_cols,
     rows_per_program: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
 ):
     """
+    Layer-norm backward pass over a contiguous slice of rows.
+    Program ``p`` handles rows ``[p * rows_per_program, min((p + 1) * rows_per_program, n_rows))``.
+    For each row it computes and stores the input gradient, and it accumulates the
+    weight/bias gradient contributions of its rows in fp32. The accumulated partial
+    sums are written to row ``p`` of ``DW_ptr`` / ``DB_ptr``; combining the rows across
+    programs is left to the caller.
     References:
     https://arxiv.org/abs/1607.06450
     https://github.com/karpathy/llm.c/blob/master/doc/layernorm/layernorm.md
     """
+    # Promote the program id to int64 before it is multiplied into row offsets.
+    row_block_id = tl.program_id(0).to(tl.int64)
     row_start = row_block_id * rows_per_program
     row_end = min((row_block_id + 1) * rows_per_program, n_rows)
     cols = tl.arange(0, BLOCK_SIZE)
     mask = cols < n_cols
+    # fp32 accumulators for this program's share of the weight and bias gradients.
+    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
     db_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    # Pre-load weights once (same optimization as forward pass)
+    # Masked-out lanes read 0.0, so they add nothing to the reductions below.
+    w = tl.load(W_ptr + cols, mask=mask, other=0.0)
+    w_f32 = w.to(tl.float32)
+    for row_idx in range(row_start, row_end):
+        # Calculate pointers for this specific row
+        row_X_ptr = X_ptr + row_idx * stride_x
+        row_DX_ptr = DX_ptr + row_idx * stride_dx
+        row_DY_ptr = DY_ptr + row_idx * stride_dy
+        row_Mean_ptr = Mean_ptr + row_idx * stride_mean
+        row_RSTD_ptr = RSTD_ptr + row_idx * stride_rstd
+        # Load data for this row
+        x = tl.load(row_X_ptr + cols, mask=mask, other=0.0)
+        dy = tl.load(row_DY_ptr + cols, mask=mask, other=0.0)
+        mean = tl.load(row_Mean_ptr)
+        rstd = tl.load(row_RSTD_ptr)
+        # Convert to fp32 for numerical stability
+        x_f32 = x.to(tl.float32)
+        dy_f32 = dy.to(tl.float32)
+        mean_f32 = mean.to(tl.float32)
+        rstd_f32 = rstd.to(tl.float32)
+        # Compute backward pass for this row
+        x_hat = (x_f32 - mean_f32) * rstd_f32
+        wdy = w_f32 * dy_f32
+        # c1 and c2 are the per-row means (over n_cols) of x_hat * wdy and of wdy.
         c1 = tl.sum(x_hat * wdy, axis=0) / n_cols
         c2 = tl.sum(wdy, axis=0) / n_cols
+        dx = (wdy - (x_hat * c1 + c2)) * rstd_f32
+        # Store input gradient
+        tl.store(row_DX_ptr + cols, dx, mask=mask)
+        # Accumulate weight and bias gradients for this thread block's assigned rows
+        dw = dy_f32 * x_hat
+        db = dy_f32
+        dW_row += dw
+        db_row += db
+    # Write this program's partial sums to its own row of the DW / DB buffers.
+    tl.store(DW_ptr + row_block_id * stride_dw + cols, dW_row, mask=mask)
+    tl.store(DB_ptr + row_block_id * stride_db + cols, db_row, mask=mask)
 def layer_norm_forward(X, W, B, eps):
+    """
+    Args:
+        X: Input tensor of shape (..., hidden_size)
+        W: Weight tensor of shape (hidden_size,)
+        B: Bias tensor of shape (hidden_size,)
+        eps: Small constant for numerical stability
+    Returns:
+        Tuple of (output, input, mean, rstd, block_size, num_warps)
+    """
     shape = X.shape
     dim = shape[-1]
     X = X.view(-1, dim)
     n_rows, n_cols = X.shape
+    # Calculate optimal block size and warp configuration
     BLOCK_SIZE, num_warps = calculate_settings(n_cols)
+    # Allocate output tensors
     Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
     Mean = torch.empty(n_rows, dtype=X.dtype, device=X.device)
     RSTD = torch.empty(n_rows, dtype=X.dtype, device=X.device)
+    # Validate input dimensions
     if X.shape[1] != W.shape[0]:
         raise ValueError(
             f"Incompatible dimensions: input feature size (X.shape[1]={X.shape[1]}) "
     # XPU-specific optimization
     kernel_args = {}
     if X.device.type == "xpu":
+        set_large_grf_mode(kernel_args)
+    # Launch kernel with one thread block per row for optimal performance
+    grid = (n_rows,)
+    _layer_norm_forward_kernel[grid](
         Y,
         Y.stride(0),
         X,
         eps,
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
+        **kernel_args,
     )
     return Y.view(*shape), X, Mean, RSTD, BLOCK_SIZE, num_warps
 def layer_norm_backward(dY, X, W, B, Mean, RSTD):
+    """
+    Args:
+        dY: Gradient of output
+        X: Input tensor
+        W: Weight tensor
+        B: Bias tensor
+        Mean: Pre-computed mean
+        RSTD: Pre-computed reciprocal standard deviation
+    Returns:
+        Tuple of (input_grad, weight_grad, bias_grad)
+    """
     shape = dY.shape
     dim = shape[-1]
     dY = dY.view(-1, dim)
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    elif X.device.type == "npu":
+        sm_count = get_npu_core_count()
+    # fp32 for numerical stability especially.
+    _DW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
+    _DB = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
+    # Calculate optimal block size and warp configuration
     BLOCK_SIZE, num_warps = calculate_settings(n_cols)
     if n_cols > BLOCK_SIZE:
+        raise RuntimeError(f"Feature dimension {n_cols} exceeds maximum supported size of {BLOCK_SIZE}.")
     rows_per_program = math.ceil(n_rows / sm_count)
     grid = (sm_count,)
+    # Allocate gradient tensors
+    DX = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
+    kernel_args = {"num_warps": num_warps}
     # XPU-specific optimization
     if X.device.type == "xpu":
+        kernel_args.update({"num_warps": 32, "num_stages": 4})
+        set_large_grf_mode(kernel_args)
+    # Launch kernel with one thread block per row for optimal performance
     _layer_norm_backward_kernel[grid](
         X,
+        X.stride(0),
         W,
         Mean,
+        Mean.stride(0),
         RSTD,
+        RSTD.stride(0),
         DX,
         DX.stride(0),
+        _DW,
         _DW.stride(0),
+        _DB,
         _DB.stride(0),
+        dY,
         dY.stride(0),
         n_rows,
         n_cols,
+        rows_per_program=rows_per_program,
         BLOCK_SIZE=BLOCK_SIZE,
+        **kernel_args,
     )
+    DX = DX.view(*shape)
     DW = _DW.sum(dim=0).to(W.dtype)
+    DB = _DB.sum(dim=0).to(B.dtype)
     return DX, DW, DB
     def backward(ctx, dY):
+        # Recover the tensors stored on ctx and delegate the gradient math to layer_norm_backward.
         X, W, B, Mean, RSTD = ctx.saved_tensors
         DX, DW, DB = layer_norm_backward(dY, X, W, B, Mean, RSTD)
+        # Trailing None: presumably the gradient slot for the non-tensor eps input — TODO confirm against forward().
+        return DX, DW, DB, None

build/torch-cuda/layers.py CHANGED Viewed

@@ -1,39 +1,463 @@
 import torch
 from .rms_norm import LigerRMSNormFunction
-class LigerRMSNorm(torch.nn.Module):
-    """
-    RMSNorm module that uses the optimized LigerRMSNormFunction.
-    Args:
-        hidden_size (int): The size of the hidden dimension.
-        eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6.
-        offset (float, optional): Offset value to shift the weight tensor. Defaults to 0.0.
-        casting_mode (str, optional): The casting mode to use. Defaults to "llama".
-        in_place (bool, optional): Whether to modify dY in-place to store dX during backward. Defaults to True.
-    """
-    weight: torch.Tensor
-    variance_epsilon: float
-    def forward(self, hidden_states):
-        """
-        Apply RMS normalization to the input tensor.
-        Args:
-            hidden_states (torch.Tensor): Input tensor of shape (B, T, H) or (BxT, H)
-        Returns:
-            torch.Tensor: Normalized tensor of the same shape as input
-        """
         return LigerRMSNormFunction.apply(
-            hidden_states,
-            self.weight,
             self.variance_epsilon,
-            0,
-            "llama",
-            True
         )
-__all__ = ["LigerRMSNorm"]

+import inspect
+from dataclasses import dataclass
+from typing import Optional, Tuple
 import torch
+import torch.nn as nn
+from .cross_entropy import LigerCrossEntropyFunction
+from .dyt import LigerDyTFunction
+from .fused_linear_cross_entropy import LigerFusedLinearCrossEntropyFunction
+from .geglu import LigerGELUMulFunction
+from .group_norm import LigerGroupNormFunction
+from .jsd import LigerJSDFunction
+from .kl_div import LigerKLDivLossFunction
+from .layer_norm import LigerLayerNormFunction
+from .qwen2vl_mrope import LigerQwen2VLMRopeFunction
 from .rms_norm import LigerRMSNormFunction
+from .rope import LigerRopeFunction
+from .swiglu import LigerSiLUMulFunction
+from .tvd import LigerTVDLossFunction
+class LigerRMSNorm(nn.Module):
+    """RMSNorm layer that dispatches to ``LigerRMSNormFunction``.
+    The constructor arguments are stored on the module and forwarded unchanged on every
+    call. With ``elementwise_affine=False`` the ``weight`` parameter is registered as
+    ``None``; otherwise it is a learnable vector filled with ones or zeros per ``init_fn``.
+    """
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+        offset: float = 0.0,
+        casting_mode: str = "llama",
+        init_fn: str = "ones",
+        in_place: bool = True,
+        row_mode: Optional[bool] = None,
+        elementwise_affine: bool = True,
+    ):
+        super().__init__()
+        assert init_fn in ("ones", "zeros"), f"init_fn must be 'ones' or 'zeros', got {init_fn}"
+        self.hidden_size = hidden_size
+        self.variance_epsilon = eps
+        self.offset = offset
+        self.casting_mode = casting_mode
+        self.in_place = in_place
+        self.row_mode = row_mode
+        self.elementwise_affine = elementwise_affine
+        if not elementwise_affine:
+            # Keep the attribute present (as None) so forward() can always pass self.weight.
+            self.register_parameter("weight", None)
+            return
+        fill = torch.ones if init_fn == "ones" else torch.zeros
+        self.weight = nn.Parameter(fill(hidden_size))
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the fused RMSNorm function to ``hidden_states`` with this module's settings."""
+        norm_args = (
+            hidden_states,
+            self.weight,
+            self.variance_epsilon,
+            self.offset,
+            self.casting_mode,
+            self.in_place,
+            self.row_mode,
+        )
+        return LigerRMSNormFunction.apply(*norm_args)
+    def extra_repr(self) -> str:
+        """Return the configuration summary shown inside ``repr(module)``."""
+        head = f"{self.hidden_size}, eps={self.variance_epsilon}, offset={self.offset}"
+        tail = f"in_place={self.in_place}, row_mode={self.row_mode}"
+        return f"{head}, {tail}"
+class LigerLayerNorm(nn.Module):
+    """LayerNorm layer that dispatches to ``LigerLayerNormFunction``.
+    A ``bias`` parameter is always created: zeros when ``bias=False`` and a
+    ``torch.randn`` draw when ``bias=True``.
+    """
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+        bias: bool = False,
+        init_fn: str = "ones",
+    ):
+        super().__init__()
+        assert init_fn in ("ones", "zeros"), f"init_fn must be 'ones' or 'zeros', got {init_fn}"
+        self.hidden_size = hidden_size
+        self.variance_epsilon = eps
+        if init_fn == "ones":
+            weight_init = torch.ones(hidden_size)
+        else:
+            weight_init = torch.zeros(hidden_size)
+        self.weight = nn.Parameter(weight_init)
+        # NOTE(review): bias=True starts the bias from a standard-normal sample instead of zeros — confirm this is intended.
+        if bias:
+            bias_init = torch.randn(hidden_size)
+        else:
+            bias_init = torch.zeros(hidden_size)
+        self.bias = nn.Parameter(bias_init)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply the fused LayerNorm function to ``hidden_states``."""
+        norm_args = (hidden_states, self.weight, self.bias, self.variance_epsilon)
+        return LigerLayerNormFunction.apply(*norm_args)
+    def extra_repr(self) -> str:
+        """Return the configuration summary shown inside ``repr(module)``."""
+        return f"{self.hidden_size}, eps={self.variance_epsilon}"
+class LigerGroupNorm(nn.Module):
+    """GroupNorm layer that dispatches to ``LigerGroupNormFunction``.
+    ``num_channels`` must be divisible by ``num_groups``. A ``bias`` parameter is always
+    created: zeros when ``bias=False`` and a ``torch.randn`` draw when ``bias=True``.
+    """
+    def __init__(
+        self,
+        num_channels: int,
+        num_groups: int,
+        eps: float = 1e-6,
+        bias: bool = False,
+        init_fn: str = "ones",
+    ):
+        super().__init__()
+        assert init_fn in ("ones", "zeros"), f"init_fn must be 'ones' or 'zeros', got {init_fn}"
+        assert num_channels % num_groups == 0, (
+            f"num_channels ({num_channels}) must be divisible by num_groups ({num_groups})"
+        )
+        self.num_channels = num_channels
+        self.num_groups = num_groups
+        self.variance_epsilon = eps
+        if init_fn == "ones":
+            weight_init = torch.ones(num_channels)
+        else:
+            weight_init = torch.zeros(num_channels)
+        self.weight = nn.Parameter(weight_init)
+        # NOTE(review): bias=True starts the bias from a standard-normal sample instead of zeros — confirm this is intended.
+        if bias:
+            bias_init = torch.randn(num_channels)
+        else:
+            bias_init = torch.zeros(num_channels)
+        self.bias = nn.Parameter(bias_init)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Validate rank and channel count (dimension 1), then apply the fused GroupNorm function."""
+        assert hidden_states.dim() >= 3, f"Input must have at least 3 dimensions, got {hidden_states.dim()}"
+        assert hidden_states.size(1) == self.num_channels, (
+            f"Input must have {self.num_channels} channels, got {hidden_states.size(1)}"
+        )
+        norm_args = (
+            hidden_states,
+            self.weight,
+            self.bias,
+            self.num_channels,
+            self.num_groups,
+            self.variance_epsilon,
+        )
+        return LigerGroupNormFunction.apply(*norm_args)
+    def extra_repr(self) -> str:
+        """Return the configuration summary shown inside ``repr(module)``."""
+        return f"num_channels={self.num_channels}, num_groups={self.num_groups}, eps={self.variance_epsilon}"
+class LigerDyT(nn.Module):
+    """DyT layer that dispatches to ``LigerDyTFunction``.
+    Holds a one-element ``alpha`` (initialised to ``init_alpha``), a ``gamma`` vector of
+    ones, and, when ``beta`` is true, a ``beta`` vector of zeros; otherwise ``beta`` is ``None``.
+    """
+    def __init__(self, hidden_size: int, beta: bool = True, init_alpha: float = 0.5):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.init_alpha = init_alpha
+        alpha_init = torch.ones(1) * init_alpha
+        self.alpha = nn.Parameter(alpha_init)
+        self.gamma = nn.Parameter(torch.ones(hidden_size))
+        if beta:
+            self.beta = nn.Parameter(torch.zeros(hidden_size))
+        else:
+            self.beta = None
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the fused DyT function to ``x`` with this module's parameters."""
+        dyt_args = (x, self.alpha, self.gamma, self.beta)
+        return LigerDyTFunction.apply(*dyt_args)
+    def extra_repr(self) -> str:
+        """Return the configuration summary shown inside ``repr(module)``."""
+        has_beta = self.beta is not None
+        return f"{self.hidden_size}, init_alpha={self.init_alpha}, beta={has_beta}"
+class LigerCrossEntropyLoss(nn.Module):
+    """Cross-entropy loss module that dispatches to ``LigerCrossEntropyFunction``.
+    Only the loss (the first of the four values returned by the function) is exposed.
+    """
+    def __init__(
+        self,
+        weight: Optional[torch.Tensor] = None,
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+    ):
+        super().__init__()
+        # Validate the configuration up front, in the same order as the fields are declared.
+        assert 0.0 <= label_smoothing <= 1.0, f"label_smoothing must be in [0, 1], got {label_smoothing}"
+        assert reduction in ("mean", "sum", "none"), f"reduction must be 'mean', 'sum', or 'none', got {reduction}"
+        assert softcap is None or softcap > 0, f"softcap must be > 0 or None, got {softcap}"
+        self.weight = weight
+        self.ignore_index = ignore_index
+        self.lse_square_scale = lse_square_scale
+        self.label_smoothing = label_smoothing
+        self.reduction = reduction
+        self.softcap = softcap
+    def forward(self, _input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        """Compute the loss for ``_input`` against ``target`` and return it."""
+        ce_args = (
+            _input,
+            target,
+            self.weight,
+            self.ignore_index,
+            self.lse_square_scale,
+            self.label_smoothing,
+            self.reduction,
+            self.softcap,
+            # Three disabled flags; presumably the optional extra outputs — TODO confirm against LigerCrossEntropyFunction.
+            False,
+            False,
+            False,
+        )
+        outputs = LigerCrossEntropyFunction.apply(*ce_args)
+        loss, _, _, _ = outputs
+        return loss
+class LigerFusedLinearCrossEntropyLoss(nn.Module):
+    """Loss module that dispatches to ``LigerFusedLinearCrossEntropyFunction``.
+    The linear weight (and optional bias) are supplied at call time; only the loss (the
+    first of the four values returned by the function) is exposed.
+    """
+    def __init__(
+        self,
+        ce_weight: Optional[torch.Tensor] = None,
+        ignore_index: int = -100,
+        lse_square_scale: float = 0.0,
+        label_smoothing: float = 0.0,
+        reduction: str = "mean",
+        softcap: Optional[float] = None,
+        accum_dtype: Optional[torch.dtype] = None,
+        use_token_scaling: bool = False,
+    ):
+        super().__init__()
+        # Validate the configuration up front, in the same order as the fields are declared.
+        assert 0.0 <= label_smoothing <= 1.0, f"label_smoothing must be in [0, 1], got {label_smoothing}"
+        assert reduction in ("mean", "sum", "none"), f"reduction must be 'mean', 'sum', or 'none', got {reduction}"
+        assert softcap is None or softcap > 0, f"softcap must be > 0 or None, got {softcap}"
+        self.ce_weight = ce_weight
+        self.ignore_index = ignore_index
+        self.lse_square_scale = lse_square_scale
+        self.label_smoothing = label_smoothing
+        self.reduction = reduction
+        self.softcap = softcap
+        self.accum_dtype = accum_dtype
+        self.use_token_scaling = use_token_scaling
+    def forward(
+        self,
+        lin_weight: torch.Tensor,
+        _input: torch.Tensor,
+        target: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Compute the fused loss and return it.
+        Note the argument order: ``lin_weight`` comes first here, but ``_input`` is passed
+        first to the underlying function.
+        """
+        fused_args = (
+            _input,
+            lin_weight,
+            target,
+            bias,
+            self.ce_weight,
+            self.ignore_index,
+            self.lse_square_scale,
+            self.label_smoothing,
+            self.reduction,
+            self.softcap,
+            False,
+            self.accum_dtype,
+            self.use_token_scaling,
+            False,
+            False,
+        )
+        outputs = LigerFusedLinearCrossEntropyFunction.apply(*fused_args)
+        loss, _, _, _ = outputs
+        return loss
+class LigerJSD(nn.Module):
+    """Module that dispatches to ``LigerJSDFunction`` with a stored ``beta`` and ``ignore_index``."""
+    def __init__(self, beta: float = 0.5, ignore_index: int = -100):
+        super().__init__()
+        self.beta, self.ignore_index = beta, ignore_index
+    def forward(
+        self,
+        log_q: torch.Tensor,
+        log_p: torch.Tensor,
+        shift_labels: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward ``log_q``, ``log_p`` and the optional ``shift_labels`` to the fused function."""
+        jsd_args = (log_q, log_p, shift_labels, self.beta, self.ignore_index)
+        return LigerJSDFunction.apply(*jsd_args)
+class LigerKLDIVLoss(nn.KLDivLoss):
+    """``nn.KLDivLoss`` subclass whose forward pass dispatches to ``LigerKLDivLossFunction``.
+    ``eps`` is stored on the module; all remaining arguments go to ``nn.KLDivLoss``.
+    """
+    def __init__(self, eps: float = 1e-10, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.eps = eps
+    def forward(self, y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
+        """Forward the inputs together with the inherited ``reduction`` / ``log_target`` and ``eps``."""
+        kl_args = (y_pred, y_true, self.reduction, self.log_target, self.eps)
+        return LigerKLDivLossFunction.apply(*kl_args)
+class LigerTVDLoss(nn.Module):
+    """Module that dispatches to ``LigerTVDLossFunction`` with a stored ``reduction`` and ``ignore_index``."""
+    def __init__(self, reduction: str = "batchmean", ignore_index: int = -100):
+        super().__init__()
+        self.reduction, self.ignore_index = reduction, ignore_index
+    def forward(
+        self,
+        p: torch.Tensor,
+        q: torch.Tensor,
+        shift_labels: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward ``p``, ``q`` and the optional ``shift_labels`` to the fused function."""
+        tvd_args = (p, q, shift_labels, self.reduction, self.ignore_index)
+        return LigerTVDLossFunction.apply(*tvd_args)
+class LigerSwiGLUMLP(nn.Module):
+    """SwiGLU MLP block built from three bias-free linear layers.
+    ``config`` must expose ``hidden_size``, ``intermediate_size`` and ``hidden_act``;
+    ``hidden_act`` has to be ``silu`` or ``swish``, otherwise ``ValueError`` is raised.
+    """
+    def __init__(self, config):
+        super().__init__()
+        activation = config.hidden_act
+        if activation != "silu" and activation != "swish":
+            raise ValueError(f"Activation function {config.hidden_act} not supported.")
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        narrow, wide = self.hidden_size, self.intermediate_size
+        # Creation order is kept (gate, up, down) so random initialisation is unchanged.
+        self.gate_proj = nn.Linear(narrow, wide, bias=False)
+        self.up_proj = nn.Linear(narrow, wide, bias=False)
+        self.down_proj = nn.Linear(wide, narrow, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project ``x`` through gate/up, fuse them with ``LigerSiLUMulFunction``, then project down."""
+        gate = self.gate_proj(x)
+        up = self.up_proj(x)
+        return self.down_proj(LigerSiLUMulFunction.apply(gate, up))
+class LigerGEGLUMLP(nn.Module):
+    """GEGLU MLP block built from three bias-free linear layers.
+    ``config`` must expose ``hidden_size`` and ``intermediate_size``.
+    Uses the tanh approximation of GELU (matches Gemma 1/1.1/2).
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        narrow, wide = self.hidden_size, self.intermediate_size
+        # Creation order is kept (gate, up, down) so random initialisation is unchanged.
+        self.gate_proj = nn.Linear(narrow, wide, bias=False)
+        self.up_proj = nn.Linear(narrow, wide, bias=False)
+        self.down_proj = nn.Linear(wide, narrow, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project ``x`` through gate/up, fuse them with ``LigerGELUMulFunction``, then project down."""
+        gate = self.gate_proj(x)
+        up = self.up_proj(x)
+        return self.down_proj(LigerGELUMulFunction.apply(gate, up))
+@dataclass
+class CrossEntropyOutput:
+    """Result bundle returned by the fused linear cross-entropy helpers in this module
+    when at least one optional output is requested."""
+    # The loss is the only required field.
+    loss: torch.Tensor
+    # Optional extras; each defaults to None.
+    z_loss: Optional[torch.Tensor] = None
+    token_accuracy: Optional[torch.Tensor] = None
+    predicted_tokens: Optional[torch.Tensor] = None
+def liger_fused_linear_cross_entropy(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    target: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    ce_weight: Optional[torch.Tensor] = None,
+    ignore_index: int = -100,
+    lse_square_scale: float = 0.0,
+    label_smoothing: float = 0.0,
+    reduction: str = "mean",
+    softcap: Optional[float] = None,
+    return_z_loss: bool = False,
+    accum_dtype: Optional[torch.dtype] = None,
+    use_token_scaling: bool = False,
+    return_token_accuracy: bool = False,
+    return_predicted_tokens: bool = False,
+):
+    """Functional wrapper around ``LigerFusedLinearCrossEntropyFunction``.
+    All arguments are forwarded positionally, in signature order. Returns the bare loss
+    when none of the ``return_*`` flags is set; otherwise returns a
+    :class:`CrossEntropyOutput` carrying all four values produced by the function.
+    """
+    fused_args = (
+        input,
+        weight,
+        target,
+        bias,
+        ce_weight,
+        ignore_index,
+        lse_square_scale,
+        label_smoothing,
+        reduction,
+        softcap,
+        return_z_loss,
+        accum_dtype,
+        use_token_scaling,
+        return_token_accuracy,
+        return_predicted_tokens,
+    )
+    loss, z_loss, token_accuracy, predicted_tokens = LigerFusedLinearCrossEntropyFunction.apply(*fused_args)
+    wants_extras = return_z_loss or return_token_accuracy or return_predicted_tokens
+    if wants_extras:
+        return CrossEntropyOutput(
+            loss=loss,
+            z_loss=z_loss,
+            token_accuracy=token_accuracy,
+            predicted_tokens=predicted_tokens,
+        )
+    return loss
+def LigerForCausalLMLoss(
+    hidden_states: torch.Tensor,
+    lm_head_weight: torch.Tensor,
+    labels: torch.Tensor,
+    hidden_size: int,
+    num_items_in_batch: Optional[int] = None,
+    ignore_index: int = -100,
+    shift_labels: Optional[torch.Tensor] = None,
+    final_logit_softcapping: Optional[float] = None,
+    return_token_accuracy: bool = False,
+    return_predicted_tokens: bool = False,
+    **kwargs,
+):
+    """Drop-in replacement for ``transformers.loss.ForCausalLMLoss`` that fuses the
+    final ``lm_head`` projection with the cross-entropy loss. Returns a scalar
+    ``loss`` by default; returns a :class:`CrossEntropyOutput` when
+    ``return_token_accuracy`` or ``return_predicted_tokens`` is set.
+    Args:
+        hidden_states: Final hidden states; flattened to ``(-1, hidden_size)``.
+        lm_head_weight: Weight of the output projection.
+        labels: Target ids; shifted left by one here unless ``shift_labels`` is given.
+        hidden_size: Size of the last dimension of ``hidden_states``.
+        num_items_in_batch: When given, the loss is summed and divided by this value
+            instead of being averaged by the kernel.
+        ignore_index: Label value excluded from the loss; also used to pad the shift.
+        shift_labels: Pre-shifted labels; skips the internal shift when provided.
+        final_logit_softcapping: Forwarded as ``softcap``.
+        return_token_accuracy: Request token accuracy in the result.
+        return_predicted_tokens: Request predicted tokens in the result.
+        **kwargs: Extra options; only those accepted by
+            ``liger_fused_linear_cross_entropy`` and not already set by this function
+            are forwarded, the rest are ignored.
+    """
+    # Names this function passes explicitly below. Letting them through from **kwargs
+    # as well would raise "got multiple values for argument", so they are filtered out.
+    bound_here = (
+        "input",
+        "weight",
+        "target",
+        "reduction",
+        "ignore_index",
+        "softcap",
+        "return_token_accuracy",
+        "return_predicted_tokens",
+    )
+    applicable_params = inspect.signature(liger_fused_linear_cross_entropy).parameters
+    kwargs = {k: v for k, v in kwargs.items() if k in applicable_params and k not in bound_here}
+    if shift_labels is None:
+        # Pad one ignore_index on the right, then drop the first token: position t predicts token t + 1.
+        labels = nn.functional.pad(labels, (0, 1), value=ignore_index)
+        shift_labels = labels[..., 1:].contiguous()
+    # reshape (not view) so non-contiguous inputs are accepted as well.
+    hidden_states = hidden_states.reshape(-1, hidden_size)
+    shift_labels = shift_labels.reshape(-1).to(hidden_states.device)
+    reduction = "sum" if num_items_in_batch is not None else "mean"
+    result = liger_fused_linear_cross_entropy(
+        hidden_states,
+        lm_head_weight,
+        shift_labels,
+        reduction=reduction,
+        ignore_index=ignore_index,
+        softcap=final_logit_softcapping,
+        return_token_accuracy=return_token_accuracy,
+        return_predicted_tokens=return_predicted_tokens,
+        **kwargs,
+    )
+    if isinstance(result, CrossEntropyOutput):
+        loss = result.loss
+        token_accuracy = result.token_accuracy
+        predicted_tokens = result.predicted_tokens
+    else:
+        loss = result
+        token_accuracy = None
+        predicted_tokens = None
+    if reduction == "sum":
+        # A tensor count may live on another device than the loss; align it before dividing.
+        if torch.is_tensor(num_items_in_batch):
+            num_items_in_batch = num_items_in_batch.to(loss.device)
+        loss = loss / num_items_in_batch
+    if return_token_accuracy or return_predicted_tokens:
+        return CrossEntropyOutput(
+            loss=loss,
+            token_accuracy=token_accuracy,
+            predicted_tokens=predicted_tokens,
+        )
+    return loss
+def liger_rotary_pos_emb(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    position_ids: Optional[torch.Tensor] = None,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply standard rotary positional embedding to ``q`` and ``k``.
+    Every argument is forwarded positionally, unchanged, to ``LigerRopeFunction``.
+    """
+    rope_args = (q, k, cos, sin, position_ids, unsqueeze_dim)
+    return LigerRopeFunction.apply(*rope_args)
+def liger_multimodal_rotary_pos_emb(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    mrope_section,
+    unsqueeze_dim: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply Qwen2-VL multimodal rotary positional embedding (M-RoPE) to ``q`` and ``k``.
+    Every argument is forwarded positionally, unchanged, to ``LigerQwen2VLMRopeFunction``.
+    """
+    mrope_args = (q, k, cos, sin, mrope_section, unsqueeze_dim)
+    return LigerQwen2VLMRopeFunction.apply(*mrope_args)
+# Names exported by ``from .layers import *``: the nn.Module wrappers first, then the
+# functional helpers and the CrossEntropyOutput result type.
+__all__ = [
+    "LigerRMSNorm",
+    "LigerLayerNorm",
+    "LigerGroupNorm",
+    "LigerDyT",
+    "LigerCrossEntropyLoss",
+    "LigerFusedLinearCrossEntropyLoss",
+    "LigerJSD",
+    "LigerKLDIVLoss",
+    "LigerTVDLoss",
+    "LigerSwiGLUMLP",
+    "LigerGEGLUMLP",
+    "CrossEntropyOutput",
+    "liger_fused_linear_cross_entropy",
+    "LigerForCausalLMLoss",
+    "liger_rotary_pos_emb",
+    "liger_multimodal_rotary_pos_emb",
+]

build/torch-cuda/metadata.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "liger-kernels",
-  "id": "_liger_kernels_cuda_e29f7ec",
   "version": 1,
   "license": "BSD-2-Clause",
   "python-depends": [],

 {
   "name": "liger-kernels",
+  "id": "_liger_kernels_cuda_08b4d53",
   "version": 1,
   "license": "BSD-2-Clause",
   "python-depends": [],

build/torch-cuda/qwen2vl_mrope.py CHANGED Viewed

@@ -219,4 +219,4 @@ class LigerQwen2VLMRopeFunction(torch.autograd.Function):
         cos, sin = ctx.saved_tensors
         mrope_section = ctx.mrope_section
         dq, dk = qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section)
-        return dq, dk, None, None, None, None

         cos, sin = ctx.saved_tensors
         mrope_section = ctx.mrope_section
         dq, dk = qwen2vl_mrope_backward(dq, dk, cos, sin, mrope_section)
+        return dq, dk, None, None, None, None

build/torch-cuda/rms_norm.py CHANGED Viewed

@@ -20,9 +20,12 @@ import triton.language as tl
 from .utils import calculate_settings
 from .utils import compare_version
 from .utils import ensure_contiguous
 from .utils import torch_to_triton_dtype
-if compare_version("triton", operator.ge, "3.0.0"):
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
@@ -52,6 +55,7 @@ def _rms_norm_forward_kernel(
     eps,
     offset,
     casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
     BLOCK_SIZE: tl.constexpr,
 ):
     """
@@ -63,17 +67,18 @@ def _rms_norm_forward_kernel(
     3. https://arxiv.org/pdf/1910.07467
     """
-    row_idx = tl.program_id(0)
     col_offsets = tl.arange(0, BLOCK_SIZE)
     mask = col_offsets < n_cols
-    Y_ptr += row_idx * Y_row_stride
-    X_ptr += row_idx * X_row_stride
-    RSTD_ptr += row_idx * RSTD_row_stride
-    X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
     X_row_dtype = X_row.dtype
-    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)
     # On Llama, only rstd is computed on fp32
     if casting_mode == _CASTING_MODE_LLAMA:
@@ -81,7 +86,8 @@ def _rms_norm_forward_kernel(
     # Gemma computes everything on fp32, and then casts back the output to the original dtype
     if casting_mode == _CASTING_MODE_GEMMA:
-        W_row = W_row.to(tl.float32)
         X_row = X_row.to(tl.float32)
     if casting_mode == _CASTING_MODE_NONE:
@@ -94,7 +100,7 @@ def _rms_norm_forward_kernel(
     # We can save time by caching rms with minimal memory overhead
     # because rms is much smaller compared to X_row, as rms is for each row.
     # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
-    tl.store(RSTD_ptr, rstd)
     X_row = X_row * rstd
@@ -102,12 +108,15 @@ def _rms_norm_forward_kernel(
     if casting_mode == _CASTING_MODE_LLAMA:
         X_row = X_row.to(X_row_dtype)
-    Y_row = X_row * (offset + W_row)
     if casting_mode == _CASTING_MODE_GEMMA:
         Y_row = Y_row.to(X_row_dtype)
-    tl.store(Y_ptr + col_offsets, Y_row, mask=mask)
 @triton.jit
@@ -128,8 +137,9 @@ def _rms_norm_backward_kernel(
     n_rows,
     n_cols,
     offset,
-    rows_per_program: tl.constexpr,
     casting_mode: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
 ):
     """
@@ -137,61 +147,256 @@ def _rms_norm_backward_kernel(
     dw = sum(dy * (x / RMS)). summation over BxT dimension
     """
-    row_block_id = tl.program_id(0)
     row_start = row_block_id * rows_per_program
     row_end = min((row_block_id + 1) * rows_per_program, n_rows)
     col_offsets = tl.arange(0, BLOCK_SIZE)
     mask = col_offsets < n_cols
-    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
-    dY_ptr += row_start * dY_row_stride
-    dX_ptr += row_start * dX_row_stride
-    X_ptr += row_start * X_row_stride
-    RSTD_ptr += row_start
-    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
-    W_row = W_row + offset
-    for _ in range(row_start, row_end):
-        dY_row = tl.load(dY_ptr + col_offsets, mask=mask, other=0.0)
-        X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0.0)
         # Get cached rms
-        rstd_row = tl.load(RSTD_ptr)
         X_row = X_row.to(tl.float32)
         # Different backward graphs for different casting modes
         if casting_mode == _CASTING_MODE_LLAMA:
-            m = (dY_row * W_row).to(tl.float32)
         elif casting_mode == _CASTING_MODE_GEMMA:
             dY_row = dY_row.to(tl.float32)
-            m = dY_row * W_row
         else:
-            m = dY_row * W_row
         dX_row = rstd_row * m
         dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)
-        # calculate the gradient of W
         if casting_mode == _CASTING_MODE_LLAMA:
-            dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
         else:
-            # here X_row is already in fp32 (see previous if block)
-            dW_row += dY_row * (X_row * rstd_row)
-        tl.store(dX_ptr + col_offsets, dX_row.to(X_dtype), mask=mask)
-        dY_ptr += dY_row_stride
-        dX_ptr += dX_row_stride
-        X_ptr += X_row_stride
-        RSTD_ptr += RSTD_row_stride
-    tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
 _str_to_casting_mode = {
@@ -201,7 +406,7 @@ _str_to_casting_mode = {
 }
-def rms_norm_forward(X, W, eps, offset, casting_mode):
     if not isinstance(casting_mode, int):
         assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
         casting_mode = _str_to_casting_mode[casting_mode]
@@ -220,34 +425,64 @@ def rms_norm_forward(X, W, eps, offset, casting_mode):
     rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
     RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)
-    # Check constraints.
-    assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"
     # XPU-specific optimization
     kernel_args = {}
     if X.device.type == "xpu":
-        kernel_args["grf_mode"] = "large"
-    _rms_norm_forward_kernel[(n_rows,)](
-        Y,
-        Y.stride(0),
-        X,
-        X.stride(0),
-        W,
-        W.stride(0),
-        RSTD,
-        RSTD.stride(0),
-        n_cols,
-        eps,
-        offset,
-        casting_mode,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-        **kernel_args,  # XPU-specific optimization
-    )
     return Y.view(*shape), X, RSTD, BLOCK_SIZE, num_warps, casting_mode
-def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
     shape = dY.shape
     dim = shape[-1]
     dY = dY.view(-1, dim)
@@ -258,9 +493,16 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
-    # fp32 for numerical stability especially.
-    _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
     if n_cols > BLOCK_SIZE:
         raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
@@ -275,33 +517,65 @@ def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warp
     # XPU-specific optimization
     kernel_args = {}
     if X.device.type == "xpu":
-        kernel_args["grf_mode"] = "large"
-    _rms_norm_backward_kernel[grid](
-        dY,
-        dY.stride(0),
-        dX,
-        dX.stride(0),
-        X,
-        X.stride(0),
-        torch_to_triton_dtype[X.dtype],
-        W,
-        W.stride(0),
-        RSTD,
-        RSTD.stride(0),
-        _dW,
-        _dW.stride(0),
-        n_rows,
-        n_cols,
-        offset,
-        rows_per_program,
-        casting_mode,
-        BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps,
-        **kernel_args,  # XPU-specific optimization
-    )
     dX = dX.view(*shape)
-    dW = _dW.sum(dim=0).to(W.dtype)
     return dX, dW
@@ -330,18 +604,30 @@ class LigerRMSNormFunction(torch.autograd.Function):
     @staticmethod
     @ensure_contiguous
-    def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama", in_place=True):
         """
         X: (B, T, H) or (BxT, H)
         W: (H,)
         """
-        Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode)
         ctx.offset = offset
         ctx.casting_mode = casting_mode
         ctx.in_place = in_place
         ctx.BLOCK_SIZE = BLOCK_SIZE
         ctx.num_warps = num_warps
-        ctx.save_for_backward(X, W, RSTD)
         return Y
     @staticmethod
@@ -350,16 +636,19 @@ class LigerRMSNormFunction(torch.autograd.Function):
         """
         Y: (B, T, H) or (BxT, H)
         """
-        X, W, RSTD = ctx.saved_tensors
         dX, dW = rms_norm_backward(
-            dY,
-            X,
-            W,
-            RSTD,
-            ctx.offset,
-            ctx.casting_mode,
-            ctx.BLOCK_SIZE,
-            ctx.num_warps,
-            ctx.in_place,
         )
-        return dX, dW, None, None, None, None

 from .utils import calculate_settings
 from .utils import compare_version
 from .utils import ensure_contiguous
+from .utils import get_npu_core_count
+from .utils import set_large_grf_mode
 from .utils import torch_to_triton_dtype
+from .utils import is_npu_available
+if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
     try:
         # typical import path with dispatch available
         from triton.language.extra.libdevice import rsqrt
     eps,
     offset,
     casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
+    elementwise_affine: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
 ):
     """
     3. https://arxiv.org/pdf/1910.07467
     """
+    row_idx = tl.program_id(0).to(tl.int64)
     col_offsets = tl.arange(0, BLOCK_SIZE)
     mask = col_offsets < n_cols
+    y_base = Y_ptr + row_idx * Y_row_stride
+    x_base = X_ptr + row_idx * X_row_stride
+    rstd_base = RSTD_ptr + row_idx * RSTD_row_stride
+    X_row = tl.load(x_base + col_offsets, mask=mask, other=0)
     X_row_dtype = X_row.dtype
+    if elementwise_affine:
+        W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)
     # On Llama, only rstd is computed on fp32
     if casting_mode == _CASTING_MODE_LLAMA:
     # Gemma computes everything on fp32, and then casts back the output to the original dtype
     if casting_mode == _CASTING_MODE_GEMMA:
+        if elementwise_affine:
+            W_row = W_row.to(tl.float32)
         X_row = X_row.to(tl.float32)
     if casting_mode == _CASTING_MODE_NONE:
     # We can save time by caching rms with minimal memory overhead
     # because rms is much smaller compared to X_row, as rms is for each row.
     # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
+    tl.store(rstd_base, rstd)
     X_row = X_row * rstd
     if casting_mode == _CASTING_MODE_LLAMA:
         X_row = X_row.to(X_row_dtype)
+    if elementwise_affine:
+        Y_row = X_row * (offset + W_row)
+    else:
+        Y_row = X_row
     if casting_mode == _CASTING_MODE_GEMMA:
         Y_row = Y_row.to(X_row_dtype)
+    tl.store(y_base + col_offsets, Y_row, mask=mask)
 @triton.jit
     n_rows,
     n_cols,
     offset,
+    rows_per_program,
     casting_mode: tl.constexpr,
+    elementwise_affine: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
 ):
     """
     dw = sum(dy * (x / RMS)). summation over BxT dimension
     """
+    row_block_id = tl.program_id(0).to(tl.int64)
     row_start = row_block_id * rows_per_program
     row_end = min((row_block_id + 1) * rows_per_program, n_rows)
     col_offsets = tl.arange(0, BLOCK_SIZE)
     mask = col_offsets < n_cols
+    if elementwise_affine:
+        dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    if elementwise_affine:
+        W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
+        W_row = W_row + offset
+    for row_idx in range(row_start, row_end):
+        dy_base = dY_ptr + row_idx * dY_row_stride
+        dx_base = dX_ptr + row_idx * dX_row_stride
+        x_base = X_ptr + row_idx * X_row_stride
+        rstd_base = RSTD_ptr + row_idx * RSTD_row_stride
+        dY_row = tl.load(dy_base + col_offsets, mask=mask, other=0.0)
+        X_row = tl.load(x_base + col_offsets, mask=mask, other=0.0)
         # Get cached rms
+        rstd_row = tl.load(rstd_base)
         X_row = X_row.to(tl.float32)
         # Different backward graphs for different casting modes
         if casting_mode == _CASTING_MODE_LLAMA:
+            if elementwise_affine:
+                m = (dY_row * W_row).to(tl.float32)
+            else:
+                m = dY_row.to(tl.float32)
         elif casting_mode == _CASTING_MODE_GEMMA:
             dY_row = dY_row.to(tl.float32)
+            if elementwise_affine:
+                m = dY_row * W_row
+            else:
+                m = dY_row
         else:
+            if elementwise_affine:
+                m = dY_row * W_row
+            else:
+                m = dY_row
         dX_row = rstd_row * m
         dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)
+        if elementwise_affine:
+            # calculate the gradient of W
+            if casting_mode == _CASTING_MODE_LLAMA:
+                dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
+            else:
+                # here X_row is already in fp32 (see previous if block)
+                dW_row += dY_row * (X_row * rstd_row)
+        tl.store(dx_base + col_offsets, dX_row.to(X_dtype), mask=mask)
+    if elementwise_affine:
+        tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
+@triton.jit
+def _block_rms_norm_forward_kernel(
+    Y_ptr,
+    Y_row_stride,
+    X_ptr,
+    X_row_stride,
+    W_ptr,
+    W_row_stride,
+    RSTD_ptr,
+    RSTD_row_stride,
+    n_rows,
+    n_cols,
+    eps,
+    offset,
+    casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
+    elementwise_affine: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BLOCK_ROW: tl.constexpr,
+):
+    """
+    y_i = (x_i / (RMS)) * (offset + wi), RMS = sqrt(sum(x_i^2) / N)
+    Reference:
+    1. https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+    2. https://github.com/unslothai/unsloth/blob/fd753fed99ed5f10ef8a9b7139588d9de9ddecfb/unsloth/kernels/rms_layernorm.py#L22
+    3. https://arxiv.org/pdf/1910.07467
+    """
+    row_idx = tl.program_id(0) * BLOCK_ROW + tl.arange(0, BLOCK_ROW)
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    row_mask = row_idx < n_rows
+    col_mask = col_offsets < n_cols
+    X_row = tl.load(
+        X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :],
+        mask=row_mask[:, None] & col_mask[None, :],
+        other=0,
+    )
+    X_row_dtype = X_row.dtype
+    if elementwise_affine:
+        W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0)
+    # On Llama, only rstd is computed on fp32
+    if casting_mode == _CASTING_MODE_LLAMA:
+        X_row = X_row.to(tl.float32)
+    # Gemma computes everything on fp32, and then casts back the output to the original dtype
+    if casting_mode == _CASTING_MODE_GEMMA:
+        if elementwise_affine:
+            W_row = W_row.to(tl.float32)
+        X_row = X_row.to(tl.float32)
+    if casting_mode == _CASTING_MODE_NONE:
+        eps = eps.to(X_row_dtype)
+        offset = offset.to(X_row_dtype)
+    mean_square = tl.sum(X_row * X_row, axis=1) / n_cols
+    rstd = rsqrt(mean_square + eps)
+    # We can save time by caching rms with minimal memory overhead
+    # because rms is much smaller compared to X_row, as rms is for each row.
+    # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
+    tl.store(RSTD_ptr + row_idx * RSTD_row_stride, rstd, row_mask)
+    X_row = X_row * rstd[:, None]
+    # On Llama, the multiplication with the weight is done on the original dtype
+    if casting_mode == _CASTING_MODE_LLAMA:
+        X_row = X_row.to(X_row_dtype)
+    if elementwise_affine:
+        Y_row = X_row * (offset + W_row)[None, :]
+    else:
+        Y_row = X_row
+    if casting_mode == _CASTING_MODE_GEMMA:
+        Y_row = Y_row.to(X_row_dtype)
+    tl.store(
+        Y_ptr + row_idx[:, None] * Y_row_stride + col_offsets[None, :],
+        Y_row,
+        mask=row_mask[:, None] & col_mask[None, :],
+    )
+@triton.jit
+def _block_rms_norm_backward_kernel(
+    dY_ptr,
+    dY_row_stride,
+    dX_ptr,
+    dX_row_stride,
+    X_ptr,
+    X_row_stride,
+    X_dtype: tl.constexpr,
+    W_ptr,
+    W_row_stride,
+    RSTD_ptr,
+    RSTD_row_stride,
+    dW_ptr,
+    dW_row_stride,
+    n_rows,
+    n_cols,
+    offset,
+    casting_mode: tl.constexpr,
+    elementwise_affine: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    BLOCK_ROW: tl.constexpr,
+):
+    """
+    dx = (1 / RMS) * [dy * (w + offset - (1 / N) * (1 / RMS^2) * ((dy * (w + offset)) dot x) * x]. * means element-wise multiplication, whileas dot means dot product
+    dw = sum(dy * (x / RMS)). summation over BxT dimension
+    """
+    pid = tl.program_id(0).cast(tl.int64)
+    NUM_SMS = tl.num_programs(0)
+    col_offsets = tl.arange(0, BLOCK_SIZE)
+    col_mask = col_offsets < n_cols
+    if elementwise_affine:
+        dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+        W_row = tl.load(W_ptr + col_offsets, mask=col_mask, other=0.0)
+        W_row = W_row + offset
+    for start in range(pid * BLOCK_ROW, n_rows, NUM_SMS * BLOCK_ROW):
+        row_idx = start + tl.arange(0, BLOCK_ROW)
+        row_mask = row_idx < n_rows
+        dY_row = tl.load(
+            dY_ptr + row_idx[:, None] * dY_row_stride + col_offsets[None, :],
+            mask=row_mask[:, None] & col_mask[None, :],
+            other=0.0,
+        )
+        X_row = tl.load(
+            X_ptr + row_idx[:, None] * X_row_stride + col_offsets[None, :],
+            mask=row_mask[:, None] & col_mask[None, :],
+            other=0.0,
+        )
+        # Get cached rms
+        rstd_row = tl.load(RSTD_ptr + row_idx * RSTD_row_stride, row_mask)
+        X_row = X_row.to(tl.float32)
+        # Different bacward graphs for different casting modes
         if casting_mode == _CASTING_MODE_LLAMA:
+            if elementwise_affine:
+                m = (dY_row * W_row[None, :]).to(tl.float32)
+            else:
+                m = dY_row.to(tl.float32)
+        elif casting_mode == _CASTING_MODE_GEMMA:
+            dY_row = dY_row.to(tl.float32)
+            if elementwise_affine:
+                m = dY_row * W_row[None, :]
+            else:
+                m = dY_row
         else:
+            if elementwise_affine:
+                m = dY_row * W_row[None, :]
+            else:
+                m = dY_row
+        dX_row = rstd_row[:, None] * m
+        dX_row += (rstd_row[:, None]) * (
+            -(1 / n_cols) * (rstd_row * rstd_row * tl.sum(m * X_row, axis=1))[:, None] * X_row
+        )
+        if elementwise_affine:
+            if casting_mode == _CASTING_MODE_LLAMA:
+                # TODO(tcc): use tl.sum(..., dtype=tl.float32) once we upgrade to triton>=3.3.0
+                dW_row += tl.sum((dY_row * (X_row * rstd_row[:, None]).to(X_dtype)).to(tl.float32), 0)
+            else:
+                # here X_row is already in fp32 (see previous if block)
+                dW_row += tl.sum(dY_row * (X_row * rstd_row[:, None]), 0)
+        tl.store(
+            dX_ptr + row_idx[:, None] * dX_row_stride + col_offsets[None, :],
+            dX_row,
+            mask=row_mask[:, None] & col_mask[None, :],
+        )
+    if elementwise_affine:
+        tl.store(dW_ptr + pid * dW_row_stride + col_offsets, dW_row, mask=col_mask)
 _str_to_casting_mode = {
 }
+def rms_norm_forward(X, W, eps, offset, casting_mode, row_mode):
     if not isinstance(casting_mode, int):
         assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
         casting_mode = _str_to_casting_mode[casting_mode]
     rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
     RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)
+    if W is not None:
+        # Check constraints.
+        assert X.shape[1] == W.shape[0], (
+            "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"
+        )
+        elementwise_affine = True
+    else:
+        elementwise_affine = False
     # XPU-specific optimization
     kernel_args = {}
     if X.device.type == "xpu":
+        set_large_grf_mode(kernel_args)
+    if BLOCK_SIZE > 256 or n_rows < 4096 * 8 or row_mode:
+        _rms_norm_forward_kernel[(n_rows,)](
+            Y,
+            Y.stride(0),
+            X,
+            X.stride(0),
+            W,
+            W.stride(0) if elementwise_affine else 0,
+            RSTD,
+            RSTD.stride(0),
+            n_cols,
+            eps,
+            offset,
+            casting_mode,
+            elementwise_affine=elementwise_affine,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+            **kernel_args,  # XPU-specific optimization
+        )
+    else:
+        BLOCK_ROW = 16
+        kernel_args["BLOCK_ROW"] = BLOCK_ROW
+        _block_rms_norm_forward_kernel[(triton.cdiv(n_rows, BLOCK_ROW),)](
+            Y,
+            Y.stride(0),
+            X,
+            X.stride(0),
+            W,
+            W.stride(0) if elementwise_affine else 0,
+            RSTD,
+            RSTD.stride(0),
+            n_rows,
+            n_cols,
+            eps,
+            offset,
+            casting_mode,
+            elementwise_affine=elementwise_affine,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+            **kernel_args,  # XPU-specific optimization
+        )
     return Y.view(*shape), X, RSTD, BLOCK_SIZE, num_warps, casting_mode
+def rms_norm_backward(dY, X, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place, row_mode):
     shape = dY.shape
     dim = shape[-1]
     dY = dY.view(-1, dim)
         sm_count = torch.cuda.get_device_properties(X.device).multi_processor_count
     elif X.device.type == "xpu":
         sm_count = torch.xpu.get_device_properties(X.device).gpu_eu_count
+    elif X.device.type == "npu":
+        sm_count = get_npu_core_count()
+    if W is not None:
+        # fp32 for numerical stability especially.
+        _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)
+        elementwise_affine = True
+    else:
+        _dW = None
+        elementwise_affine = False
     if n_cols > BLOCK_SIZE:
         raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
     # XPU-specific optimization
     kernel_args = {}
     if X.device.type == "xpu":
+        set_large_grf_mode(kernel_args)
+    if BLOCK_SIZE > 256 or n_rows < 4096 * 8 or row_mode:
+        _rms_norm_backward_kernel[grid](
+            dY,
+            dY.stride(0),
+            dX,
+            dX.stride(0),
+            X,
+            X.stride(0),
+            torch_to_triton_dtype[X.dtype],
+            W,
+            W.stride(0) if elementwise_affine else 0,
+            RSTD,
+            RSTD.stride(0),
+            _dW,
+            _dW.stride(0) if elementwise_affine else 0,
+            n_rows,
+            n_cols,
+            offset,
+            rows_per_program,
+            casting_mode,
+            elementwise_affine=elementwise_affine,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+            **kernel_args,  # XPU-specific optimization
+        )
+    else:
+        BLOCK_ROW = 16
+        kernel_args["BLOCK_ROW"] = BLOCK_ROW
+        _block_rms_norm_backward_kernel[grid](
+            dY,
+            dY.stride(0),
+            dX,
+            dX.stride(0),
+            X,
+            X.stride(0),
+            torch_to_triton_dtype[X.dtype],
+            W,
+            W.stride(0) if elementwise_affine else 0,
+            RSTD,
+            RSTD.stride(0),
+            _dW,
+            _dW.stride(0) if elementwise_affine else 0,
+            n_rows,
+            n_cols,
+            offset,
+            casting_mode,
+            elementwise_affine=elementwise_affine,
+            BLOCK_SIZE=BLOCK_SIZE,
+            num_warps=num_warps,
+            **kernel_args,  # XPU-specific optimization
+        )
     dX = dX.view(*shape)
+    if elementwise_affine:
+        dW = _dW.sum(dim=0).to(W.dtype)
+    else:
+        dW = None
     return dX, dW
     @staticmethod
     @ensure_contiguous
+    def forward(ctx, X, W, eps, offset=0.0, casting_mode="llama", in_place=True, row_mode=None):
         """
         X: (B, T, H) or (BxT, H)
         W: (H,)
         """
+        if isinstance(X, torch.distributed.tensor.DTensor):
+            # Input tensor is output of a tensor parallel module and
+            # needs to be gathered to a local tensor to compute
+            # RMS layer norm on each TP worker.
+            # TODO: support CP.
+            X = X.full_tensor()
+        # X and casting_mode are rebound to the values returned by rms_norm_forward.
+        Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode, row_mode)
+        # Keep the non-tensor settings on ctx so that backward can read them back.
         ctx.offset = offset
         ctx.casting_mode = casting_mode
         ctx.in_place = in_place
+        ctx.row_mode = row_mode
         ctx.BLOCK_SIZE = BLOCK_SIZE
         ctx.num_warps = num_warps
+        # W may be None; record that so backward knows how many tensors were saved.
+        ctx.elementwise_affine = W is not None
+        if W is not None:
+            ctx.save_for_backward(X, W, RSTD)
+        else:
+            ctx.save_for_backward(X, RSTD)
         return Y
     @staticmethod
         """
         Y: (B, T, H) or (BxT, H)
         """
+        if ctx.elementwise_affine:
+            X, W, RSTD = ctx.saved_tensors
+        else:
+            X, RSTD = ctx.saved_tensors
+            W = None
+        if isinstance(dY, torch.distributed.tensor.DTensor):
+            # Gradients are output of a tensor parallel module and
+            # needs to be gathered to a local tensor for computing RMSE layer.
+            # TODO: support CP.
+            dY = dY.full_tensor()
         dX, dW = rms_norm_backward(
+            dY, X, W, RSTD, ctx.offset, ctx.casting_mode, ctx.BLOCK_SIZE, ctx.num_warps, ctx.in_place, ctx.row_mode
         )
+        return dX, dW, None, None, None, None, None

build/torch-cuda/rope.py CHANGED Viewed

@@ -32,7 +32,7 @@ def _triton_rope(
     # cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
     # stride: (seq_len * head_dim, head_dim, 1)
-    pid = tl.program_id(0)
     # locate start address
     q_ptr = q_ptr + pid * q_row_stride
@@ -236,4 +236,4 @@ class LigerRopeFunction(torch.autograd.Function):
         cos, sin = ctx.saved_tensors
         dq, dk = rope_backward(dq, dk, cos, sin)
-        return dq, dk, None, None, None, None

     # cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
     # stride: (seq_len * head_dim, head_dim, 1)
+    pid = tl.program_id(0).to(tl.int64)
     # locate start address
     q_ptr = q_ptr + pid * q_row_stride
         cos, sin = ctx.saved_tensors
         dq, dk = rope_backward(dq, dk, cos, sin)
+        return dq, dk, None, None, None, None

build/torch-cuda/swiglu.py CHANGED Viewed

@@ -12,7 +12,9 @@ def silu(x):
 @triton.jit
-def _swiglu_forward_kernel(a_ptr, b_ptr, c_ptr, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr):
     program_id = tl.program_id(0).to(tl.int64)
     # locate start index
@@ -24,14 +26,16 @@ def _swiglu_forward_kernel(a_ptr, b_ptr, c_ptr, stride, n_cols: tl.constexpr, BL
     mask = col_offsets < n_cols
     # sigmoid requires type float32
-    a_row = tl.load(a_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
     b_row = tl.load(b_ptr + col_offsets, mask=mask, other=0)
-    c_row = silu(a_row) * b_row
     tl.store(c_ptr + col_offsets, c_row, mask=mask)
 @triton.jit
-def _swiglu_backward_kernel(dc_ptr, a_ptr, b_ptr, stride, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr):
     program_id = tl.program_id(0).to(tl.int64)
     # locate start index
@@ -44,20 +48,21 @@ def _swiglu_backward_kernel(dc_ptr, a_ptr, b_ptr, stride, n_cols: tl.constexpr,
     dc_row = tl.load(dc_ptr + col_offsets, mask=mask, other=0)
     # sigmoid requires type float32
-    a_row = tl.load(a_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
     b_row = tl.load(b_ptr + col_offsets, mask=mask, other=0)
-    # recomputation to save memory
     sig_a = tl.sigmoid(a_row)
     silu_a = a_row * sig_a
     db_row = dc_row * silu_a
-    da_row = dc_row * (silu_a * (1 - sig_a) + sig_a) * b_row
     tl.store(a_ptr + col_offsets, da_row, mask=mask)
     tl.store(b_ptr + col_offsets, db_row, mask=mask)
-def swiglu_forward(a, b):
     ori_shape = a.shape
     n_cols = ori_shape[-1]
@@ -73,6 +78,7 @@ def swiglu_forward(a, b):
         b,
         c,
         c.stride(-2),
         n_cols=n_cols,
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
@@ -80,7 +86,7 @@ def swiglu_forward(a, b):
     return a, b, c.view(*ori_shape)
-def swiglu_backward(a, b, dc):
     ori_shape = dc.shape
     n_cols = ori_shape[-1]
     dc = dc.view(-1, n_cols)
@@ -93,6 +99,7 @@ def swiglu_backward(a, b, dc):
         a,
         b,
         dc.stride(-2),
         n_cols=n_cols,
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
@@ -103,14 +110,67 @@ def swiglu_backward(a, b, dc):
 class LigerSiLUMulFunction(torch.autograd.Function):
     @staticmethod
     @ensure_contiguous
-    def forward(ctx, a, b):
-        a, b, c = swiglu_forward(a, b)
-        ctx.save_for_backward(a, b)
-        return c
     @staticmethod
     @ensure_contiguous
     def backward(ctx, dc):
         a, b = ctx.saved_tensors
-        a, b = swiglu_backward(a, b, dc)
-        return a, b

 @triton.jit
+def _swiglu_forward_kernel(
+    a_ptr, b_ptr, c_ptr, stride, gate_multiplier, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
+):
     program_id = tl.program_id(0).to(tl.int64)
     # locate start index
     mask = col_offsets < n_cols
     # sigmoid requires type float32
+    a_row = tl.load(a_ptr + col_offsets, mask=mask, other=0).to(tl.float32) * gate_multiplier
     b_row = tl.load(b_ptr + col_offsets, mask=mask, other=0)
+    c_row = silu(a_row).cast(b_row.dtype) * b_row
     tl.store(c_ptr + col_offsets, c_row, mask=mask)
 @triton.jit
+def _swiglu_backward_kernel(
+    dc_ptr, a_ptr, b_ptr, stride, gate_multiplier, n_cols: tl.constexpr, BLOCK_SIZE: tl.constexpr
+):
     program_id = tl.program_id(0).to(tl.int64)
     # locate start index
     dc_row = tl.load(dc_ptr + col_offsets, mask=mask, other=0)
     # sigmoid requires type float32
+    a_row = tl.load(a_ptr + col_offsets, mask=mask, other=0).to(tl.float32) * gate_multiplier
     b_row = tl.load(b_ptr + col_offsets, mask=mask, other=0)
+    # recomputation to save memory. a_row already holds a * gate_multiplier.
     sig_a = tl.sigmoid(a_row)
     silu_a = a_row * sig_a
     db_row = dc_row * silu_a
+    # chain rule pulls an extra factor of gate_multiplier through the pre-activation scaling
+    da_row = dc_row * (silu_a * (1 - sig_a) + sig_a) * b_row * gate_multiplier
     tl.store(a_ptr + col_offsets, da_row, mask=mask)
     tl.store(b_ptr + col_offsets, db_row, mask=mask)
+def swiglu_forward(a, b, gate_multiplier: float = 1.0):
     ori_shape = a.shape
     n_cols = ori_shape[-1]
         b,
         c,
         c.stride(-2),
+        float(gate_multiplier),
         n_cols=n_cols,
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
     return a, b, c.view(*ori_shape)
+def swiglu_backward(a, b, dc, gate_multiplier: float = 1.0):
     ori_shape = dc.shape
     n_cols = ori_shape[-1]
     dc = dc.view(-1, n_cols)
         a,
         b,
         dc.stride(-2),
+        float(gate_multiplier),
         n_cols=n_cols,
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
 class LigerSiLUMulFunction(torch.autograd.Function):
     @staticmethod
     @ensure_contiguous
+    def forward(ctx, a, b, gate_multiplier: float = 1.0, down_multiplier: float = 1.0):
+        gate_multiplier = float(gate_multiplier)
+        down_multiplier = float(down_multiplier)
+        ctx.gate_multiplier = gate_multiplier
+        ctx.down_multiplier = down_multiplier
+        if isinstance(a, torch.distributed.tensor.DTensor) or isinstance(b, torch.distributed.tensor.DTensor):
+            device_mesh, placements = (
+                (a.device_mesh, a.placements)
+                if isinstance(a, torch.distributed.tensor.DTensor)
+                else (b.device_mesh, b.placements)
+            )
+            # Assume that full tensors are gathered before and identical across
+            # the associated process groups.
+            if not isinstance(a, torch.distributed.tensor.DTensor):
+                a = torch.distributed.tensor.distribute_tensor(a, device_mesh=device_mesh, placements=placements)
+            if not isinstance(b, torch.distributed.tensor.DTensor):
+                b = torch.distributed.tensor.distribute_tensor(b, device_mesh=device_mesh, placements=placements)
+            a_local, b_local, c_local = swiglu_forward(a.to_local(), b.to_local(), gate_multiplier)
+            if down_multiplier != 1.0:
+                c_local = c_local * down_multiplier
+            ctx.save_for_backward(a_local, b_local)
+            ctx.dtensor_metadata = (device_mesh, placements)
+            return torch.distributed.tensor.DTensor.from_local(c_local, device_mesh, placements)
+        else:
+            a, b, c = swiglu_forward(a, b, gate_multiplier)
+            if down_multiplier != 1.0:
+                c = c * down_multiplier
+            ctx.save_for_backward(a, b)
+            ctx.dtensor_metadata = None
+            return c
     @staticmethod
     @ensure_contiguous
     def backward(ctx, dc):
         a, b = ctx.saved_tensors
+        gate_multiplier = ctx.gate_multiplier
+        down_multiplier = ctx.down_multiplier
+        if ctx.dtensor_metadata is not None:
+            device_mesh, placements = ctx.dtensor_metadata
+            # Assume that full tensors are gathered before and identical across
+            # the associated process groups.
+            dc_local = (
+                dc.to_local()
+                if isinstance(dc, torch.distributed.tensor.DTensor)
+                else torch.distributed.tensor.distribute_tensor(dc, device_mesh=device_mesh, placements=placements)
+            )
+            if down_multiplier != 1.0:
+                dc_local = dc_local * down_multiplier
+            a_local, b_local = swiglu_backward(a, b, dc_local, gate_multiplier)
+            return (
+                torch.distributed.tensor.DTensor.from_local(a_local, device_mesh, placements),
+                torch.distributed.tensor.DTensor.from_local(b_local, device_mesh, placements),
+                None,
+                None,
+            )
+        if down_multiplier != 1.0:
+            dc = dc * down_multiplier
+        a, b = swiglu_backward(a, b, dc, gate_multiplier)
+        return a, b, None, None

build/torch-cuda/tvd.py CHANGED Viewed

@@ -49,6 +49,7 @@ def _tv_distance_kernel(
     label_ptr,
     ignore_index: tl.constexpr,
     n_cols,
     BLOCK_SIZE: tl.constexpr,
     HAS_LABEL: tl.constexpr,
     reduction: tl.constexpr = _REDUCTION_MODE_BATCHMEAN,
@@ -84,7 +85,8 @@ def _tv_distance_kernel(
         # TVD(P || Q) = 0.5 * |P - Q|
         tv_loss = 0.5 * tl.abs(p - q)
-        grad_res = tl.where(p > q, 0.5, -0.5)
         tl.store(grads_ptr + offsets, grad_res, mask=mask)
@@ -94,7 +96,8 @@ def _tv_distance_kernel(
             loss_sum += tl.sum(tv_loss, axis=0)
     if reduction != _REDUCTION_MODE_NONE:
-        tl.store(loss_ptr, loss_sum)
 def tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label):
@@ -113,6 +116,14 @@ def tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_
     n_non_ignore = (shift_labels != ignore_index).sum().item() if has_label else BT
     _tv_distance_kernel[grid](
         p,
         p.stride(0),
@@ -125,18 +136,18 @@ def tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_
         shift_labels if has_label else torch.empty(1, device=p.device),
         ignore_index,
         V,
         BLOCK_SIZE=BLOCK_SIZE,
         HAS_LABEL=has_label,
         num_warps=num_warps,
         reduction=reduction,
     )
-    if reduction == _REDUCTION_MODE_BATCHMEAN.value:
-        return output_tensor.sum() / n_non_ignore, grads / n_non_ignore
     elif reduction == _REDUCTION_MODE_SUM.value:
         return output_tensor.sum(dim=0), grads
-    elif reduction == _REDUCTION_MODE_MEAN.value:
-        return output_tensor.sum() / (n_non_ignore * V), grads / (n_non_ignore * V)
     else:
         return output_tensor, grads
@@ -204,4 +215,4 @@ class LigerTVDLossFunction(torch.autograd.Function):
         (grads,) = ctx.saved_tensors
         grads = tvd_backward_triton(grad_output, grads)
-        return grads, None, None, None, None

     label_ptr,
     ignore_index: tl.constexpr,
     n_cols,
+    scale,  # pre-computed reduction scale for gradients (fused into kernel)
     BLOCK_SIZE: tl.constexpr,
     HAS_LABEL: tl.constexpr,
     reduction: tl.constexpr = _REDUCTION_MODE_BATCHMEAN,
         # TVD(P || Q) = 0.5 * |P - Q|
         tv_loss = 0.5 * tl.abs(p - q)
+        # Fuse reduction scaling into gradient computation (eliminates separate Python division)
+        grad_res = tl.where(p > q, 0.5 * scale, -0.5 * scale)
         tl.store(grads_ptr + offsets, grad_res, mask=mask)
             loss_sum += tl.sum(tv_loss, axis=0)
     if reduction != _REDUCTION_MODE_NONE:
+        # Fuse reduction scaling into loss (same scale as gradients; avoids Python division)
+        tl.store(loss_ptr, loss_sum * scale)
 def tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label):
     n_non_ignore = (shift_labels != ignore_index).sum().item() if has_label else BT
+    # Pre-compute gradient scale factor (fused into kernel to avoid separate division)
+    if reduction == _REDUCTION_MODE_BATCHMEAN.value:
+        scale = 1.0 / n_non_ignore
+    elif reduction == _REDUCTION_MODE_MEAN.value:
+        scale = 1.0 / (n_non_ignore * V)
+    else:
+        scale = 1.0
     _tv_distance_kernel[grid](
         p,
         p.stride(0),
         shift_labels if has_label else torch.empty(1, device=p.device),
         ignore_index,
         V,
+        scale,
         BLOCK_SIZE=BLOCK_SIZE,
         HAS_LABEL=has_label,
         num_warps=num_warps,
         reduction=reduction,
     )
+    # Loss and gradients are already scaled inside the kernel — no separate division needed
+    if reduction in (_REDUCTION_MODE_BATCHMEAN.value, _REDUCTION_MODE_MEAN.value):
+        return output_tensor.sum(), grads
     elif reduction == _REDUCTION_MODE_SUM.value:
         return output_tensor.sum(dim=0), grads
     else:
         return output_tensor, grads
         (grads,) = ctx.saved_tensors
         grads = tvd_backward_triton(grad_output, grads)
+        return grads, None, None, None, None

build/torch-cuda/utils.py CHANGED Viewed

@@ -22,17 +22,33 @@ import triton.language as tl
 from packaging.version import Version
 def infer_device():
     """
     Get current device name based on available devices
     """
     if torch.cuda.is_available():  # Works for both Nvidia and AMD
         return "cuda"
     elif torch.xpu.is_available():
         return "xpu"
     else:
         return "cpu"
 def is_hip() -> bool:
     """Return True when ``torch.version.hip`` is set (not None), False otherwise."""
     hip_version = torch.version.hip
     return hip_version is not None
@@ -86,6 +102,8 @@ def get_amp_custom_fwd_bwd() -> Callable:
             functools.partial(torch.amp.custom_fwd, device_type=device),
             functools.partial(torch.amp.custom_bwd, device_type=device),
         )
     return torch.cuda.amp.custom_fwd, torch.cuda.amp.custom_bwd
@@ -132,4 +150,27 @@ def element_mul_kernel(
     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(X_ptr + X_offsets, mask=X_offsets < n_cols)
-        tl.store(X_ptr + X_offsets, X_block * grad_output, mask=X_offsets < n_cols)

 from packaging.version import Version
+def is_npu_available() -> bool:
+    """Detect Ascend NPU availability."""
+    try:
+        from transformers.utils import is_torch_npu_available
+        return is_torch_npu_available()
+    except Exception:
+        return False
 def infer_device():
     """
     Get current device name based on available devices
     """
     if torch.cuda.is_available():  # Works for both Nvidia and AMD
         return "cuda"
+    # Use Ascend NPU if available (torch.npu)
+    elif is_npu_available():
+        return "npu"
+    # XPU (Intel) if available
     elif torch.xpu.is_available():
         return "xpu"
     else:
         return "cpu"
 def is_hip() -> bool:
     """Return True when ``torch.version.hip`` is set (not None), False otherwise."""
     hip_version = torch.version.hip
     return hip_version is not None
             functools.partial(torch.amp.custom_fwd, device_type=device),
             functools.partial(torch.amp.custom_bwd, device_type=device),
         )
+    if hasattr(torch, "npu") and getattr(torch.npu, "amp", None) is not None:
+        return torch.npu.amp.custom_fwd, torch.npu.amp.custom_bwd
     return torch.cuda.amp.custom_fwd, torch.cuda.amp.custom_bwd
     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(X_ptr + X_offsets, mask=X_offsets < n_cols)
+        tl.store(X_ptr + X_offsets, X_block * grad_output, mask=X_offsets < n_cols)
+def get_npu_core_count(default: int = 20) -> int:
+    """Return NPU vector core count.
+    Fallback to `default` if Triton runtime or NPU device is unavailable.
+    """
+    try:
+        utils = triton.runtime.driver.active.utils
+        props = utils.get_device_properties(0)
+        return int(props.get("num_vectorcore", default))
+    except Exception:
+        return default
+def set_large_grf_mode(kernel_args: dict):
+    """Set large GRF mode for XPU devices."""
+    # On XPU triton installed along with pytorch-xpu will be called `pytorch-triton-xpu`,
+    # triton XPU installed from source will be called `triton`.
+    if compare_version("pytorch-triton-xpu", operator.ge, "3.6.0") or compare_version("triton", operator.ge, "3.6.0"):
+        kernel_args["grf_mode"] = "256"
+    else:
+        # API was changed in https://github.com/intel/intel-xpu-backend-for-triton/pull/5430
+        kernel_args["grf_mode"] = "large"