wangzihan99 committed
Commit 89a2cd3 · 1 parent: 5bfdae9

Add fused ApplyRoPE and RMSNorm kernels written in OpenAI Triton.

Files changed (4):
  1. config.json +1 -0
  2. configuration_qwen.py +2 -0
  3. modeling_qwen.py +35 -2
  4. triton_kernels.py +147 -0
config.json CHANGED
@@ -44,6 +44,7 @@
   "use_cache": true,
   "use_dynamic_ntk": true,
   "use_flash_attn": "auto",
+  "use_triton": "auto",
   "use_logn_attn": true,
   "vocab_size": 151936
 }
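
The new `use_triton` key mirrors `use_flash_attn`: "auto" defers the decision to load time (resolved in modeling_qwen.py below), while an explicit boolean forces one path. A minimal sketch of overriding it at load time, assuming the usual trust_remote_code flow in which extra from_pretrained kwargs override config attributes; the checkpoint name is a placeholder:

    from transformers import AutoModelForCausalLM

    # use_triton=False overrides the "auto" written in config.json above.
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen-7B-Chat",     # placeholder checkpoint name
        trust_remote_code=True,  # required for this repo's custom modeling code
        use_triton=False,
    ).eval()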
configuration_qwen.py CHANGED
@@ -32,6 +32,7 @@ class QWenConfig(PretrainedConfig):
         use_dynamic_ntk=True,
         use_logn_attn=True,
         use_flash_attn="auto",
+        use_triton="auto",
         intermediate_size=22016,
         no_bias=True,
         tie_word_embeddings=False,
@@ -61,6 +62,7 @@ class QWenConfig(PretrainedConfig):
         self.use_dynamic_ntk = use_dynamic_ntk
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
+        self.use_triton = use_triton
         self.no_bias = no_bias
         self.use_cache_quantization = use_cache_quantization
         self.use_cache_kernel = use_cache_kernel
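
The flag is likewise available when building the config programmatically. A small sketch, assuming the repo root is importable:

    from configuration_qwen import QWenConfig

    # "auto" (the default) is resolved to a boolean in QWenLMHeadModel.__init__
    # via a PyTorch >= 2 check; pass True or False to bypass that check.
    config = QWenConfig(use_triton="auto")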
modeling_qwen.py CHANGED
@@ -36,7 +36,7 @@ except ImportError:
 from torch import nn
 
 SUPPORT_CUDA = torch.cuda.is_available()
-SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
+SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 8
 SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
 SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2
 
@@ -77,6 +77,7 @@ We detect you have activated flash attention support, but running model computat
 """
 
 apply_rotary_emb_func = None
+apply_rotary_emb_func_triton = None
 rms_norm = None
 flash_attn_unpadded_func = None
 
@@ -116,6 +117,30 @@ def _import_flash_attn():
             "https://github.com/Dao-AILab/flash-attention"
         )
 
+def _import_triton():
+    global apply_rotary_emb_func_triton, rms_norm
+    try:
+        from .triton_kernels import triton, apply_rotary_emb as __apply_rotary_emb, rms_norm as __rms_norm
+        if apply_rotary_emb_func is not None:
+            logger.warn(
+                "rotary kernel imported from flash_attn is replaced by Triton kernel."
+            )
+        apply_rotary_emb_func_triton = __apply_rotary_emb
+        if rms_norm is not None:
+            logger.warn(
+                "rms_norm kernel imported from flash_attn is replaced by Triton kernel."
+            )
+        rms_norm = __rms_norm
+    except ImportError:
+        logger.warn("Warning: Failed to import Triton kernels.")
+        return
+
+    if int(triton.__version__.split(".")[1]) == 0:
+        logger.warn(
+            "Triton 2.0 is detected in your environment. It is recommended that you upgrade to Triton 2.1 by "
+            "`pip install triton==2.1` for better performance if you do not use TorchInductor in PyTorch 2.0."
+        )
+
 def quantize_cache_v(fdata, bits, qmax, qmin):
     # b, s, head, h-dim->b, head, s, h-dim
     qtype = torch.uint8
 
@@ -1052,6 +1077,12 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         if config.use_flash_attn:
             _import_flash_attn()
 
+        if config.use_triton == "auto":
+            config.use_triton = SUPPORT_TORCH2
+        if config.use_triton:
+            logger.warn("Try importing Triton kernels for faster inference...")
+            _import_triton()
+
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
@@ -1412,7 +1443,9 @@ def _rotate_half(x):
 
 def apply_rotary_pos_emb(t, freqs):
     cos, sin = freqs
-    if apply_rotary_emb_func is not None and t.is_cuda:
+    if apply_rotary_emb_func_triton is not None and t.is_cuda:
+        return apply_rotary_emb_func_triton(t, cos, sin)
+    elif apply_rotary_emb_func is not None and t.is_cuda:
         t_ = t.float()
         cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
         sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
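
After this change apply_rotary_pos_emb dispatches in priority order: the Triton kernel, then the flash-attn rotary kernel, then the pure-PyTorch fallback. For reference, an eager sketch of what the Triton path computes (rope_reference is a hypothetical helper; it assumes a contiguous input and cos/sin laid out as (seq_len, head_dim) with the frequency halves duplicated, which is how the kernel in triton_kernels.py indexes them):

    import torch

    def rope_reference(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, num_heads, head_dim); cos/sin: (seq_len, head_dim)
        half = x.size(-1) // 2
        # Same rotation the kernel builds with rot_idx and tl.where: (-x2, x1)
        x_rot = torch.cat((-x[..., half:], x[..., :half]), dim=-1)
        return x * cos[:, None, :] + x_rot * sin[:, None, :]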
triton_kernels.py ADDED
@@ -0,0 +1,147 @@
+from functools import partial
+from typing import Any, Callable, Dict, Hashable, Tuple
+
+import torch
+import triton
+import triton.language as tl
+from torch.autograd.function import Function, FunctionCtx
+from triton.compiler import CompiledKernel
+from triton.runtime import KernelInterface
+from triton.runtime.autotuner import Autotuner
+
+try:
+    import triton.language.math as tlmath  # Triton 2.1
+except ImportError:
+    import triton.language.libdevice as tlmath  # Triton 2.0
+
+
+class TritonKernel:
+    def __init__(
+        self,
+        kernel_fn_: KernelInterface,
+        grid_fn: Callable[[Tuple[Any, ...]], Tuple[int, int, int]],
+    ) -> None:
+        self.kernel_fn_ = kernel_fn_
+        self.grid_fn_ = grid_fn
+        self.kernel_cache_: Dict[Hashable, CompiledKernel] = {}
+
+    def run(self, *args, **kwargs):
+        # Set current device
+        input_device = args[0].device
+        prev_dev_idx, cur_dev_idx = -1, torch.cuda.current_device()
+        if input_device.index != cur_dev_idx:
+            prev_dev_idx = cur_dev_idx
+            torch.cuda.set_device(input_device.index)
+
+        # Compute grid
+        grid = self.grid_fn_(args)
+
+        # Use cached kernel if possible
+        kernel_key = (input_device,)
+        if isinstance(self.kernel_fn_, Autotuner):
+            kernel_key += tuple(args[ki] for ki in self.kernel_fn_.key_idx)
+        else:
+            kernel_key += tuple(kwargs.items())
+        if kernel_key in self.kernel_cache_:
+            kernel = self.kernel_cache_[kernel_key]
+            kernel[grid](*args)
+            return
+
+        # Compile new kernel
+        if isinstance(self.kernel_fn_, Autotuner):
+            kernel = self.kernel_fn_[grid](*args)
+        else:
+            kernel = self.kernel_fn_[grid](*args, **kwargs)
+
+        # Store kernel
+        self.kernel_cache_[kernel_key] = kernel
+
+        # Restore previous device
+        torch.cuda.set_device(prev_dev_idx)
+
+
+@triton.jit
+def _apply_rope_fwd_kernel(X, Cos, Sin, Y, HEAD_DIM: tl.constexpr):
+    batch_idx, tok_idx, head_idx = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    seq_len, num_heads = tl.num_programs(1), tl.num_programs(2)
+    block_idx = tl.arange(0, HEAD_DIM)
+    x_base_idx = ((batch_idx * seq_len + tok_idx) * num_heads * 3 + head_idx) * HEAD_DIM
+    x = tl.load(X + x_base_idx + block_idx)
+    freq_idx = tok_idx * HEAD_DIM + block_idx
+    cos = tl.load(Cos + freq_idx)
+    rot_idx = (HEAD_DIM // 2 + block_idx) % HEAD_DIM
+    x_rot = tl.load(X + x_base_idx + rot_idx)
+    x_rot = tl.where(block_idx >= HEAD_DIM // 2, x_rot, -x_rot)
+    sin = tl.load(Sin + freq_idx)
+    y_idx = (
+        (batch_idx * seq_len + tok_idx) * num_heads + head_idx
+    ) * HEAD_DIM + block_idx
+    y = x * cos + x_rot * sin
+    tl.store(Y + y_idx, y)
+
+
+apply_rope_fwd_kernel = TritonKernel(
+    _apply_rope_fwd_kernel, lambda args: tuple(args[0].shape[:3])
+)
+
+
+class ApplyRotaryEmb(Function):
+    @staticmethod
+    def forward(
+        ctx: FunctionCtx, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+    ):
+        y = torch.empty(x.shape, dtype=x.dtype, device=x.device)
+        apply_rope_fwd_kernel.run(x, cos, sin, y, HEAD_DIM=x.size(-1))
+        return y
+
+
+def apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
+    return ApplyRotaryEmb.apply(x, cos, sin)
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE": 4096}),
+        triton.Config({"BLOCK_SIZE": 2048}),
+        triton.Config({"BLOCK_SIZE": 1024}),
+    ],
+    key=[],
+)
+@triton.jit
+def _rms_norm_fwd_kernel(X, W, Y, eps, hidden_dim, BLOCK_SIZE: tl.constexpr):
+    tok_idx = tl.program_id(0)
+
+    mean_sq = tl.zeros([BLOCK_SIZE], tl.float32)
+    for offset in range(0, hidden_dim, BLOCK_SIZE):
+        dim_idx = offset + tl.arange(0, BLOCK_SIZE)
+        x = tl.load(
+            X + tok_idx * hidden_dim + dim_idx, mask=dim_idx < hidden_dim, other=0
+        ).to(tl.float32)
+        mean_sq += x * x / hidden_dim
+    rrms = tlmath.rsqrt(tl.sum(mean_sq, 0) + eps)
+
+    for offset in range(0, hidden_dim, BLOCK_SIZE):
+        dim_idx = offset + tl.arange(0, BLOCK_SIZE)
+        dim_mask = dim_idx < hidden_dim
+        hidden_idx = tok_idx * hidden_dim + dim_idx
+        x = tl.load(X + hidden_idx, mask=dim_mask, other=0)
+        w = tl.load(W + dim_idx, mask=dim_mask, other=0)
+        y = x * rrms * w
+        tl.store(Y + hidden_idx, y.to(Y.dtype.element_ty), mask=dim_mask)
+
+
+rms_norm_fwd_kernel = TritonKernel(
+    _rms_norm_fwd_kernel, lambda args: (args[0].shape[:-1].numel(), 1, 1)
+)
+
+
+class RMSNorm(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx: FunctionCtx, x: torch.Tensor, w: torch.Tensor, eps: float):
+        y = torch.empty_like(x)
+        rms_norm_fwd_kernel.run(x, w, y, eps, x.size(-1))
+        return y
+
+
+def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float):
+    return RMSNorm.apply(x, weight, eps)
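
Two properties of this file are easy to miss. Only forward is defined on both autograd Functions, so the kernels are inference-only; and _apply_rope_fwd_kernel computes x_base_idx with a num_heads * 3 stride, i.e. it appears to assume its input is a view into the packed QKV projection rather than an ordinary contiguous tensor. A minimal numerical check for the RMSNorm path against an eager reference (hypothetical test script; assumes a CUDA device and this file importable as triton_kernels):

    import torch
    from triton_kernels import rms_norm  # Triton wrapper defined above

    def rms_norm_reference(x, w, eps):
        # Mirror the kernel: mean of squares accumulated in fp32, then weight.
        x32 = x.float()
        rrms = torch.rsqrt(x32.pow(2).mean(dim=-1, keepdim=True) + eps)
        return (x32 * rrms * w.float()).to(x.dtype)

    x = torch.randn(2, 16, 4096, dtype=torch.float16, device="cuda")
    w = torch.randn(4096, dtype=torch.float16, device="cuda")
    print((rms_norm(x, w, 1e-6) - rms_norm_reference(x, w, 1e-6)).abs().max())
    # expect fp16-level error (on the order of 1e-3)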