zaydzuhri committed
Commit 3332857 (verified)
Parent: 05685b7

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. fla/models/abc/__pycache__/__init__.cpython-312.pyc +0 -0
  2. fla/models/bitnet/__pycache__/__init__.cpython-312.pyc +0 -0
  3. fla/models/bitnet/__pycache__/configuration_bitnet.cpython-312.pyc +0 -0
  4. fla/models/forgetting_transformer/__pycache__/__init__.cpython-312.pyc +0 -0
  5. fla/models/forgetting_transformer/__pycache__/configuration_forgetting_transformer.cpython-312.pyc +0 -0
  6. fla/models/forgetting_transformer/__pycache__/modeling_forgetting_transformer.cpython-312.pyc +0 -0
  7. fla/models/gated_deltanet/__pycache__/__init__.cpython-312.pyc +0 -0
  8. fla/models/gated_deltanet/__pycache__/configuration_gated_deltanet.cpython-312.pyc +0 -0
  9. fla/models/mamba2/__pycache__/__init__.cpython-312.pyc +0 -0
  10. fla/models/mamba2/__pycache__/configuration_mamba2.cpython-312.pyc +0 -0
  11. fla/models/rwkv6/__pycache__/__init__.cpython-312.pyc +0 -0
  12. fla/models/rwkv7/__pycache__/__init__.cpython-312.pyc +0 -0
  13. fla/models/samba/__pycache__/configuration_samba.cpython-312.pyc +0 -0
  14. fla/models/samba/__pycache__/modeling_samba.cpython-312.pyc +0 -0
  15. fla/models/transformer/__pycache__/configuration_transformer.cpython-312.pyc +0 -0
  16. fla/models/transformer/__pycache__/modeling_transformer.cpython-312.pyc +0 -0
  17. fla/models/transformer_mtp/__pycache__/__init__.cpython-312.pyc +0 -0
  18. fla/modules/__pycache__/__init__.cpython-312.pyc +0 -0
  19. fla/modules/__pycache__/activations.cpython-312.pyc +0 -0
  20. fla/modules/__pycache__/fused_cross_entropy.cpython-312.pyc +0 -0
  21. fla/modules/__pycache__/fused_linear_listnet_loss.cpython-312.pyc +0 -0
  22. fla/modules/__pycache__/layernorm.cpython-312.pyc +0 -0
  23. fla/modules/__pycache__/mlp.cpython-312.pyc +0 -0
  24. fla/ops/abc/__pycache__/__init__.cpython-312.pyc +0 -0
  25. fla/ops/attn/__init__.py +7 -0
  26. fla/ops/based/__pycache__/__init__.cpython-312.pyc +0 -0
  27. fla/ops/common/chunk_delta_h.py +399 -0
  28. fla/ops/common/chunk_h_parallel.py +650 -0
  29. fla/ops/delta_rule/__pycache__/chunk.cpython-312.pyc +0 -0
  30. fla/ops/delta_rule/__pycache__/fused_chunk.cpython-312.pyc +0 -0
  31. fla/ops/delta_rule/__pycache__/fused_recurrent.cpython-312.pyc +0 -0
  32. fla/ops/gated_delta_rule/__pycache__/__init__.cpython-312.pyc +0 -0
  33. fla/ops/generalized_delta_rule/dplr/__init__.py +7 -0
  34. fla/ops/generalized_delta_rule/dplr/__pycache__/__init__.cpython-312.pyc +0 -0
  35. fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_h_bwd.cpython-312.pyc +0 -0
  36. fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_h_fwd.cpython-312.pyc +0 -0
  37. fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_o_bwd.cpython-312.pyc +0 -0
  38. fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_o_fwd.cpython-312.pyc +0 -0
  39. fla/ops/generalized_delta_rule/dplr/__pycache__/fused_recurrent.cpython-312.pyc +0 -0
  40. fla/ops/generalized_delta_rule/dplr/chunk_A_fwd.py +324 -0
  41. fla/ops/generalized_delta_rule/dplr/chunk_h_bwd.py +196 -0
  42. fla/ops/generalized_delta_rule/dplr/chunk_h_fwd.py +197 -0
  43. fla/ops/generalized_delta_rule/dplr/chunk_o_fwd.py +138 -0
  44. fla/ops/generalized_delta_rule/dplr/wy_fast_bwd.py +184 -0
  45. fla/ops/generalized_delta_rule/dplr/wy_fast_fwd.py +318 -0
  46. fla/ops/generalized_delta_rule/iplr/__init__.py +7 -0
  47. fla/ops/generalized_delta_rule/iplr/__pycache__/chunk.cpython-312.pyc +0 -0
  48. fla/ops/generalized_delta_rule/iplr/chunk.py +528 -0
  49. fla/ops/generalized_delta_rule/iplr/naive.py +69 -0
  50. fla/ops/generalized_delta_rule/iplr/wy_fast.py +338 -0
fla/models/abc/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (653 Bytes).

fla/models/bitnet/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (678 Bytes).

fla/models/bitnet/__pycache__/configuration_bitnet.cpython-312.pyc ADDED
Binary file (2.36 kB).

fla/models/forgetting_transformer/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (813 Bytes).

fla/models/forgetting_transformer/__pycache__/configuration_forgetting_transformer.cpython-312.pyc ADDED
Binary file (2.49 kB).

fla/models/forgetting_transformer/__pycache__/modeling_forgetting_transformer.cpython-312.pyc ADDED
Binary file (17.2 kB).

fla/models/gated_deltanet/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (742 Bytes).

fla/models/gated_deltanet/__pycache__/configuration_gated_deltanet.cpython-312.pyc ADDED
Binary file (3.33 kB).

fla/models/mamba2/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (691 Bytes).

fla/models/mamba2/__pycache__/configuration_mamba2.cpython-312.pyc ADDED
Binary file (7.5 kB).

fla/models/rwkv6/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (683 Bytes).

fla/models/rwkv7/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (683 Bytes).

fla/models/samba/__pycache__/configuration_samba.cpython-312.pyc ADDED
Binary file (3.39 kB).

fla/models/samba/__pycache__/modeling_samba.cpython-312.pyc ADDED
Binary file (20.9 kB).

fla/models/transformer/__pycache__/configuration_transformer.cpython-312.pyc ADDED
Binary file (2.51 kB).

fla/models/transformer/__pycache__/modeling_transformer.cpython-312.pyc ADDED
Binary file (17.1 kB).

fla/models/transformer_mtp/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (791 Bytes).

fla/modules/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.34 kB).

fla/modules/__pycache__/activations.cpython-312.pyc ADDED
Binary file (23 kB).

fla/modules/__pycache__/fused_cross_entropy.cpython-312.pyc ADDED
Binary file (16 kB).

fla/modules/__pycache__/fused_linear_listnet_loss.cpython-312.pyc ADDED
Binary file (17.8 kB).

fla/modules/__pycache__/layernorm.cpython-312.pyc ADDED
Binary file (43.4 kB).

fla/modules/__pycache__/mlp.cpython-312.pyc ADDED
Binary file (6.23 kB).

fla/ops/abc/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (208 Bytes).
fla/ops/attn/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # -*- coding: utf-8 -*-
+
+ from .parallel import parallel_attn
+
+ __all__ = [
+     'parallel_attn'
+ ]
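The whole file is a package-level re-export, so downstream code can import the kernel directly:

    from fla.ops.attn import parallel_attn

(The parallel module it imports from is not among the 50 files shown in this view.)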
fla/ops/based/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (282 Bytes).

fla/ops/common/chunk_delta_h.py ADDED
@@ -0,0 +1,399 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ from typing import Optional, Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ from fla.ops.common.utils import prepare_chunk_offsets
+ from fla.ops.utils.op import exp
+ from fla.utils import check_shared_mem, is_nvidia_hopper, use_cuda_graph
+
+ NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
+
+
+ @triton.heuristics({
+     'USE_G': lambda args: args['g'] is not None,
+     'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
+     'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
+     'USE_OFFSETS': lambda args: args['offsets'] is not None,
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+         for num_warps in NUM_WARPS
+         for num_stages in [2, 3, 4]
+     ],
+     key=['H', 'K', 'V', 'BT', 'BK', 'BV', 'USE_G'],
+     use_cuda_graph=use_cuda_graph,
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_gated_delta_rule_fwd_kernel_h(
+     k,
+     v,
+     d,
+     v_new,
+     g,
+     h,
+     h0,
+     ht,
+     offsets,
+     chunk_offsets,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BC: tl.constexpr,
+     BK: tl.constexpr,
+     BV: tl.constexpr,
+     NT: tl.constexpr,
+     USE_G: tl.constexpr,
+     USE_INITIAL_STATE: tl.constexpr,
+     STORE_FINAL_STATE: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr,
+ ):
+     i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+     i_n, i_h = i_nh // H, i_nh % H
+     if USE_OFFSETS:
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+         NT = tl.cdiv(T, BT)
+         boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+     else:
+         bos, eos = i_n * T, i_n * T + T
+         NT = tl.cdiv(T, BT)
+         boh = i_n * NT
+
+     # [BK, BV]
+     b_h = tl.zeros([BK, BV], dtype=tl.float32)
+     if USE_INITIAL_STATE:
+         p_h0 = tl.make_block_ptr(h0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         b_h = tl.load(p_h0, boundary_check=(0, 1)).to(tl.float32)
+
+     for i_t in range(NT):
+         if HEAD_FIRST:
+             p_h = tl.make_block_ptr(h + (i_nh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         else:
+             p_h = tl.make_block_ptr(h + ((boh + i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1))
+         b_hc = tl.zeros([BK, BV], dtype=tl.float32)
+         if USE_G:
+             last_idx = min((i_t + 1) * BT, T) - 1
+             if HEAD_FIRST:
+                 b_g_last = tl.load(g + i_nh * T + last_idx)
+             else:
+                 b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
+         else:
+             b_g_last = None
+             last_idx = None
+         # keeping the whole [BK, *] slice of K in SRAM imposes a severe memory burden; sub-chunking alleviates it
+         for i_c in range(tl.cdiv(min(BT, T - i_t * BT), BC)):
+             if HEAD_FIRST:
+                 p_k = tl.make_block_ptr(k + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
+                 p_d = tl.make_block_ptr(d + i_nh * T*K, (T, K), (K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
+                 p_v = tl.make_block_ptr(v + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+                 p_v_new = tl.make_block_ptr(v_new + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+                 p_g = tl.make_block_ptr(g + i_nh * T, (T,), (1,), (i_t * BT + i_c * BC,), (BC,), (0,)) if USE_G else None
+             else:
+                 p_k = tl.make_block_ptr(k + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
+                 p_d = tl.make_block_ptr(d + (bos*H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
+                 p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+                 p_v_new = tl.make_block_ptr(v_new + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+                 p_g = tl.make_block_ptr(g + bos*H + i_h, (T,), (H,), (i_t * BT + i_c * BC,), (BC,), (0,)) if USE_G else None
+             b_g = tl.load(p_g, boundary_check=(0,)) if USE_G else None
+             # [BK, BC]
+             b_k = tl.load(p_k, boundary_check=(0, 1))
+             b_k = (b_k * exp(b_g_last - b_g)[None, :]).to(b_k.dtype) if USE_G else b_k
+             # [BC, BK]
+             b_d = tl.load(p_d, boundary_check=(0, 1))
+             b_d = (b_d * exp(b_g)[:, None]).to(b_d.dtype) if USE_G else b_d
+             # [BC, BV]
+             b_v = tl.load(p_v, boundary_check=(0, 1))
+             b_v2 = b_v - tl.dot(b_d, b_h.to(b_d.dtype))
+             # [BK, BV]
+             tl.store(p_v_new, b_v2.to(p_v_new.dtype.element_ty), boundary_check=(0, 1))
+             b_hc += tl.dot(b_k, b_v2.to(b_k.dtype), allow_tf32=False)
+         b_h *= exp(b_g_last) if USE_G else 1
+         b_h += b_hc
+
+     if STORE_FINAL_STATE:
+         p_ht = tl.make_block_ptr(ht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+ @triton.heuristics({
+     'USE_G': lambda args: args['g'] is not None,
+     'USE_INITIAL_STATE': lambda args: args['dh0'] is not None,
+     'USE_FINAL_STATE_GRADIENT': lambda args: args['dht'] is not None,
+     'USE_OFFSETS': lambda args: args['offsets'] is not None,
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+         for num_warps in NUM_WARPS
+         for num_stages in [2, 3, 4]
+     ],
+     key=['BT', 'BK', 'BV', 'USE_G'],
+     use_cuda_graph=use_cuda_graph,
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_gated_delta_rule_bwd_kernel_dhu(
+     q,
+     k,
+     d,
+     g,
+     dht,
+     dh0,
+     do,
+     dh,
+     dv,
+     dv2,
+     offsets,
+     chunk_offsets,
+     scale,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BC: tl.constexpr,
+     BK: tl.constexpr,
+     BV: tl.constexpr,
+     USE_G: tl.constexpr,
+     USE_INITIAL_STATE: tl.constexpr,
+     USE_FINAL_STATE_GRADIENT: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr
+ ):
+     i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+     i_n, i_h = i_nh // H, i_nh % H
+     if USE_OFFSETS:
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+         NT = tl.cdiv(T, BT)
+         boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+     else:
+         bos, eos = i_n * T, i_n * T + T
+         NT = tl.cdiv(T, BT)
+         boh = i_n * NT
+
+     # [BK, BV]
+     b_dh = tl.zeros([BK, BV], dtype=tl.float32)
+     if USE_FINAL_STATE_GRADIENT:
+         p_dht = tl.make_block_ptr(dht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         b_dh += tl.load(p_dht, boundary_check=(0, 1))
+
+     for i_t in range(NT - 1, -1, -1):
+         if HEAD_FIRST:
+             p_dh = tl.make_block_ptr(dh + (i_nh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         else:
+             p_dh = tl.make_block_ptr(dh + ((boh + i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1))
+         b_dh_tmp = tl.zeros([BK, BV], dtype=tl.float32)
+         if USE_G:
+             last_idx = min((i_t + 1) * BT, T) - 1
+             if HEAD_FIRST:
+                 bg_last = tl.load(g + i_nh * T + last_idx)
+             else:
+                 bg_last = tl.load(g + (bos + last_idx) * H + i_h)
+         else:
+             bg_last = None
+             last_idx = None
+         for i_c in range(tl.cdiv(BT, BC) - 1, -1, -1):
+             if HEAD_FIRST:
+                 p_q = tl.make_block_ptr(q + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
+                 p_k = tl.make_block_ptr(k + i_nh * T*K, (T, K), (K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
+                 p_d = tl.make_block_ptr(d + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
+                 p_dv = tl.make_block_ptr(dv + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+                 p_do = tl.make_block_ptr(do + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+                 p_g = tl.make_block_ptr(g + i_nh * T, (T,), (1,), (i_t * BT + i_c * BC,), (BC,), (0,)) if USE_G else None
+                 p_dv2 = tl.make_block_ptr(dv2 + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+             else:
+                 p_q = tl.make_block_ptr(q + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
+                 p_k = tl.make_block_ptr(k + (bos*H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
+                 p_d = tl.make_block_ptr(d + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
+                 p_dv = tl.make_block_ptr(dv + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+                 p_do = tl.make_block_ptr(do + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+                 p_g = tl.make_block_ptr(g + bos*H + i_h, (T,), (H,), (i_t * BT + i_c * BC,), (BC,), (0,)) if USE_G else None
+                 p_dv2 = tl.make_block_ptr(dv2 + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
+             b_g = tl.load(p_g, boundary_check=(0,)) if USE_G else None
+             # [BK, BT]
+             b_q = tl.load(p_q, boundary_check=(0, 1))
+             b_q = (b_q * scale * exp(b_g)[None, :]).to(b_q.dtype) if USE_G else (b_q * scale).to(b_q.dtype)
+             # [BT, BK]
+             b_k = tl.load(p_k, boundary_check=(0, 1))
+             b_d = tl.load(p_d, boundary_check=(0, 1))
+             b_k = (b_k * exp(bg_last - b_g)[:, None]).to(b_k.dtype) if USE_G else b_k
+             b_d = (b_d * exp(b_g)[None, :]).to(b_d.dtype) if USE_G else b_d
+             # [BT, V]
+             b_do = tl.load(p_do, boundary_check=(0, 1))
+             b_dv = tl.load(p_dv, boundary_check=(0, 1))
+             b_dv2 = b_dv + tl.dot(b_k, b_dh.to(b_k.dtype), allow_tf32=False)
+             tl.store(p_dv2, b_dv2.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
+             # [BK, BV]
+             b_dh_tmp += tl.dot(b_q, b_do.to(b_q.dtype), allow_tf32=False)
+             b_dh_tmp -= tl.dot(b_d, b_dv2.to(b_q.dtype), allow_tf32=False)
+         b_dh *= exp(bg_last) if USE_G else 1
+         b_dh += b_dh_tmp
+
+     if USE_INITIAL_STATE:
+         p_dh0 = tl.make_block_ptr(dh0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), boundary_check=(0, 1))
+
+
+ def chunk_gated_delta_rule_fwd_h(
+     k: torch.Tensor,
+     w: torch.Tensor,
+     u: torch.Tensor,
+     g: Optional[torch.Tensor] = None,
+     initial_state: Optional[torch.Tensor] = None,
+     output_final_state: bool = False,
+     offsets: Optional[torch.LongTensor] = None,
+     indices: Optional[torch.LongTensor] = None,
+     head_first: bool = True,
+     chunk_size: int = 64
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     if head_first:
+         B, H, T, K, V = *k.shape, u.shape[-1]
+     else:
+         B, T, H, K, V = *k.shape, u.shape[-1]
+     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+     # N: the actual number of sequences in the batch with either equal or variable lengths
+     if offsets is None:
+         N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+     else:
+         N, NT, chunk_offsets = len(offsets) - 1, len(indices), prepare_chunk_offsets(offsets, BT)
+     BK = triton.next_power_of_2(K)
+     assert BK <= 256, "current kernel does not support head dimensions larger than 256."
+     # H100 can use a larger block size
+     if check_shared_mem('hopper', k.device.index):
+         BV = 64
+         BC = 64 if K <= 128 else 32
+     # A100
+     elif check_shared_mem('ampere', k.device.index):
+         BV = 32
+         BC = 64
+     else:
+         BV = 32
+         BC = 32 if K <= 128 else 16
+     BC = min(BT, BC)
+     NK = triton.cdiv(K, BK)
+     NV = triton.cdiv(V, BV)
+     assert NK == 1, 'NK > 1 is not supported because it involves time-consuming synchronization'
+
+     if head_first:
+         h = k.new_empty(B, H, NT, K, V)
+     else:
+         h = k.new_empty(B, NT, H, K, V)
+     final_state = k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
+
+     v_new = torch.empty_like(u)
+     grid = (NK, NV, N * H)
+
+     chunk_gated_delta_rule_fwd_kernel_h[grid](
+         k=k,
+         v=u,
+         d=w,
+         v_new=v_new,
+         g=g,
+         h=h,
+         h0=initial_state,
+         ht=final_state,
+         offsets=offsets,
+         chunk_offsets=chunk_offsets,
+         T=T,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         BC=BC,
+         BK=BK,
+         BV=BV,
+         NT=NT,
+         HEAD_FIRST=head_first
+     )
+     return h, v_new, final_state
+
+
+ def chunk_gated_delta_rule_bwd_dhu(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     w: torch.Tensor,
+     g: torch.Tensor,
+     h0: torch.Tensor,
+     dht: Optional[torch.Tensor],
+     do: torch.Tensor,
+     dv: torch.Tensor,
+     scale: float,
+     offsets: Optional[torch.LongTensor] = None,
+     indices: Optional[torch.LongTensor] = None,
+     head_first: bool = True,
+     chunk_size: int = 64
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     if head_first:
+         B, H, T, K, V = *q.shape, do.shape[-1]
+     else:
+         B, T, H, K, V = *q.shape, do.shape[-1]
+     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+     # N: the actual number of sequences in the batch with either equal or variable lengths
+     if offsets is None:
+         N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+     else:
+         N, NT, chunk_offsets = len(offsets) - 1, len(indices), prepare_chunk_offsets(offsets, BT)
+
+     BK = triton.next_power_of_2(K)
+     assert BK <= 256, "current kernel does not support head dimensions larger than 256."
+
+     # H100
+     if check_shared_mem('hopper', q.device.index):
+         BV = 64
+         BC = 64 if K <= 128 else 32
+     # A100
+     elif check_shared_mem('ampere', q.device.index):
+         BV = 32
+         BC = 64 if K <= 128 else 32
+     else:
+         BV = 32 if K <= 128 else 16
+         BC = 16
+
+     BC = min(BT, BC)
+     NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+     assert NK == 1, 'NK > 1 is not supported because it involves time-consuming synchronization'
+
+     if head_first:
+         dh = q.new_empty(B, H, NT, K, V)
+     else:
+         dh = q.new_empty(B, NT, H, K, V)
+     dh0 = torch.empty_like(h0, dtype=torch.float32) if h0 is not None else None
+     dv2 = torch.empty_like(dv)
+
+     grid = (NK, NV, N * H)
+     chunk_gated_delta_rule_bwd_kernel_dhu[grid](
+         q=q,
+         k=k,
+         d=w,
+         g=g,
+         dht=dht,
+         dh0=dh0,
+         do=do,
+         dh=dh,
+         dv=dv,
+         dv2=dv2,
+         offsets=offsets,
+         chunk_offsets=chunk_offsets,
+         scale=scale,
+         T=T,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         BC=BC,
+         BK=BK,
+         BV=BV,
+         HEAD_FIRST=head_first
+     )
+     return dh, dh0, dv2
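For orientation, here is a minimal smoke-test sketch for the forward helper added above. The shapes follow the head_first=True branch of the wrapper; the device, dtypes, and the log-sigmoid decay initialization are illustrative assumptions, not part of this commit:

    # Sketch only: exercise chunk_gated_delta_rule_fwd_h with random inputs.
    import torch
    from fla.ops.common.chunk_delta_h import chunk_gated_delta_rule_fwd_h

    B, H, T, K, V = 2, 4, 256, 64, 64
    k = torch.randn(B, H, T, K, device='cuda', dtype=torch.bfloat16)
    w = torch.randn(B, H, T, K, device='cuda', dtype=torch.bfloat16)  # passed to the kernel as d
    u = torch.randn(B, H, T, V, device='cuda', dtype=torch.bfloat16)
    g = torch.randn(B, H, T, device='cuda').sigmoid().log()  # log-space decay, (B, H, T) layout assumed

    h, v_new, final_state = chunk_gated_delta_rule_fwd_h(
        k=k, w=w, u=u, g=g,
        output_final_state=True,
        head_first=True,
        chunk_size=64,
    )
    # h: (B, H, NT, K, V) per-chunk states; v_new: (B, H, T, V); final_state: (B, H, K, V) in fp32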
fla/ops/common/chunk_h_parallel.py ADDED
@@ -0,0 +1,650 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ """
+ Fully parallelized state passing.
+ """
+
+ from typing import Optional, Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ from fla.ops.utils.op import exp
+
+
+ @triton.heuristics({
+     'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
+     'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
+         for BK in [32, 64, 128]
+         for BV in [32, 64, 128]
+         for num_warps in [2, 4, 8]
+         for num_stages in [2, 3, 4]
+     ],
+     key=['BT', 'USE_G', 'USE_GK', 'USE_GV']
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_fwd_kernel_h_parallel(
+     k,
+     v,
+     h,
+     g,
+     gk,
+     gv,
+     h0,
+     ht,
+     offsets,
+     indices,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BK: tl.constexpr,
+     BV: tl.constexpr,
+     USE_G: tl.constexpr,
+     USE_GK: tl.constexpr,
+     USE_GV: tl.constexpr,
+     USE_INITIAL_STATE: tl.constexpr,
+     STORE_FINAL_STATE: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr
+ ):
+     i_kv, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+
+     NV = tl.cdiv(V, BV)
+     # i_b: batch index
+     # i_h: head index
+     # i_n: sequence index
+     # i_t: chunk index within the current sequence
+     # i_tg: (global) chunk index across all sequences
+     i_k, i_v = i_kv // NV, i_kv % NV
+     i_b, i_h = i_bh // H, i_bh % H
+     if USE_OFFSETS:
+         i_tg = i_t
+         i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+         NT = tl.cdiv(T, BT)
+     else:
+         bos, eos = i_b * T, i_b * T + T
+         NT = tl.cdiv(T, BT)
+         i_n, i_tg = i_b, i_b * NT + i_t
+     i_nh = i_n * H + i_h
+
+     if HEAD_FIRST:
+         p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+         p_h = tl.make_block_ptr(h + (i_bh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+     else:
+         p_k = tl.make_block_ptr(k + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+         p_h = tl.make_block_ptr(h + (i_tg * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+
+     if i_t == 0:
+         if USE_INITIAL_STATE:
+             p_h0 = tl.make_block_ptr(h0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+             b_h = tl.load(p_h0, boundary_check=(0, 1)).to(tl.float32)
+         else:
+             b_h = tl.zeros([BK, BV], dtype=tl.float32)
+         tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1))
+
+     # [BK, BT]
+     b_k = tl.load(p_k, boundary_check=(0, 1))
+     # [BT, BV]
+     b_v = tl.load(p_v, boundary_check=(0, 1))
+
+     last_idx = min(i_t * BT + BT, T) - 1
+     # scalar decay
+     if USE_G:
+         if HEAD_FIRST:
+             b_g_last = tl.load(g + i_bh * T + last_idx)
+             p_g = g + i_bh * T + i_t * BT + tl.arange(0, BT)
+             p_g = tl.max_contiguous(tl.multiple_of(p_g, BT), BT)
+         else:
+             b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
+             p_g = g + bos*H + (i_t * BT + tl.arange(0, BT)) * H + i_h
+         b_g = tl.load(p_g, mask=(i_t * BT + tl.arange(0, BT) < T), other=0.)
+         b_v = (b_v * exp(b_g_last - b_g)[:, None]).to(b_v.dtype)
+
+     # vector decay, h = Diag(gk) @ h
+     if USE_GK:
+         if HEAD_FIRST:
+             p_gk = tl.make_block_ptr(gk + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+             p_gk_last = gk + i_bh * T*K + last_idx * K + i_k * BK + tl.arange(0, BK)
+             p_gk_last = tl.max_contiguous(tl.multiple_of(p_gk_last, BK), BK)
+         else:
+             p_gk = tl.make_block_ptr(gk + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+             p_gk_last = gk + (bos + last_idx) * H*K + i_h * K + i_k * BK + tl.arange(0, BK)
+
+         b_gk_last = tl.load(p_gk_last, mask=(i_k * BK + tl.arange(0, BK) < K), other=0.)
+
+         b_gk = tl.load(p_gk, boundary_check=(0, 1))
+         b_k = (b_k * exp(b_gk_last[:, None] - b_gk)).to(b_k.dtype)
+
+     # vector decay, h = h @ Diag(gv)
+     if USE_GV:
+         if HEAD_FIRST:
+             p_gv = tl.make_block_ptr(gv + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+             p_gv_last = gv + i_bh * T*V + last_idx * V + i_v * BV + tl.arange(0, BV)
+             p_gv_last = tl.max_contiguous(tl.multiple_of(p_gv_last, BV), BV)
+         else:
+             p_gv = tl.make_block_ptr(gv + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+             p_gv_last = gv + (bos + last_idx) * H*V + i_h * V + i_v * BV + tl.arange(0, BV)
+
+         b_gv_last = tl.load(p_gv_last, mask=(i_v * BV + tl.arange(0, BV) < V), other=0.)
+
+         b_gv = tl.load(p_gv, boundary_check=(0, 1))
+         b_v = (b_v * exp(b_gv_last[None, :] - b_gv)).to(b_v.dtype)
+
+     b_h = tl.dot(b_k, b_v)
+     if i_t < NT - 1:
+         if HEAD_FIRST:
+             p_h = tl.make_block_ptr(h + (i_bh * NT + i_t + 1) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         else:
+             p_h = tl.make_block_ptr(h + ((i_tg + 1) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1))
+     elif STORE_FINAL_STATE:
+         p_ht = tl.make_block_ptr(ht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+ @triton.heuristics({
+     'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
+         for BK in [32, 64, 128]
+         for BV in [32, 64, 128]
+         for num_warps in [2, 4, 8, 16]
+         for num_stages in [2, 3]
+     ],
+     key=['BT', 'USE_G', 'USE_GK', 'USE_GV']
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_fwd_kernel_h_reduction(
+     h,
+     g,
+     gk,
+     gv,
+     kvt,
+     ht,
+     offsets,
+     chunk_offsets,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BK: tl.constexpr,
+     BV: tl.constexpr,
+     USE_G: tl.constexpr,
+     USE_GK: tl.constexpr,
+     USE_GV: tl.constexpr,
+     STORE_FINAL_STATE: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr
+ ):
+     i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+     i_n, i_h = i_nh // H, i_nh % H
+     if USE_OFFSETS:
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+         NT = tl.cdiv(T, BT)
+         boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+     else:
+         bos, eos = i_n * T, i_n * T + T
+         NT = tl.cdiv(T, BT)
+         boh = i_n * NT
+
+     # [BK, BV]
+     b_h = tl.zeros([BK, BV], dtype=tl.float32)
+     for i_t in range(NT):
+         if HEAD_FIRST:
+             p_h = tl.make_block_ptr(h + (i_nh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         else:
+             p_h = tl.make_block_ptr(h + ((boh + i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         b_h += tl.load(p_h, boundary_check=(0, 1)).to(tl.float32)
+         if i_t > 0:
+             tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1))
+
+         last_idx = min(i_t * BT + BT, T) - 1
+         # scalar decay
+         if USE_G:
+             if HEAD_FIRST:
+                 b_g_last = tl.load(g + i_nh * T + last_idx)
+             else:
+                 b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
+             b_h *= exp(b_g_last)
+
+         # vector decay, h = Diag(gk) @ h
+         if USE_GK:
+             if HEAD_FIRST:
+                 p_gk_last = gk + i_nh * T*K + last_idx * K + i_k * BK + tl.arange(0, BK)
+                 p_gk_last = tl.max_contiguous(tl.multiple_of(p_gk_last, BK), BK)
+             else:
+                 p_gk_last = gk + (bos + last_idx) * H*K + i_h * K + i_k * BK + tl.arange(0, BK)
+
+             b_gk_last = tl.load(p_gk_last, mask=(i_k * BK + tl.arange(0, BK) < K), other=0.)
+             b_h *= exp(b_gk_last)[:, None]
+
+         # vector decay, h = h @ Diag(gv)
+         if USE_GV:
+             if HEAD_FIRST:
+                 p_gv_last = gv + i_nh * T*V + last_idx * V + i_v * BV + tl.arange(0, BV)
+                 p_gv_last = tl.max_contiguous(tl.multiple_of(p_gv_last, BV), BV)
+             else:
+                 p_gv_last = gv + (bos + last_idx) * H*V + i_h * V + i_v * BV + tl.arange(0, BV)
+
+             b_gv_last = tl.load(p_gv_last, mask=(i_v * BV + tl.arange(0, BV) < V), other=0.)
+             b_h *= exp(b_gv_last)[None, :]
+
+     if STORE_FINAL_STATE:
+         p_kvt = tl.make_block_ptr(kvt + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         p_ht = tl.make_block_ptr(ht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         b_h += tl.load(p_kvt, boundary_check=(0, 1)).to(tl.float32)
+         tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+ @triton.heuristics({
+     'STORE_INITIAL_STATE_GRADIENT': lambda args: args['dh0'] is not None,
+     'USE_FINAL_STATE_GRADIENT': lambda args: args['dht'] is not None,
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
+         for BK in [32, 64, 128]
+         for BV in [32, 64, 128]
+         for num_warps in [2, 4, 8]
+         for num_stages in [2, 3, 4]
+     ],
+     key=['BT', 'USE_G', 'USE_GK', 'USE_GV']
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_bwd_kernel_dh_parallel(
+     q,
+     g,
+     gk,
+     gv,
+     do,
+     dh,
+     dht,
+     dh0,
+     offsets,
+     indices,
+     scale,
+     T,
+     HQ: tl.constexpr,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BK: tl.constexpr,
+     BV: tl.constexpr,
+     NG: tl.constexpr,
+     USE_G: tl.constexpr,
+     USE_GK: tl.constexpr,
+     USE_GV: tl.constexpr,
+     STORE_INITIAL_STATE_GRADIENT: tl.constexpr,
+     USE_FINAL_STATE_GRADIENT: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr
+ ):
+     i_kv, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+
+     NV = tl.cdiv(V, BV)
+     i_k, i_v = i_kv // NV, i_kv % NV
+     i_b, i_hq, i_bg = i_bh // HQ, i_bh % HQ, i_bh // NG
+     i_h = i_hq // NG
+     if USE_OFFSETS:
+         i_tg = i_t
+         i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+         NT = tl.cdiv(T, BT)
+     else:
+         bos, eos = i_b * T, i_b * T + T
+         NT = tl.cdiv(T, BT)
+         i_n, i_tg = i_b, i_b * NT + i_t
+     i_nh = i_n * HQ + i_hq
+
+     if HEAD_FIRST:
+         p_q = tl.make_block_ptr(q + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+         p_dh = tl.make_block_ptr(dh + (i_bh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+     else:
+         p_q = tl.make_block_ptr(q + (bos*HQ + i_hq) * K, (K, T), (1, HQ*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         p_do = tl.make_block_ptr(do + (bos*HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+         p_dh = tl.make_block_ptr(dh + (i_tg * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+
+     if i_t == NT - 1:
+         if USE_FINAL_STATE_GRADIENT:
+             p_dht = tl.make_block_ptr(dht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+             b_dh = tl.load(p_dht, boundary_check=(0, 1)).to(tl.float32)
+         else:
+             b_dh = tl.zeros([BK, BV], dtype=tl.float32)
+         tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1))
+
+     # [BK, BT]
+     b_q = tl.load(p_q, boundary_check=(0, 1))
+     b_q = (b_q * scale).to(b_q.dtype)
+     # [BT, BV]
+     b_do = tl.load(p_do, boundary_check=(0, 1))
+
+     if USE_G:
+         if HEAD_FIRST:
+             p_g = g + i_bg * T + i_t * BT + tl.arange(0, BT)
+             p_g = tl.max_contiguous(tl.multiple_of(p_g, BT), BT)
+         else:
+             p_g = g + (bos + i_t * BT + tl.arange(0, BT)) * H + i_h
+         b_g = tl.load(p_g, mask=(i_t * BT + tl.arange(0, BT) < T), other=0.)
+         b_q = (b_q * exp(b_g)[None, :]).to(b_q.dtype)
+
+     if USE_GK:
+         if HEAD_FIRST:
+             p_gk = tl.make_block_ptr(gk + i_bg * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         else:
+             p_gk = tl.make_block_ptr(gk + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         b_gk = tl.load(p_gk, boundary_check=(0, 1))
+         b_q = (b_q * exp(b_gk)).to(b_q.dtype)
+
+     if USE_GV:
+         if HEAD_FIRST:
+             p_gv = tl.make_block_ptr(gv + i_bg * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+         else:
+             p_gv = tl.make_block_ptr(gv + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+         b_gv = tl.load(p_gv, boundary_check=(0, 1))
+         b_do = (b_do * exp(b_gv)).to(b_do.dtype)
+
+     b_dh = tl.dot(b_q, b_do)
+     if i_t > 0:
+         if HEAD_FIRST:
+             p_dh = tl.make_block_ptr(dh + (i_bh * NT + i_t - 1) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         else:
+             p_dh = tl.make_block_ptr(dh + ((i_tg - 1) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1))
+     elif STORE_INITIAL_STATE_GRADIENT:
+         p_dh0 = tl.make_block_ptr(dh0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), boundary_check=(0, 1))
+
+
+ @triton.heuristics({
+     'STORE_INITIAL_STATE_GRADIENT': lambda args: args['dh0'] is not None,
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
+         for BK in [32, 64, 128]
+         for BV in [32, 64, 128]
+         for num_warps in [2, 4, 8, 16]
+         for num_stages in [2, 3]
+     ],
+     key=['BT', 'USE_G', 'USE_GK', 'USE_GV']
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_bwd_kernel_dh_reduction(
+     g,
+     gk,
+     gv,
+     dh,
+     doq0,
+     dh0,
+     offsets,
+     chunk_offsets,
+     T,
+     HQ: tl.constexpr,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BK: tl.constexpr,
+     BV: tl.constexpr,
+     NG: tl.constexpr,
+     USE_G: tl.constexpr,
+     USE_GK: tl.constexpr,
+     USE_GV: tl.constexpr,
+     STORE_INITIAL_STATE_GRADIENT: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr
+ ):
+     i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+     i_bg = i_nh // NG
+     i_n, i_hq = i_nh // HQ, i_nh % HQ
+     i_h = i_hq // NG
+     if USE_OFFSETS:
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+         NT = tl.cdiv(T, BT)
+         boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+     else:
+         bos, eos = i_n * T, i_n * T + T
+         NT = tl.cdiv(T, BT)
+         boh = i_n * NT
+
+     # [BK, BV]
+     b_dh = tl.zeros([BK, BV], dtype=tl.float32)
+     for i_t in range(NT - 1, -1, -1):
+         if HEAD_FIRST:
+             p_dh = tl.make_block_ptr(dh + (i_nh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         else:
+             p_dh = tl.make_block_ptr(dh + ((boh + i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         b_dh += tl.load(p_dh, boundary_check=(0, 1)).to(tl.float32)
+         if i_t < NT - 1:
+             tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1))
+
+         last_idx = min(i_t * BT + BT, T) - 1
+         if USE_G:
+             if HEAD_FIRST:
+                 b_g_last = tl.load(g + i_bg * T + last_idx)
+             else:
+                 b_g_last = tl.load(g + (bos + last_idx) * H + i_h)
+             b_dh *= exp(b_g_last)
+
+         if USE_GK:
+             if HEAD_FIRST:
+                 p_gk_last = gk + (i_bg * T + last_idx) * K + i_k * BK + tl.arange(0, BK)
+                 p_gk_last = tl.max_contiguous(tl.multiple_of(p_gk_last, BK), BK)
+             else:
+                 p_gk_last = gk + (bos + last_idx) * H*K + i_h * K + i_k * BK + tl.arange(0, BK)
+
+             b_gk_last = tl.load(p_gk_last, mask=(i_k * BK + tl.arange(0, BK) < K), other=0.)
+             b_dh *= exp(b_gk_last)[:, None]
+
+         if USE_GV:
+             if HEAD_FIRST:
+                 p_gv_last = gv + (i_bg * T + last_idx) * V + i_v * BV + tl.arange(0, BV)
+                 p_gv_last = tl.max_contiguous(tl.multiple_of(p_gv_last, BV), BV)
+             else:
+                 p_gv_last = gv + (bos + last_idx) * H*V + i_h * V + i_v * BV + tl.arange(0, BV)
+
+             b_gv_last = tl.load(p_gv_last, mask=(i_v * BV + tl.arange(0, BV) < V), other=0.)
+             b_dh *= exp(b_gv_last)[None, :]
+
+     if STORE_INITIAL_STATE_GRADIENT:
+         p_doq0 = tl.make_block_ptr(doq0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         p_dh0 = tl.make_block_ptr(dh0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         b_dh += tl.load(p_doq0, boundary_check=(0, 1)).to(tl.float32)
+         tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), boundary_check=(0, 1))
+
+
+ def chunk_fwd_h(
+     k: torch.Tensor,
+     v: torch.Tensor,
+     g: torch.Tensor,
+     gk: torch.Tensor,
+     gv: torch.Tensor,
+     h0: torch.Tensor,
+     output_final_state: bool,
+     states_in_fp32: bool = False,
+     offsets: Optional[torch.Tensor] = None,
+     indices: Optional[torch.Tensor] = None,
+     head_first: bool = True,
+     chunk_size: int = 64
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     if head_first:
+         B, H, T, K, V = *k.shape, v.shape[-1]
+     else:
+         B, T, H, K, V = *k.shape, v.shape[-1]
+     BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+     # N: the actual number of sequences in the batch with either equal or variable lengths
+     if offsets is None:
+         N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+     else:
+         if indices is None:
+             indices = torch.cat([torch.arange(n) for n in triton.cdiv(offsets[1:] - offsets[:-1], BT).tolist()])
+             indices = torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(offsets)
+         N, NT = len(offsets) - 1, len(indices)
+         chunk_offsets = torch.cat([offsets.new_tensor([0]), triton.cdiv(offsets[1:] - offsets[:-1], BT)]).cumsum(-1)
+
+     h = k.new_empty(B, H, NT, K, V, dtype=torch.float) if head_first else k.new_empty(B, NT, H, K, V, dtype=torch.float)
+     ht = k.new_empty(N, H, K, V, dtype=torch.float) if output_final_state else None
+     def grid(meta): return (triton.cdiv(K, meta['BK']) * triton.cdiv(V, meta['BV']), NT, B * H)
+     chunk_fwd_kernel_h_parallel[grid](
+         k=k,
+         v=v,
+         h=h,
+         g=g,
+         gk=gk,
+         gv=gv,
+         h0=h0,
+         ht=ht,
+         offsets=offsets,
+         indices=indices,
+         T=T,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         USE_G=g is not None,
+         USE_GK=gk is not None,
+         USE_GV=gv is not None,
+         HEAD_FIRST=head_first
+     )
+     kvt, ht = ht, (torch.empty_like(ht) if output_final_state else None)
+     def grid(meta): return (triton.cdiv(K, meta['BK']), triton.cdiv(V, meta['BV']), N * H)
+     chunk_fwd_kernel_h_reduction[grid](
+         h=h,
+         g=g,
+         gk=gk,
+         gv=gv,
+         kvt=kvt,
+         ht=ht,
+         offsets=offsets,
+         chunk_offsets=chunk_offsets,
+         T=T,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         USE_G=g is not None,
+         USE_GK=gk is not None,
+         USE_GV=gv is not None,
+         HEAD_FIRST=head_first
+     )
+     h = h.to(k.dtype) if not states_in_fp32 else h
+     return h, ht
+
+
+ def chunk_bwd_dh(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     g: torch.Tensor,
+     gk: torch.Tensor,
+     gv: torch.Tensor,
+     do: torch.Tensor,
+     h0: torch.Tensor,
+     dht: torch.Tensor,
+     scale: float,
+     states_in_fp32: bool = False,
+     offsets: Optional[torch.Tensor] = None,
+     indices: Optional[torch.Tensor] = None,
+     head_first: bool = True,
+     chunk_size: int = 64
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     if head_first:
+         B, H, T, K, V = *k.shape, v.shape[-1]
+         HQ = q.shape[1]
+     else:
+         B, T, H, K, V = *k.shape, v.shape[-1]
+         HQ = q.shape[2]
+     BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+     # N: the actual number of sequences in the batch with either equal or variable lengths
+     # NG: number of groups in GQA
+     if offsets is None:
+         N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+     else:
+         if indices is None:
+             indices = torch.cat([torch.arange(n) for n in triton.cdiv(offsets[1:] - offsets[:-1], BT).tolist()])
+             indices = torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(offsets)
+         N, NT = len(offsets) - 1, len(indices)
+         chunk_offsets = torch.cat([offsets.new_tensor([0]), triton.cdiv(offsets[1:] - offsets[:-1], BT)]).cumsum(-1)
+     NG = HQ // H
+
+     if head_first:
+         dh = k.new_empty(B, HQ, NT, K, V, dtype=k.dtype if not states_in_fp32 else torch.float)
+     else:
+         dh = k.new_empty(B, NT, HQ, K, V, dtype=k.dtype if not states_in_fp32 else torch.float)
+     dh0 = torch.empty_like(h0, dtype=torch.float) if h0 is not None else None
+
+     def grid(meta): return (triton.cdiv(K, meta['BK']) * triton.cdiv(V, meta['BV']), NT, B * HQ)
+     chunk_bwd_kernel_dh_parallel[grid](
+         q=q,
+         g=g,
+         gk=gk,
+         gv=gv,
+         do=do,
+         dh=dh,
+         dht=dht,
+         dh0=dh0,
+         offsets=offsets,
+         indices=indices,
+         scale=scale,
+         T=T,
+         HQ=HQ,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         NG=NG,
+         USE_G=g is not None,
+         USE_GK=gk is not None,
+         USE_GV=gv is not None,
+         HEAD_FIRST=head_first
+     )
+
+     doq0, dh0 = dh0, (torch.empty_like(dh0) if dh0 is not None else None)
+     def grid(meta): return (triton.cdiv(K, meta['BK']), triton.cdiv(V, meta['BV']), N * HQ)
+     chunk_bwd_kernel_dh_reduction[grid](
+         g=g,
+         gk=gk,
+         gv=gv,
+         dh=dh,
+         doq0=doq0,
+         dh0=dh0,
+         offsets=offsets,
+         chunk_offsets=chunk_offsets,
+         T=T,
+         HQ=HQ,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         NG=NG,
+         USE_G=g is not None,
+         USE_GK=gk is not None,
+         USE_GV=gv is not None,
+         HEAD_FIRST=head_first
+     )
+     dh = dh.to(q.dtype) if not states_in_fp32 else dh
+     return dh, dh0
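The fully parallel state-passing helper can be exercised the same way (again a sketch under assumed shapes and dtypes; g is a scalar per-step decay, while gk/gv would be per-dimension decays):

    # Sketch only: chunk-level states via the parallel + reduction kernels.
    import torch
    from fla.ops.common.chunk_h_parallel import chunk_fwd_h

    B, H, T, K, V = 2, 4, 256, 64, 64
    k = torch.randn(B, H, T, K, device='cuda', dtype=torch.bfloat16)
    v = torch.randn(B, H, T, V, device='cuda', dtype=torch.bfloat16)
    g = torch.randn(B, H, T, device='cuda').sigmoid().log()  # scalar decay in log space

    h, ht = chunk_fwd_h(
        k=k, v=v, g=g, gk=None, gv=None, h0=None,
        output_final_state=True,
        head_first=True,
        chunk_size=64,
    )
    # h: (B, H, NT, K, V) states entering each chunk; ht: (B, H, K, V) final state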
fla/ops/delta_rule/__pycache__/chunk.cpython-312.pyc ADDED
Binary file (13.3 kB).

fla/ops/delta_rule/__pycache__/fused_chunk.cpython-312.pyc ADDED
Binary file (388 Bytes).

fla/ops/delta_rule/__pycache__/fused_recurrent.cpython-312.pyc ADDED
Binary file (34 kB).

fla/ops/gated_delta_rule/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (315 Bytes).

fla/ops/generalized_delta_rule/dplr/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .chunk import chunk_dplr_delta_rule
+ from .fused_recurrent import fused_recurrent_dplr_delta_rule
+
+ __all__ = [
+     'chunk_dplr_delta_rule',
+     'fused_recurrent_dplr_delta_rule'
+ ]
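As with fla.ops.attn above, this package file only fixes the public import path for the DPLR delta-rule kernels:

    from fla.ops.generalized_delta_rule.dplr import (
        chunk_dplr_delta_rule,
        fused_recurrent_dplr_delta_rule,
    )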
fla/ops/generalized_delta_rule/dplr/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (324 Bytes).

fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_h_bwd.cpython-312.pyc ADDED
Binary file (12.2 kB).

fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_h_fwd.cpython-312.pyc ADDED
Binary file (12.5 kB).

fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_o_bwd.cpython-312.pyc ADDED
Binary file (28 kB).

fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_o_fwd.cpython-312.pyc ADDED
Binary file (8.91 kB).

fla/ops/generalized_delta_rule/dplr/__pycache__/fused_recurrent.cpython-312.pyc ADDED
Binary file (14.5 kB).

fla/ops/generalized_delta_rule/dplr/chunk_A_fwd.py ADDED
@@ -0,0 +1,324 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ from typing import Optional
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ from fla.ops.utils.op import exp, gather
+ from fla.utils import is_gather_supported, use_cuda_graph
+
+
+ @triton.heuristics({
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({'BK': BK}, num_warps=num_warps, num_stages=num_stages)
+         for BK in [32, 64]
+         for num_warps in [2, 4, 8, 16]
+         for num_stages in [2, 3, 4]
+     ],
+     key=['BC', 'K'],
+     use_cuda_graph=use_cuda_graph,
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_dplr_fwd_A_kernel_intra_sub_inter(
+     q,
+     k,
+     a,
+     b,
+     gi,  # cumsum
+     ge,  # before cumsum
+     Aqk,
+     Aqb,
+     Aab,
+     Aak,
+     offsets,
+     indices,
+     scale: tl.constexpr,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     BT: tl.constexpr,
+     BC: tl.constexpr,
+     BK: tl.constexpr,
+     NC: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr,
+ ):
+     i_t, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+     i_b, i_h = i_bh // H, i_bh % H
+     i_i, i_j = i_c // NC, i_c % NC
+     if USE_OFFSETS:
+         i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+     else:
+         bos, eos = i_b * T, i_b * T + T
+
+     if i_t * BT + i_i * BC >= T:
+         return
+     if i_i <= i_j:
+         return
+
+     b_Aqk = tl.zeros([BC, BC], dtype=tl.float32)
+     b_Aqb = tl.zeros([BC, BC], dtype=tl.float32)
+     b_Aab = tl.zeros([BC, BC], dtype=tl.float32)
+     b_Aak = tl.zeros([BC, BC], dtype=tl.float32)
+     for i_k in range(tl.cdiv(K, BK)):
+         o_k = i_k * BK + tl.arange(0, BK)
+         m_k = o_k < K
+
+         if HEAD_FIRST:
+             p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+             p_a = tl.make_block_ptr(a + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+             p_gq_i = tl.make_block_ptr(gi + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+             p_gq_e = tl.make_block_ptr(ge + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+             p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1))
+             p_b = tl.make_block_ptr(b + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1))
+             p_gk = tl.make_block_ptr(gi + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1))
+             p_gn = tl.max_contiguous(tl.multiple_of(gi + (i_bh * T + i_t * BT + i_i * BC - 1) * K + o_k, BK), BK)
+         else:
+             p_q = tl.make_block_ptr(q + (bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+             p_a = tl.make_block_ptr(a + (bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+             p_gq_i = tl.make_block_ptr(gi + (bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+             p_gq_e = tl.make_block_ptr(ge + (bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0))
+             p_k = tl.make_block_ptr(k + (bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1))
+             p_b = tl.make_block_ptr(b + (bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1))
+             p_gk = tl.make_block_ptr(gi + (bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1))
+             p_gn = gi + (bos + i_t * BT + i_i * BC - 1) * H*K + i_h * K + o_k
+         # [BK,]
+         b_gn = tl.load(p_gn, mask=m_k, other=0).to(tl.float32)
+         # [BC, BK]
+         b_q = tl.load(p_q, boundary_check=(0, 1))
+         b_a = tl.load(p_a, boundary_check=(0, 1))
+         b_gq_i = tl.load(p_gq_i, boundary_check=(0, 1))
+         b_gq_e = tl.load(p_gq_e, boundary_check=(0, 1))
+         b_ag = b_a * exp(b_gq_e - b_gn[None, :])
+         b_qg = b_q * exp(b_gq_i - b_gn[None, :]) * scale
+         # [BK, BC]
+         b_k = tl.load(p_k, boundary_check=(0, 1))
+         b_b = tl.load(p_b, boundary_check=(0, 1))
+         b_gk = tl.load(p_gk, boundary_check=(0, 1)).to(tl.float32)
+         tmp = exp(b_gn[:, None] - b_gk)
+         b_kg = b_k * tmp
+         b_bg = b_b * tmp
+         # [BC, BC] using tf32 to improve precision here.
+         b_Aab += tl.dot(b_ag, b_bg)
+         b_Aak += tl.dot(b_ag, b_kg)
+         b_Aqk += tl.dot(b_qg, b_kg)
+         b_Aqb += tl.dot(b_qg, b_bg)
+
+     if HEAD_FIRST:
+         p_Aqk = tl.make_block_ptr(Aqk + i_bh*T*BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+         p_Aqb = tl.make_block_ptr(Aqb + i_bh*T*BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+         p_Aab = tl.make_block_ptr(Aab + i_bh*T*BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+         p_Aak = tl.make_block_ptr(Aak + i_bh*T*BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+     else:
+         p_Aqk = tl.make_block_ptr(Aqk + (bos*H+i_h)*BT, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+         p_Aqb = tl.make_block_ptr(Aqb + (bos*H+i_h)*BT, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+         p_Aab = tl.make_block_ptr(Aab + (bos*H+i_h)*BT, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+         p_Aak = tl.make_block_ptr(Aak + (bos*H+i_h)*BT, (T, BT), (H*BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0))
+     tl.store(p_Aqk, b_Aqk.to(Aqk.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
+     tl.store(p_Aqb, b_Aqb.to(Aqb.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
+     tl.store(p_Aab, b_Aab.to(Aab.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
+     tl.store(p_Aak, b_Aak.to(Aak.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
+
+
+ @triton.heuristics({
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+         for num_warps in [2, 4, 8, 16, 32]
+         for num_stages in [2, 3, 4]
+     ],
+     key=['BK', 'BT'],
+     use_cuda_graph=use_cuda_graph,
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_dplr_fwd_A_kernel_intra_sub_intra(
+     q,
+     k,
+     a,
+     b,
+     gi,
+     ge,
+     qg,
+     kg,
+     ag,
+     bg,
+     Aqk,
+     Aqb,
+     Aab,
+     Aak,
+     offsets,
+     indices,
+     scale: tl.constexpr,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     BT: tl.constexpr,
+     BC: tl.constexpr,
+     BK: tl.constexpr,
+     NC: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr,
+     GATHER_SUPPORTED: tl.constexpr
+ ):
+     i_t, i_i, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+     i_b, i_h = i_bh // H, i_bh % H
+     i_j = i_i
+     if USE_OFFSETS:
+         i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+     else:
+         bos, eos = i_b * T, i_b * T + T
+
+     if i_t * BT + i_i * BC >= T:
+         return
+
+     o_i = tl.arange(0, BC)
+     o_k = tl.arange(0, BK)
+     m_k = o_k < K
+     m_A = (i_t * BT + i_i * BC + tl.arange(0, BC)) < T
+     last_idx = min((i_t+1) * BT, T) - 1
+     if HEAD_FIRST:
+         o_A = i_bh * T*BT + (i_t * BT + i_i * BC + tl.arange(0, BC)) * BT + i_j * BC
+         p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_a = tl.make_block_ptr(a + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_b = tl.make_block_ptr(b + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_gi = tl.make_block_ptr(gi + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_ge = tl.make_block_ptr(ge + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_g_last = gi + i_bh * T*K + last_idx * K + tl.arange(0, BK)
+         b_g_last = tl.load(p_g_last, mask=m_k, other=0)
+
+         p_qg = tl.make_block_ptr(qg + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_kg = tl.make_block_ptr(kg + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_ag = tl.make_block_ptr(ag + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_bg = tl.make_block_ptr(bg + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+     else:
+         o_A = (bos + i_t * BT + i_i * BC + tl.arange(0, BC)) * H*BT + i_h * BT + i_j * BC
+         p_q = tl.make_block_ptr(q + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_a = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_b = tl.make_block_ptr(b + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_gi = tl.make_block_ptr(gi + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_ge = tl.make_block_ptr(ge + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_g_last = gi + (bos * H + i_h) * K + last_idx * H * K + tl.arange(0, BK)
+         b_g_last = tl.load(p_g_last, mask=m_k, other=0)
+         p_qg = tl.make_block_ptr(qg + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_kg = tl.make_block_ptr(kg + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_ag = tl.make_block_ptr(ag + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+         p_bg = tl.make_block_ptr(bg + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + i_i * BC, 0), (BC, BK), (1, 0))
+
+     b_q = tl.load(p_q, boundary_check=(0, 1))
+     b_q = b_q * scale
+     b_k = tl.load(p_k, boundary_check=(0, 1))
+     b_a = tl.load(p_a, boundary_check=(0, 1))
+     b_b = tl.load(p_b, boundary_check=(0, 1))
+     b_gi = tl.load(p_gi, boundary_check=(0, 1)).to(tl.float32)
+     b_ge = tl.load(p_ge, boundary_check=(0, 1)).to(tl.float32)
+
+     # deal with the decay term.
+     g_exp = exp(b_gi)
+     g_exp_inv = exp(-b_gi + b_g_last[None, :])
+     b_qg = b_q * g_exp
+     b_kg = b_k * g_exp_inv
+     b_bg = b_b * g_exp_inv
+     b_ag = b_a * exp(b_ge)
+     tl.store(p_qg, b_qg.to(p_qg.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
+     tl.store(p_bg, b_bg.to(p_bg.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
+     tl.store(p_ag, b_ag.to(p_ag.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
239
+ tl.store(p_kg, b_kg.to(p_kg.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
240
+ # tl.debug_barrier()
241
+
242
+ b_q = b_q.to(b_k.dtype)
243
+ # inner attn
244
+ for j in range(0, min(BC, T - i_t * BT - i_i * BC)):
245
+ # a trick to index the j-th row of b_k, b_gi, b_b
246
+ if GATHER_SUPPORTED:
247
+ row_idx = tl.full([1, BK], j, dtype=tl.int16)
248
+ # [1, BK]
249
+ b_k_j = gather(b_k, row_idx, axis=0)
250
+ b_gk_j = gather(b_gi, row_idx, axis=0)
251
+ b_b_j = gather(b_b, row_idx, axis=0)
252
+ else:
253
+ mask = tl.arange(0, BC) == j
254
+ b_k_j = tl.sum(tl.where(mask[:, None], b_k, 0), 0)[None, :]
255
+ b_gk_j = tl.sum(tl.where(mask[:, None], b_gi, 0), 0)[None, :]
256
+ b_b_j = tl.sum(tl.where(mask[:, None], b_b, 0), 0)[None, :]
258
+ tmp = exp(b_gi - b_gk_j)
259
+ b_A_qk = tl.sum(b_q * b_k_j * tmp, 1)
260
+ b_A_qk = tl.where(o_i >= j, b_A_qk, 0.)
261
+ b_A_qb = tl.sum(b_q * b_b_j * tmp, 1)
262
+ b_A_qb = tl.where(o_i >= j, b_A_qb, 0.)
263
+ tmp2 = exp(b_ge - b_gk_j)
264
+ b_A_ak = tl.sum(b_a * b_k_j * tmp2, 1)
265
+ b_A_ak = tl.where(o_i > j, b_A_ak, 0.)
266
+ b_A_ab = tl.sum(b_a * b_b_j * tmp2, 1)
267
+ b_A_ab = tl.where(o_i > j, b_A_ab, 0.)
268
+ tl.store(Aqk + o_A + j, b_A_qk.to(dtype=Aqk.dtype.element_ty, fp_downcast_rounding="rtne"), mask=m_A)
269
+ tl.store(Aqb + o_A + j, b_A_qb.to(dtype=Aqb.dtype.element_ty, fp_downcast_rounding="rtne"), mask=m_A)
270
+ tl.store(Aab + o_A + j, b_A_ab.to(dtype=Aqb.dtype.element_ty, fp_downcast_rounding="rtne"), mask=m_A)
271
+ tl.store(Aak + o_A + j, b_A_ak.to(dtype=Aqk.dtype.element_ty, fp_downcast_rounding="rtne"), mask=m_A)
272
+
273
+
274
+ def chunk_fwd_intra_dplr_fn(
275
+ q: torch.Tensor,
276
+ k: torch.Tensor,
277
+ a: torch.Tensor,
278
+ b: torch.Tensor,
279
+ gi: torch.Tensor,
280
+ ge: torch.Tensor,
281
+ scale: float,
282
+ chunk_size: int,
283
+ offsets: Optional[torch.LongTensor] = None,
284
+ indices: Optional[torch.LongTensor] = None,
285
+ head_first: bool = True,
286
+ ):
287
+ if head_first:
288
+ B, H, T, K = k.shape
289
+ else:
290
+ B, T, H, K = k.shape
291
+ BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
292
+ NT = triton.cdiv(T, BT) if offsets is None else len(indices)
293
+ BC = min(16, BT)
294
+ NC = triton.cdiv(BT, BC)
295
+
296
+ Aqk = q.new_empty(B, *((H, T) if head_first else (T, H)), BT, dtype=q.dtype)
297
+ Aqb = q.new_empty(B, *((H, T) if head_first else (T, H)), BT, dtype=q.dtype)
298
+ # these feed into a matrix inverse, so it's better to keep them in float here.
299
+ Aab = q.new_empty(B, *((H, T) if head_first else (T, H)), BT, dtype=torch.float)
300
+ Aak = q.new_empty(B, *((H, T) if head_first else (T, H)), BT, dtype=torch.float)
301
+ grid = (NT, NC * NC, B * H)
302
+
303
+ chunk_dplr_fwd_A_kernel_intra_sub_inter[grid](
304
+ q=q, k=k, a=a, b=b, gi=gi, ge=ge, Aqk=Aqk, Aqb=Aqb, Aab=Aab, Aak=Aak,
305
+ offsets=offsets, indices=indices,
306
+ scale=scale,
307
+ T=T, H=H, K=K, BT=BT, BC=BC, NC=NC,
308
+ HEAD_FIRST=head_first
309
+ )
310
+ grid = (NT, NC, B * H)
311
+ BK = triton.next_power_of_2(K)
312
+ qg = torch.empty_like(q)
313
+ kg = torch.empty_like(k, dtype=q.dtype)
314
+ ag = torch.empty_like(a, dtype=q.dtype)
315
+ bg = torch.empty_like(b, dtype=q.dtype)
316
+ chunk_dplr_fwd_A_kernel_intra_sub_intra[grid](
317
+ q=q, k=k, a=a, b=b, gi=gi, ge=ge, Aqk=Aqk, Aqb=Aqb, Aab=Aab, Aak=Aak,
318
+ qg=qg, kg=kg, ag=ag, bg=bg,
319
+ offsets=offsets, indices=indices,
320
+ scale=scale,
321
+ T=T, H=H, K=K, BT=BT, BC=BC, BK=BK, HEAD_FIRST=head_first, NC=NC,
322
+ GATHER_SUPPORTED=is_gather_supported
323
+ )
324
+ return Aab, Aqk, Aak, Aqb, qg, kg, ag, bg
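
For orientation, a minimal shape-level usage sketch of the wrapper above (hypothetical sizes; assumes CUDA tensors in the non-head-first layout; not part of the diff):

import torch

B, T, H, K = 2, 128, 4, 64
q, k, a, b = (torch.randn(B, T, H, K, device='cuda', dtype=torch.bfloat16) for _ in range(4))
# gi/ge carry log-decays, hence float and typically non-positive
gi, ge = (-torch.rand(B, T, H, K, device='cuda', dtype=torch.float) for _ in range(2))
Aab, Aqk, Aak, Aqb, qg, kg, ag, bg = chunk_fwd_intra_dplr_fn(
    q=q, k=k, a=a, b=b, gi=gi, ge=ge,
    scale=K ** -0.5, chunk_size=64, head_first=False,
)
# Aab/Aak come back as float (they later feed a matrix inverse); the rest keep q's dtype
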
fla/ops/generalized_delta_rule/dplr/chunk_h_bwd.py ADDED
@@ -0,0 +1,196 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from fla.ops.common.utils import prepare_chunk_offsets
11
+ from fla.ops.utils.op import exp
12
+ from fla.utils import check_shared_mem, use_cuda_graph
13
+
14
+
15
+ @triton.heuristics({
16
+ 'USE_FINAL_STATE_GRADIENT': lambda args: args['dht'] is not None,
17
+ 'USE_INITIAL_STATE': lambda args: args['dh0'] is not None,
18
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None,
19
+ })
20
+ @triton.autotune(
21
+ configs=[
22
+ triton.Config({}, num_warps=num_warps, num_stages=num_stages)
23
+ for num_warps in [2, 4, 8, 16, 32]
24
+ for num_stages in [2, 3, 4]
25
+ ],
26
+ key=['BT', 'BK', 'BV', "V"],
27
+ use_cuda_graph=use_cuda_graph,
28
+ )
29
+ @triton.jit(do_not_specialize=['T'])
30
+ def chunk_dplr_bwd_kernel_dhu(
31
+ qg,
32
+ bg,
33
+ w,
34
+ gk,
35
+ dht,
36
+ dh0,
37
+ do,
38
+ dh,
39
+ dv,
40
+ dv2,
41
+ offsets,
42
+ chunk_offsets,
43
+ T,
44
+ H: tl.constexpr,
45
+ K: tl.constexpr,
46
+ V: tl.constexpr,
47
+ BT: tl.constexpr,
48
+ BC: tl.constexpr,
49
+ BK: tl.constexpr,
50
+ BV: tl.constexpr,
51
+ USE_FINAL_STATE_GRADIENT: tl.constexpr,
52
+ USE_INITIAL_STATE: tl.constexpr,
53
+ USE_OFFSETS: tl.constexpr,
54
+ HEAD_FIRST: tl.constexpr
55
+ ):
56
+ i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
57
+ i_n, i_h = i_nh // H, i_nh % H
58
+ if USE_OFFSETS:
59
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
60
+ T = eos - bos
61
+ NT = tl.cdiv(T, BT)
62
+ boh = tl.load(chunk_offsets + i_n).to(tl.int32)
63
+ else:
64
+ bos, eos = i_n * T, i_n * T + T
65
+ NT = tl.cdiv(T, BT)
66
+ boh = i_n * NT
67
+
68
+ # [BK, BV]
69
+ b_dh = tl.zeros([BK, BV], dtype=tl.float32)
70
+ if USE_FINAL_STATE_GRADIENT:
71
+ p_dht = tl.make_block_ptr(dht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
72
+ b_dh += tl.load(p_dht, boundary_check=(0, 1))
73
+
74
+ mask_k = tl.arange(0, BK) < K
75
+ for i_t in range(NT - 1, -1, -1):
76
+ if HEAD_FIRST:
77
+ p_dh = tl.make_block_ptr(dh + (i_nh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
78
+ else:
79
+ p_dh = tl.make_block_ptr(dh + ((boh+i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
80
+ tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1))
81
+ b_dh_tmp = tl.zeros([BK, BV], dtype=tl.float32)
82
+ for i_c in range(tl.cdiv(BT, BC) - 1, -1, -1):
83
+ if HEAD_FIRST:
84
+ p_qg = tl.make_block_ptr(qg + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
85
+ p_bg = tl.make_block_ptr(bg + i_nh * T*K, (T, K), (K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
86
+ p_w = tl.make_block_ptr(w + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
87
+ p_dv = tl.make_block_ptr(dv + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
88
+ p_do = tl.make_block_ptr(do + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
89
+ p_dv2 = tl.make_block_ptr(dv2 + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
90
+ else:
91
+ p_qg = tl.make_block_ptr(qg+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
92
+ p_bg = tl.make_block_ptr(bg+(bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
93
+ p_w = tl.make_block_ptr(w+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
94
+ p_dv = tl.make_block_ptr(dv+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
95
+ p_do = tl.make_block_ptr(do+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
96
+ p_dv2 = tl.make_block_ptr(dv2+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
97
+ # [BK, BT]
98
+ b_qg = tl.load(p_qg, boundary_check=(0, 1))
99
+ # [BT, BK]
100
+ b_bg = tl.load(p_bg, boundary_check=(0, 1))
101
+ b_w = tl.load(p_w, boundary_check=(0, 1))
102
+ # [BT, V]
103
+ b_do = tl.load(p_do, boundary_check=(0, 1))
104
+ b_dv = tl.load(p_dv, boundary_check=(0, 1))
105
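+ # dv2 = dv + bg @ dh: route the incoming state gradient back into the value gradient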
+ b_dv2 = b_dv + tl.dot(b_bg, b_dh.to(b_bg.dtype))
106
+ tl.store(p_dv2, b_dv2.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
107
+ # [BK, BV]
108
+ b_dh_tmp += tl.dot(b_qg, b_do.to(b_qg.dtype))
109
+ b_dh_tmp += tl.dot(b_w, b_dv2.to(b_qg.dtype))
110
+ last_idx = min((i_t + 1) * BT, T) - 1
111
+ if HEAD_FIRST:
112
+ bg_last = tl.load(gk + (i_nh * T + last_idx) * K + tl.arange(0, BK), mask=mask_k)
113
+ else:
114
+ bg_last = tl.load(gk + ((bos + last_idx) * H + i_h) * K + tl.arange(0, BK), mask=mask_k)
115
+ b_dh *= exp(bg_last)[:, None]
116
+ b_dh += b_dh_tmp
117
+
118
+ if USE_INITIAL_STATE:
119
+ p_dh0 = tl.make_block_ptr(dh0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
120
+ tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), boundary_check=(0, 1))
121
+
122
+
123
+ def chunk_dplr_bwd_dhu(
124
+ qg: torch.Tensor,
125
+ bg: torch.Tensor,
126
+ w: torch.Tensor,
127
+ gk: torch.Tensor,
128
+ h0: torch.Tensor,
129
+ dht: Optional[torch.Tensor],
130
+ do: torch.Tensor,
131
+ dv: torch.Tensor,
132
+ offsets: Optional[torch.LongTensor] = None,
133
+ indices: Optional[torch.LongTensor] = None,
134
+ head_first: bool = True,
135
+ chunk_size: int = 64
136
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
137
+ if head_first:
138
+ B, H, T, K, V = *qg.shape, do.shape[-1]
139
+ else:
140
+ B, T, H, K, V = *qg.shape, do.shape[-1]
141
+ BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
142
+ BK = triton.next_power_of_2(K)
143
+ assert BK <= 256, "current kernel does not support head dimension larger than 256."
144
+ # H100
145
+ if check_shared_mem('hopper', qg.device.index):
146
+ BV = 64
147
+ BC = 64 if K <= 128 else 32
148
+ elif check_shared_mem('ampere', qg.device.index): # A100
149
+ BV = 32
150
+ BC = 32
151
+ else: # others, e.g. RTX 4090
152
+ BV = 16
153
+ BC = 16
154
+
155
+ # N: the actual number of sequences in the batch with either equal or variable lengths
156
+ if offsets is None:
157
+ N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
158
+ else:
159
+ N, NT, chunk_offsets = len(offsets) - 1, len(indices), prepare_chunk_offsets(offsets, BT)
160
+
161
+ BC = min(BT, BC)
162
+ NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
163
+ assert NK == 1, 'NK > 1 is not supported because it involves time-consuming synchronization'
164
+
165
+ if head_first:
166
+ dh = qg.new_empty(B, H, NT, K, V)
167
+ else:
168
+ dh = qg.new_empty(B, NT, H, K, V)
169
+ dh0 = torch.empty_like(h0, dtype=torch.float32) if h0 is not None else None
170
+ dv2 = torch.zeros_like(dv)
171
+
172
+ grid = (NK, NV, N * H)
173
+ chunk_dplr_bwd_kernel_dhu[grid](
174
+ qg=qg,
175
+ bg=bg,
176
+ w=w,
177
+ gk=gk,
178
+ dht=dht,
179
+ dh0=dh0,
180
+ do=do,
181
+ dh=dh,
182
+ dv=dv,
183
+ dv2=dv2,
184
+ offsets=offsets,
185
+ chunk_offsets=chunk_offsets,
186
+ T=T,
187
+ H=H,
188
+ K=K,
189
+ V=V,
190
+ BT=BT,
191
+ BC=BC,
192
+ BK=BK,
193
+ BV=BV,
194
+ HEAD_FIRST=head_first
195
+ )
196
+ return dh, dh0, dv2
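
As a sanity reference, a minimal single-(batch, head) PyTorch sketch of the reversed recurrence this kernel materializes (hypothetical helper names; fixed-length case only):

import torch

def dplr_bwd_dh_reference(qg, bg, w, gk_last, do, dv, dht):
    # qg, bg, w: [NT, BT, K]; do, dv: [NT, BT, V]; gk_last: [NT, K]; dht: [K, V]
    NT = qg.shape[0]
    dh = dht.clone()                                # gradient flowing in from the final state
    dhs, dv2 = [None] * NT, [None] * NT
    for t in range(NT - 1, -1, -1):
        dhs[t] = dh.clone()                         # stored before the chunk update, as in the kernel
        dv2[t] = dv[t] + bg[t] @ dh                 # state gradient routed into the value gradient
        dh = torch.exp(gk_last[t])[:, None] * dh + qg[t].T @ do[t] + w[t].T @ dv2[t]
    return torch.stack(dhs), torch.stack(dv2), dh   # the trailing dh is dh0
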
fla/ops/generalized_delta_rule/dplr/chunk_h_fwd.py ADDED
@@ -0,0 +1,197 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from fla.ops.common.utils import prepare_chunk_offsets
11
+ from fla.ops.utils.op import exp
12
+ from fla.utils import check_shared_mem, use_cuda_graph
13
+
14
+
15
+ @triton.heuristics({
16
+ 'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
17
+ 'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
18
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None,
19
+ })
20
+ @triton.autotune(
21
+ configs=[
22
+ triton.Config({}, num_warps=num_warps, num_stages=num_stages)
23
+ for num_warps in [2, 4, 8, 16, 32]
24
+ for num_stages in [2, 3, 4]
25
+ ],
26
+ key=['BT', 'BK', 'BV'],
27
+ use_cuda_graph=use_cuda_graph,
28
+ )
29
+ @triton.jit(do_not_specialize=['T'])
30
+ def chunk_dplr_fwd_kernel_h(
31
+ kg,
32
+ v,
33
+ w,
34
+ bg,
35
+ u,
36
+ v_new,
37
+ gk,
38
+ h,
39
+ h0,
40
+ ht,
41
+ offsets,
42
+ chunk_offsets,
43
+ T,
44
+ H: tl.constexpr,
45
+ K: tl.constexpr,
46
+ V: tl.constexpr,
47
+ BT: tl.constexpr,
48
+ BC: tl.constexpr,
49
+ BK: tl.constexpr,
50
+ BV: tl.constexpr,
51
+ NT: tl.constexpr,
52
+ USE_INITIAL_STATE: tl.constexpr,
53
+ STORE_FINAL_STATE: tl.constexpr,
54
+ USE_OFFSETS: tl.constexpr,
55
+ HEAD_FIRST: tl.constexpr,
56
+ ):
57
+ i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
58
+ i_n, i_h = i_nh // H, i_nh % H
59
+ if USE_OFFSETS:
60
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
61
+ T = eos - bos
62
+ NT = tl.cdiv(T, BT)
63
+ boh = tl.load(chunk_offsets + i_n).to(tl.int32)
64
+ else:
65
+ bos, eos = i_n * T, i_n * T + T
66
+ NT = tl.cdiv(T, BT)
67
+ boh = i_n * NT
68
+
69
+ # [BK, BV]
70
+ b_h = tl.zeros([BK, BV], dtype=tl.float32)
71
+ if USE_INITIAL_STATE:
72
+ p_h0 = tl.make_block_ptr(h0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
73
+ b_h = tl.load(p_h0, boundary_check=(0, 1)).to(tl.float32)
74
+
75
+ for i_t in range(NT):
76
+ if HEAD_FIRST:
77
+ p_h = tl.make_block_ptr(h + (i_nh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
78
+ else:
79
+ p_h = tl.make_block_ptr(h + ((boh + i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
80
+ tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1))
81
+
82
+ b_hc = tl.zeros([BK, BV], dtype=tl.float32)
83
+ # we need to keep the full DK dimension in SRAM, which is a severe memory burden; subchunking alleviates it
84
+ for i_c in range(tl.cdiv(min(BT, T - i_t * BT), BC)):
85
+ if HEAD_FIRST:
86
+ p_kg = tl.make_block_ptr(kg + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
87
+ p_bg = tl.make_block_ptr(bg + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
88
+ p_w = tl.make_block_ptr(w + i_nh * T*K, (T, K), (K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
89
+ p_v = tl.make_block_ptr(v + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
90
+ p_u = tl.make_block_ptr(u + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
91
+ p_v_new = tl.make_block_ptr(v_new+i_nh*T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
92
+ else:
93
+ p_kg = tl.make_block_ptr(kg+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
94
+ p_bg = tl.make_block_ptr(bg+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
95
+ p_w = tl.make_block_ptr(w+(bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
96
+ p_v = tl.make_block_ptr(v+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
97
+ p_u = tl.make_block_ptr(u+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
98
+ p_v_new = tl.make_block_ptr(v_new+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT+i_c*BC, i_v * BV), (BC, BV), (1, 0))
99
+ # [BK, BC]
100
+ b_kg = tl.load(p_kg, boundary_check=(0, 1))
101
+ b_v = tl.load(p_v, boundary_check=(0, 1))
102
+ b_w = tl.load(p_w, boundary_check=(0, 1))
103
+ b_bg = tl.load(p_bg, boundary_check=(0, 1))
104
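+ # v_new = u + w @ h: apply the WY correction against the chunk-initial state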
+ b_v2 = tl.dot(b_w, b_h.to(b_w.dtype)) + tl.load(p_u, boundary_check=(0, 1))
105
+ b_hc += tl.dot(b_kg, b_v)
106
+ b_hc += tl.dot(b_bg.to(b_hc.dtype), b_v2)
107
+ tl.store(p_v_new, b_v2.to(p_v_new.dtype.element_ty), boundary_check=(0, 1))
108
+
109
+ last_idx = min((i_t + 1) * BT, T) - 1
110
+ if HEAD_FIRST:
111
+ b_g_last = tl.load(gk + i_nh * T * K + last_idx * K + tl.arange(0, BK), mask=tl.arange(0, BK) < K).to(tl.float32)
112
+ else:
113
+ b_g_last = tl.load(gk + (bos + last_idx) * H * K + i_h * K +
114
+ tl.arange(0, BK), mask=tl.arange(0, BK) < K).to(tl.float32)
115
+ b_h *= exp(b_g_last[:, None])
116
+ b_h += b_hc
117
+
118
+ if STORE_FINAL_STATE:
119
+ p_ht = tl.make_block_ptr(ht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
120
+ tl.store(p_ht, b_h.to(p_ht.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
121
+
122
+
123
+ def chunk_dplr_fwd_h(
124
+ kg: torch.Tensor,
125
+ v: torch.Tensor,
126
+ w: torch.Tensor,
127
+ u: torch.Tensor,
128
+ bg: torch.Tensor,
129
+ gk: torch.Tensor,
130
+ initial_state: Optional[torch.Tensor] = None,
131
+ output_final_state: bool = False,
132
+ offsets: Optional[torch.LongTensor] = None,
133
+ indices: Optional[torch.LongTensor] = None,
134
+ head_first: bool = True,
135
+ chunk_size: int = 64
136
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
137
+ if head_first:
138
+ B, H, T, K, V = *kg.shape, u.shape[-1]
139
+ else:
140
+ B, T, H, K, V = *kg.shape, u.shape[-1]
141
+ BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
142
+ # N: the actual number of sequences in the batch with either equal or variable lengths
143
+ if offsets is None:
144
+ N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
145
+ else:
146
+ N, NT, chunk_offsets = len(offsets) - 1, len(indices), prepare_chunk_offsets(offsets, BT)
147
+ BK = triton.next_power_of_2(K)
148
+ assert BK <= 256, "current kernel does not support head dimension larger than 256."
149
+ # H100 can have larger block size
150
+
151
+ if check_shared_mem('hopper', kg.device.index):
152
+ BV = 64
153
+ BC = 64 if K <= 128 else 32
154
+ elif check_shared_mem('ampere', kg.device.index): # A100
155
+ BV = 32
156
+ BC = 32
157
+ else:
158
+ BV = 16
159
+ BC = 16
160
+
161
+ BC = min(BT, BC)
162
+ NK = triton.cdiv(K, BK)
163
+ NV = triton.cdiv(V, BV)
164
+ assert NK == 1, 'NK > 1 is not supported because it involves time-consuming synchronization'
165
+
166
+ if head_first:
167
+ h = kg.new_empty(B, H, NT, K, V)
168
+ else:
169
+ h = kg.new_empty(B, NT, H, K, V)
170
+ final_state = kg.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
171
+ v_new = torch.empty_like(u)
172
+ grid = (NK, NV, N * H)
173
+ chunk_dplr_fwd_kernel_h[grid](
174
+ kg=kg,
175
+ v=v,
176
+ w=w,
177
+ bg=bg,
178
+ u=u,
179
+ v_new=v_new,
180
+ h=h,
181
+ gk=gk,
182
+ h0=initial_state,
183
+ ht=final_state,
184
+ offsets=offsets,
185
+ chunk_offsets=chunk_offsets,
186
+ T=T,
187
+ H=H,
188
+ K=K,
189
+ V=V,
190
+ BT=BT,
191
+ BC=BC,
192
+ BK=BK,
193
+ BV=BV,
194
+ NT=NT,
195
+ HEAD_FIRST=head_first
196
+ )
197
+ return h, v_new, final_state
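
For comparison, a minimal single-(batch, head) PyTorch sketch of the chunkwise recurrence computed above (hypothetical helper names; fixed-length case only):

import torch

def dplr_fwd_h_reference(kg, v, w, u, bg, gk_last, h0):
    # kg, w, bg: [NT, BT, K]; v, u: [NT, BT, V]; gk_last: [NT, K]; h0: [K, V]
    NT = kg.shape[0]
    h = h0.clone()
    hs, v_new = [], []
    for t in range(NT):
        hs.append(h.clone())                        # chunk-initial state, as stored by the kernel
        v2 = u[t] + w[t] @ h                        # the WY-corrected values (v_new)
        h = torch.exp(gk_last[t])[:, None] * h + kg[t].T @ v[t] + bg[t].T @ v2
        v_new.append(v2)
    return torch.stack(hs), torch.stack(v_new), h   # trailing h is the final state
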
fla/ops/generalized_delta_rule/dplr/chunk_o_fwd.py ADDED
@@ -0,0 +1,138 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from fla.utils import check_shared_mem, use_cuda_graph
11
+
12
+ BK_LIST = [32, 64, 128] if check_shared_mem() else [16, 32]
13
+
14
+
15
+ @triton.heuristics({
16
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None,
17
+ })
18
+ @triton.autotune(
19
+ configs=[
20
+ triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
21
+ for BK in BK_LIST
22
+ for BV in BK_LIST
23
+ for num_warps in [2, 4, 8, 16, 32]
24
+ for num_stages in [2, 3, 4]
25
+ ],
26
+ key=['BT'],
27
+ use_cuda_graph=use_cuda_graph,
28
+ )
29
+ @triton.jit(do_not_specialize=['T'])
30
+ def chunk_dplr_fwd_kernel_o(
31
+ qg,
32
+ v,
33
+ v_new,
34
+ A_qk,
35
+ A_qb,
36
+ h,
37
+ o,
38
+ offsets,
39
+ indices,
40
+ T,
41
+ H: tl.constexpr,
42
+ K: tl.constexpr,
43
+ V: tl.constexpr,
44
+ BT: tl.constexpr,
45
+ BK: tl.constexpr,
46
+ BV: tl.constexpr,
47
+ USE_OFFSETS: tl.constexpr,
48
+ HEAD_FIRST: tl.constexpr,
49
+ ):
50
+ i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
51
+ i_b, i_h = i_bh // H, i_bh % H
52
+
53
+ if USE_OFFSETS:
54
+ i_tg = i_t
55
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
56
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
57
+ T = eos - bos
58
+ NT = tl.cdiv(T, BT)
59
+ else:
60
+ NT = tl.cdiv(T, BT)
61
+ i_tg = i_b * NT + i_t
62
+ bos, eos = i_b * T, i_b * T + T
63
+
64
+ b_o = tl.zeros([BT, BV], dtype=tl.float32)
65
+ for i_k in range(tl.cdiv(K, BK)):
66
+ if HEAD_FIRST:
67
+ p_qg = tl.make_block_ptr(qg + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
68
+ p_h = tl.make_block_ptr(h + (i_bh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
69
+ else:
70
+ p_qg = tl.make_block_ptr(qg + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
71
+ p_h = tl.make_block_ptr(h + (i_tg * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
72
+ b_qg = tl.load(p_qg, boundary_check=(0, 1))
73
+ b_h = tl.load(p_h, boundary_check=(0, 1))
74
+ b_o += tl.dot(b_qg, b_h)
75
+
76
+ if HEAD_FIRST:
77
+ p_Aqk = tl.make_block_ptr(A_qk + i_bh * T*BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
78
+ p_Aqb = tl.make_block_ptr(A_qb + i_bh * T*BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
79
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
80
+ p_v_new = tl.make_block_ptr(v_new + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
81
+ p_o = tl.make_block_ptr(o + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
82
+ else:
83
+ p_Aqk = tl.make_block_ptr(A_qk + (bos * H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
84
+ p_Aqb = tl.make_block_ptr(A_qb + (bos * H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
85
+ p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
86
+ p_v_new = tl.make_block_ptr(v_new + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
87
+ p_o = tl.make_block_ptr(o + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
88
+
89
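+ # inclusive causal mask: within the chunk, only keys at positions <= the query position contribute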
+ m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :]
90
+ b_Aqk = tl.load(p_Aqk, boundary_check=(0, 1))
91
+ b_Aqb = tl.load(p_Aqb, boundary_check=(0, 1))
92
+ b_Aqk = tl.where(m_s, b_Aqk, 0)
93
+ b_Aqb = tl.where(m_s, b_Aqb, 0)
94
+ b_v = tl.load(p_v, boundary_check=(0, 1))
95
+ b_v_new = tl.load(p_v_new, boundary_check=(0, 1))
96
+ b_o = b_o + tl.dot(b_Aqk.to(b_v.dtype), b_v) + tl.dot(b_Aqb.to(b_v_new.dtype), b_v_new)
97
+ tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
98
+
99
+
100
+ def chunk_dplr_fwd_o(
101
+ qg: torch.Tensor,
102
+ v: torch.Tensor,
103
+ v_new: torch.Tensor,
104
+ A_qk: torch.Tensor,
105
+ A_qb: torch.Tensor,
106
+ h: torch.Tensor,
107
+ offsets: Optional[torch.LongTensor] = None,
108
+ indices: Optional[torch.LongTensor] = None,
109
+ head_first: bool = True,
110
+ chunk_size: int = 64
111
+ ) -> torch.Tensor:
112
+ if head_first:
113
+ B, H, T, K, V = *qg.shape, v.shape[-1]
114
+ else:
115
+ B, T, H, K, V = *qg.shape, v.shape[-1]
116
+ BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
117
+ NT = triton.cdiv(T, BT) if offsets is None else len(indices)
118
+
119
+ o = torch.empty_like(v)
120
+ def grid(meta): return (triton.cdiv(V, meta['BV']), NT, B * H)
121
+ chunk_dplr_fwd_kernel_o[grid](
122
+ qg=qg,
123
+ v=v,
124
+ v_new=v_new,
125
+ A_qk=A_qk,
126
+ A_qb=A_qb,
127
+ h=h,
128
+ o=o,
129
+ offsets=offsets,
130
+ indices=indices,
131
+ T=T,
132
+ H=H,
133
+ K=K,
134
+ V=V,
135
+ BT=BT,
136
+ HEAD_FIRST=head_first
137
+ )
138
+ return o
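
The output assembly above reduces to three terms; a minimal per-(batch, head) sketch (hypothetical helper names; assumes qg already carries the scale and decay factors):

import torch

def dplr_fwd_o_reference(qg, v, v_new, A_qk, A_qb, h):
    # qg: [NT, BT, K]; v, v_new: [NT, BT, V]; A_qk, A_qb: [NT, BT, BT]; h: [NT, K, V]
    BT = qg.shape[1]
    m = torch.tril(torch.ones(BT, BT, dtype=torch.bool, device=qg.device))
    o = qg @ h                                      # inter-chunk: queries against the chunk-initial state
    o = o + torch.where(m, A_qk, 0.0) @ v           # intra-chunk q-k term, causally masked
    o = o + torch.where(m, A_qb, 0.0) @ v_new       # intra-chunk q-b term against the corrected values
    return o
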
fla/ops/generalized_delta_rule/dplr/wy_fast_bwd.py ADDED
@@ -0,0 +1,184 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from fla.utils import check_shared_mem, is_intel_alchemist, use_cuda_graph
11
+
12
+ # https://github.com/intel/intel-xpu-backend-for-triton/issues/3449
13
+ triton_config = {'grf_mode': 'large'} if is_intel_alchemist else {}
14
+
15
+
16
+ @triton.heuristics({
17
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
18
+ })
19
+ @triton.autotune(
20
+ configs=[
21
+ triton.Config(triton_config, num_warps=num_warps, num_stages=num_stages)
22
+ for num_warps in [2, 4, 8, 16, 32]
23
+ for num_stages in [2, 3, 4]
24
+ ],
25
+ key=['BT', 'BK', 'BV'],
26
+ use_cuda_graph=use_cuda_graph,
27
+ )
28
+ @triton.jit(do_not_specialize=['T'])
29
+ def bwd_prepare_wy_repr_kernel(
30
+ A_ab_inv,
31
+ A_ak,
32
+ ag,
33
+ v,
34
+ dw,
35
+ du,
36
+ dv,
37
+ dv0,
38
+ dag,
39
+ dAak,
40
+ dAab,
41
+ offsets,
42
+ indices,
43
+ T,
44
+ H: tl.constexpr,
45
+ K: tl.constexpr,
46
+ V: tl.constexpr,
47
+ BT: tl.constexpr,
48
+ BK: tl.constexpr,
49
+ BV: tl.constexpr,
50
+ USE_OFFSETS: tl.constexpr,
51
+ HEAD_FIRST: tl.constexpr
52
+ ):
53
+ i_t, i_bh = tl.program_id(0), tl.program_id(1)
54
+ i_b, i_h = i_bh // H, i_bh % H
55
+ if USE_OFFSETS:
56
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
57
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
58
+ T = eos - bos
59
+ else:
60
+ bos, eos = i_b * T, i_b * T + T
61
+
62
+ if HEAD_FIRST:
63
+ p_Aab_inv_t = tl.make_block_ptr(A_ab_inv + i_bh * T * BT, (BT, T), (1, BT), (0, i_t * BT), (BT, BT), (0, 1))
64
+ p_Aak_t = tl.make_block_ptr(A_ak + i_bh * T * BT, (BT, T), (1, BT), (0, i_t * BT), (BT, BT), (0, 1))
65
+ p_dAak = tl.make_block_ptr(dAak + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
66
+ p_dAab = tl.make_block_ptr(dAab + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
67
+ else:
68
+ p_Aak_t = tl.make_block_ptr(A_ak + (bos*H + i_h) * BT, (BT, T), (1, H*BT), (0, i_t * BT), (BT, BT), (0, 1))
69
+ p_Aab_inv_t = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (BT, T), (1, H*BT), (0, i_t * BT), (BT, BT), (0, 1))
70
+ p_dAak = tl.make_block_ptr(dAak + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
71
+ p_dAab = tl.make_block_ptr(dAab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
72
+
73
+ b_A_ab_inv_t = tl.load(p_Aab_inv_t, boundary_check=(0, 1))
74
+ b_A_ak_t = tl.load(p_Aak_t, boundary_check=(0, 1))
75
+ b_A_ak_t = tl.where(tl.arange(0, BT)[:, None] < tl.arange(0, BT)[None, :], b_A_ak_t, 0)
76
+ b_A_ab_inv_t = tl.where(tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :], b_A_ab_inv_t, 0)
77
+ b_A_tmp_t = tl.dot(b_A_ak_t, b_A_ab_inv_t).to(v.dtype.element_ty)
78
+ b_dA_tmp = tl.zeros([BT, BT], dtype=tl.float32)
79
+
80
+ for i_v in range(tl.cdiv(V, BV)):
81
+ if HEAD_FIRST:
82
+ p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
83
+ p_dv = tl.make_block_ptr(dv + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
84
+ p_dv0 = tl.make_block_ptr(dv0 + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
85
+ p_du = tl.make_block_ptr(du + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
86
+ else:
87
+ p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
88
+ p_dv = tl.make_block_ptr(dv + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
89
+ p_dv0 = tl.make_block_ptr(dv0 + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
90
+ p_du = tl.make_block_ptr(du + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
91
+ b_v = tl.load(p_v, boundary_check=(0, 1))
92
+ b_du = tl.load(p_du, boundary_check=(0, 1))
93
+ b_dA_tmp += tl.dot(b_du.to(b_v.dtype), tl.trans(b_v))
94
+ b_dv0 = tl.load(p_dv0, boundary_check=(0, 1))
95
+ b_dv = b_dv0 + tl.dot(b_A_tmp_t, b_du)
96
+ tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
97
+
98
+ b_dA_tmp = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_dA_tmp, 0)
99
+ b_dA_ak = tl.dot(b_A_ab_inv_t, b_dA_tmp)
100
+ b_dA_ak = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_dA_ak, 0)
101
+ tl.store(p_dAak, b_dA_ak, boundary_check=(0, 1))
102
+ b_dA_ab_inv = tl.dot(b_dA_tmp, b_A_ak_t)
103
+
104
+ for i_k in range(tl.cdiv(K, BK)):
105
+ if HEAD_FIRST:
106
+ p_ag = tl.make_block_ptr(ag + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
107
+ p_dag = tl.make_block_ptr(dag + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
108
+ p_dw = tl.make_block_ptr(dw + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
109
+ else:
110
+ p_ag = tl.make_block_ptr(ag + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
111
+ p_dag = tl.make_block_ptr(dag + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
112
+ p_dw = tl.make_block_ptr(dw + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
113
+ b_ag = tl.load(p_ag, boundary_check=(0, 1))
114
+ b_dw = tl.load(p_dw, boundary_check=(0, 1))
115
+ b_dA_ab_inv += tl.dot(b_dw, tl.trans(b_ag))
116
+ b_dag = tl.dot(b_A_ab_inv_t.to(b_dw.dtype), b_dw)
117
+ tl.store(p_dag, b_dag.to(p_dag.dtype.element_ty), boundary_check=(0, 1))
118
+
119
+ # if we know dL/dA^(-1), for dL/dA, we can use the following formula:
120
+ # dL/dA = -(A^(-1))^T @ (dL/dA^(-1)) @ (A^(-1))^T
121
+ # in the fwd pass we use fwd substitution to calculate (I-lower(A_ab))^-1.
122
+ # denote A = I - lower(A_ab), B = A^-1
123
+ # in the backward pass.
124
+ # dL/dA = -(B)^T @ (dL/dB) @ B^T
125
+ # dL/dA_ab = lower(B^T @ dL/dB @ B^T)
126
+ b_dA_ab_inv = tl.where(tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :], b_dA_ab_inv, 0)
127
+ b_dA_ab_inv = tl.dot(b_A_ab_inv_t, b_dA_ab_inv)
128
+ b_dA_ab_inv = tl.dot(b_dA_ab_inv, b_A_ab_inv_t)
129
+ b_dA_ab_inv = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_dA_ab_inv, 0)
130
+ tl.store(p_dAab, b_dA_ab_inv, boundary_check=(0, 1))
131
+
132
+
133
+ def chunk_dplr_bwd_wy(
134
+ A_ab_inv: torch.Tensor,
135
+ A_ak: torch.Tensor,
136
+ v: torch.Tensor,
137
+ ag: torch.Tensor,
138
+ dw: torch.Tensor,
139
+ du: torch.Tensor,
140
+ dv0: torch.Tensor,
141
+ offsets: Optional[torch.LongTensor],
142
+ indices: Optional[torch.LongTensor],
143
+ head_first: bool,
144
+ chunk_size: int,
145
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
146
+ A_ab_inv, A_ak, v, ag, dw, du = map(lambda x: x.contiguous(), [A_ab_inv, A_ak, v, ag, dw, du])
147
+ if head_first:
148
+ B, H, T, K, V = *dw.shape, du.shape[-1]
149
+ else:
150
+ B, T, H, K, V = *dw.shape, du.shape[-1]
151
+ BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
152
+ NT = triton.cdiv(T, BT) if offsets is None else len(indices)
153
+ BK = min(triton.next_power_of_2(K), 64)
154
+ BV = min(triton.next_power_of_2(V), 64) if check_shared_mem() else min(triton.next_power_of_2(V), 32)
155
+
156
+ dA_ab = torch.empty_like(A_ab_inv, dtype=torch.float)
157
+ dA_ak = torch.empty_like(A_ak, dtype=torch.float)
158
+ dv = torch.empty_like(v)
159
+ dag = torch.empty_like(ag)
160
+
161
+ bwd_prepare_wy_repr_kernel[(NT, B * H)](
162
+ A_ab_inv=A_ab_inv,
163
+ A_ak=A_ak,
164
+ ag=ag,
165
+ v=v,
166
+ dw=dw,
167
+ du=du,
168
+ dv=dv,
169
+ dv0=dv0,
170
+ dag=dag,
171
+ dAak=dA_ak,
172
+ dAab=dA_ab,
173
+ offsets=offsets,
174
+ indices=indices,
175
+ T=T,
176
+ H=H,
177
+ K=K,
178
+ V=V,
179
+ BT=BT,
180
+ BK=BK,
181
+ BV=BV,
182
+ HEAD_FIRST=head_first
183
+ )
184
+ return dA_ab, dA_ak, dv, dag
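
The gradient identity in the comments above can be checked numerically in a few self-contained lines (illustrative only, independent of the kernel):

import torch

BT = 16
strict = torch.tril(torch.ones(BT, BT, dtype=torch.bool), diagonal=-1)
A_ab = torch.randn(BT, BT, dtype=torch.float64, requires_grad=True)
B = torch.linalg.inv(torch.eye(BT, dtype=torch.float64) - A_ab * strict)  # B = (I - lower(A_ab))^-1
dB = torch.randn(BT, BT, dtype=torch.float64)  # some upstream gradient dL/dB
(B * dB).sum().backward()
Bd = B.detach()
# closed form from the comments: dL/dA_ab = lower(B^T @ dL/dB @ B^T)
assert torch.allclose(A_ab.grad, (Bd.T @ dB @ Bd.T) * strict)
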
fla/ops/generalized_delta_rule/dplr/wy_fast_fwd.py ADDED
@@ -0,0 +1,318 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2024, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from fla.ops.utils.op import gather
11
+ from fla.utils import is_gather_supported, use_cuda_graph
12
+
13
+
14
+ @triton.heuristics({
15
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
16
+ })
17
+ @triton.autotune(
18
+ configs=[
19
+ triton.Config({}, num_warps=num_warps)
20
+ for num_warps in [1, 2, 4, 8, 16]
21
+ ],
22
+ key=['BT'],
23
+ use_cuda_graph=use_cuda_graph,
24
+ )
25
+ @triton.jit(do_not_specialize=['T'])
26
+ def fwd_prepare_wy_repr_kernel_chunk32(
27
+ A_ab,
28
+ A_ab_inv,
29
+ offsets,
30
+ indices,
31
+ T,
32
+ H: tl.constexpr,
33
+ BT: tl.constexpr,
34
+ BC: tl.constexpr, # placeholder, do not delete
35
+ USE_OFFSETS: tl.constexpr,
36
+ HEAD_FIRST: tl.constexpr
37
+ ):
38
+ i_t, i_bh = tl.program_id(0), tl.program_id(1)
39
+ i_b, i_h = i_bh // H, i_bh % H
40
+ if USE_OFFSETS:
41
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
42
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
43
+ T = eos - bos
44
+ else:
45
+ bos, eos = i_b * T, i_b * T + T
46
+ if HEAD_FIRST:
47
+ p_Aab = tl.make_block_ptr(A_ab + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
48
+ p_Aab_inv = tl.make_block_ptr(A_ab_inv + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
49
+ else:
50
+ p_Aab = tl.make_block_ptr(A_ab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
51
+ p_Aab_inv = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
52
+ b_A_ab = tl.load(p_Aab, boundary_check=(0, 1))
53
+ b_A_ab = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_A_ab, 0)
54
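+ # forward substitution: builds (I - strict_lower(A_ab))^{-1} row by row; the identity is added below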
+ for i in range(1, BT):
55
+ mask = tl.arange(0, BT) == i
56
+ b_a = tl.sum(tl.where(mask[:, None], b_A_ab, 0), 0)
57
+ b_a = b_a + tl.sum(b_a[:, None] * b_A_ab, 0) * (tl.arange(0, BT) < i)
58
+ b_A_ab = tl.where(mask[:, None], b_a, b_A_ab)
59
+ b_A_ab += tl.arange(0, BT)[:, None] == tl.arange(0, BT)[None, :]
60
+ tl.store(p_Aab_inv, b_A_ab.to(p_Aab_inv.dtype.element_ty), boundary_check=(0, 1))
61
+
62
+
63
+ @triton.heuristics({
64
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
65
+ })
66
+ @triton.autotune(
67
+ configs=[
68
+ triton.Config({}, num_warps=num_warps, num_stages=num_stages)
69
+ for num_warps in [2, 4, 8]
70
+ for num_stages in [2, 3, 4]
71
+ ],
72
+ key=['BC'],
73
+ use_cuda_graph=use_cuda_graph,
74
+ )
75
+ @triton.jit(do_not_specialize=['T'])
76
+ def fwd_prepare_wy_repr_kernel_chunk64(
77
+ A_ab,
78
+ A_ab_inv,
79
+ offsets,
80
+ indices,
81
+ T,
82
+ H: tl.constexpr,
83
+ BT: tl.constexpr,
84
+ BC: tl.constexpr,
85
+ USE_OFFSETS: tl.constexpr,
86
+ HEAD_FIRST: tl.constexpr,
87
+ GATHER_SUPPORTED: tl.constexpr = is_gather_supported
88
+ ):
89
+ i_t, i_bh = tl.program_id(0), tl.program_id(1)
90
+ i_b, i_h = i_bh // H, i_bh % H
91
+ if USE_OFFSETS:
92
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
93
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
94
+ T = eos - bos
95
+ else:
96
+ bos, eos = i_b * T, i_b * T + T
97
+
98
+ if HEAD_FIRST:
99
+
100
+ p_A1 = tl.make_block_ptr(A_ab + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
101
+ p_A2 = tl.make_block_ptr(A_ab + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
102
+ p_A3 = tl.make_block_ptr(A_ab + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
103
+ p_A_inv1 = tl.make_block_ptr(A_ab_inv + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
104
+ p_A_inv2 = tl.make_block_ptr(A_ab_inv + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
105
+ p_A_inv3 = tl.make_block_ptr(A_ab_inv + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
106
+ p_A_inv4 = tl.make_block_ptr(A_ab_inv + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, BC), (BC, BC), (1, 0))
107
+ else:
108
+ p_A1 = tl.make_block_ptr(A_ab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
109
+ p_A2 = tl.make_block_ptr(A_ab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
110
+ p_A3 = tl.make_block_ptr(A_ab + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
111
+ p_A_inv1 = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
112
+ p_A_inv2 = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
113
+ p_A_inv3 = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
114
+ p_A_inv4 = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, BC), (BC, BC), (1, 0))
115
+
116
+ b_A = tl.load(p_A1, boundary_check=(0, 1))
117
+ b_A2 = tl.load(p_A2, boundary_check=(0, 1))
118
+ b_A3 = tl.load(p_A3, boundary_check=(0, 1))
119
+ b_A = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A, 0)
120
+ b_A2 = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A2, 0)
121
+
122
+ for i in range(1, BC):
123
+ if GATHER_SUPPORTED:
124
+ row_idx = tl.full([1, BC], i, dtype=tl.int16)
125
+ # [1, BC] -> [BC]
126
+ b_a = tl.sum(gather(b_A, row_idx, axis=0), 0)
127
+ b_a2 = tl.sum(gather(b_A2, row_idx, axis=0), 0)
128
+ else:
129
+ mask = tl.arange(0, BC) == i
130
+ b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
131
+ b_a2 = tl.sum(tl.where(mask[:, None], b_A2, 0), 0)
132
+ mask = tl.arange(0, BC) == i
133
+ # b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
134
+ # b_a2 = tl.sum(tl.where(mask[:, None], b_A2, 0), 0)
135
+ b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) * (tl.arange(0, BC) < i)
136
+ b_a2 = b_a2 + tl.sum(b_a2[:, None] * b_A2, 0) * (tl.arange(0, BC) < i)
137
+ b_A = tl.where(mask[:, None], b_a, b_A)
138
+ b_A2 = tl.where(mask[:, None], b_a2, b_A2)
139
+
140
+ # blockwise computation of lower triangular matrix's inverse
141
+ # i.e., [A11, 0; A21, A22]^-1 = [A11^-1, 0; -A22^-1 A21 A11^-1, A22^-1]
142
+ b_A += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
143
+ b_A2 += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
144
+ b_A3 = tl.dot(tl.dot(b_A2, b_A3), b_A)
145
+ # tl.debug_barrier()
146
+ tl.store(p_A_inv1, b_A.to(p_A_inv1.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
147
+ tl.store(p_A_inv2, b_A2.to(p_A_inv2.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
148
+ tl.store(p_A_inv3, b_A3.to(p_A_inv3.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
149
+ # zero the upper-right block so the blockwise inverse stays lower-triangular
150
+ tl.store(p_A_inv4, tl.zeros([BC, BC], dtype=tl.float32).to(p_A_inv4.dtype.element_ty), boundary_check=(0, 1))
151
+
152
+
153
+ @triton.heuristics({
154
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None
155
+ })
156
+ @triton.autotune(
157
+ configs=[
158
+ triton.Config({}, num_warps=num_warps, num_stages=num_stages)
159
+ for num_warps in [2, 4, 8, 16, 32]
160
+ for num_stages in [2, 3, 4]
161
+ ],
162
+ key=['BT', 'BK', 'BV'],
163
+ use_cuda_graph=use_cuda_graph,
164
+ )
165
+ @triton.jit(do_not_specialize=['T'])
166
+ def fwd_wu_kernel(
167
+ u,
168
+ w,
169
+ ag,
170
+ v,
171
+ A_ab_inv,
172
+ A_ak,
173
+ offsets,
174
+ indices,
175
+ T,
176
+ H: tl.constexpr,
177
+ K: tl.constexpr,
178
+ V: tl.constexpr,
179
+ BT: tl.constexpr,
180
+ BK: tl.constexpr,
181
+ BV: tl.constexpr,
182
+ USE_OFFSETS: tl.constexpr,
183
+ HEAD_FIRST: tl.constexpr,
184
+ ):
185
+ i_t, i_bh = tl.program_id(0), tl.program_id(1)
186
+ i_b, i_h = i_bh // H, i_bh % H
187
+ if USE_OFFSETS:
188
+ i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
189
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
190
+ T = eos - bos
191
+ else:
192
+ bos, eos = i_b * T, i_b * T + T
193
+
194
+ if HEAD_FIRST:
195
+ p_A_ab_inv = tl.make_block_ptr(A_ab_inv + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
196
+ p_A_ak = tl.make_block_ptr(A_ak + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
197
+ else:
198
+ p_A_ab_inv = tl.make_block_ptr(A_ab_inv + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
199
+ p_A_ak = tl.make_block_ptr(A_ak + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
200
+ b_Aab_inv = tl.load(p_A_ab_inv, boundary_check=(0, 1))
201
+ b_Aak = tl.load(p_A_ak, boundary_check=(0, 1))
202
+ o_s = tl.arange(0, BT)
203
+ b_Aab_inv = tl.where(o_s[:, None] >= o_s[None, :], b_Aab_inv, 0)
204
+ b_Aak = tl.where(o_s[:, None] > o_s[None, :], b_Aak, 0)
205
+ # let's use tf32 here
206
+ b_Aak = tl.dot(b_Aab_inv, b_Aak)
207
+ # (SY 01/04) should be bf16 or tf32? To verify.
208
+ b_Aak = b_Aak.to(v.dtype.element_ty, fp_downcast_rounding="rtne")
209
+ b_Aab_inv = b_Aab_inv.to(ag.dtype.element_ty, fp_downcast_rounding="rtne")
210
+
211
+ for i_k in range(tl.cdiv(K, BK)):
212
+ if HEAD_FIRST:
213
+ p_ag = tl.make_block_ptr(ag + i_bh * T * K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
214
+ p_w = tl.make_block_ptr(w + i_bh * T * K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
215
+ else:
216
+ p_ag = tl.make_block_ptr(ag + (bos*H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
217
+ p_w = tl.make_block_ptr(w + (bos*H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
218
+ b_ag = tl.load(p_ag, boundary_check=(0, 1))
219
+ b_w = tl.dot(b_Aab_inv, b_ag) # both bf16 or fp16
220
+ tl.store(p_w, b_w.to(p_w.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
221
+
222
+ for i_v in range(tl.cdiv(V, BV)):
223
+ if HEAD_FIRST:
224
+ p_v = tl.make_block_ptr(v + i_bh * T * V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
225
+ p_u = tl.make_block_ptr(u + i_bh * T * V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
226
+ else:
227
+ p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
228
+ p_u = tl.make_block_ptr(u + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
229
+ b_v = tl.load(p_v, boundary_check=(0, 1))
230
+ b_u = tl.dot(b_Aak, b_v) # both bf16 or fp16
231
+ tl.store(p_u, b_u.to(p_u.dtype.element_ty, fp_downcast_rounding="rtne"), boundary_check=(0, 1))
232
+
233
+
234
+ def fwd_prepare_wy_repr(
235
+ ag: torch.Tensor,
236
+ v: torch.Tensor,
237
+ A_ak: torch.Tensor,
238
+ A_ab: torch.Tensor,
239
+ offsets: Optional[torch.LongTensor],
240
+ indices: Optional[torch.LongTensor],
241
+ head_first: bool = True,
242
+ chunk_size: int = 64
243
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
244
+ if head_first:
245
+ B, H, T, K = ag.shape
246
+ else:
247
+ B, T, H, K = ag.shape
248
+ BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
249
+
250
+ NT = triton.cdiv(T, BT) if offsets is None else len(indices)
251
+ BC = min(BT, 32)
252
+ fwd_fn = fwd_prepare_wy_repr_kernel_chunk64 if BT == 64 else fwd_prepare_wy_repr_kernel_chunk32
253
+ A_ab_inv = torch.empty_like(A_ab)
254
+ fwd_fn[(NT, B * H)](
255
+ A_ab=A_ab,
256
+ A_ab_inv=A_ab_inv,
257
+ offsets=offsets,
258
+ indices=indices,
259
+ T=T,
260
+ H=H,
261
+ BT=BT,
262
+ BC=BC,
263
+ HEAD_FIRST=head_first
264
+ )
265
+ w, u = fwd_wu(
266
+ ag=ag,
267
+ v=v,
268
+ A_ak=A_ak,
269
+ A_ab_inv=A_ab_inv,
270
+ offsets=offsets,
271
+ indices=indices,
272
+ head_first=head_first,
273
+ chunk_size=BT
274
+ )
275
+ return w, u, A_ab_inv
276
+
277
+
278
+ def fwd_wu(
279
+ ag: torch.Tensor,
280
+ v: torch.Tensor,
281
+ A_ak: torch.Tensor,
282
+ A_ab_inv: torch.Tensor,
283
+ offsets: Optional[torch.LongTensor],
284
+ indices: Optional[torch.LongTensor],
285
+ head_first: bool,
286
+ chunk_size: int
287
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
288
+ if head_first:
289
+ B, H, T, K, V = *ag.shape, v.shape[-1]
290
+ else:
291
+ B, T, H, K, V = *ag.shape, v.shape[-1]
292
+ BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
293
+
294
+ NT = triton.cdiv(T, BT) if offsets is None else len(indices)
295
+ BK = min(triton.next_power_of_2(K), 64)
296
+ BV = min(triton.next_power_of_2(V), 64)
297
+
298
+ u = torch.empty_like(v)
299
+ w = torch.empty_like(ag)
300
+ fwd_wu_kernel[(NT, B*H)](
301
+ ag=ag,
302
+ v=v,
303
+ A_ak=A_ak,
304
+ A_ab_inv=A_ab_inv,
305
+ w=w,
306
+ u=u,
307
+ offsets=offsets,
308
+ indices=indices,
309
+ T=T,
310
+ H=H,
311
+ K=K,
312
+ V=V,
313
+ BT=BT,
314
+ BK=BK,
315
+ BV=BV,
316
+ HEAD_FIRST=head_first
317
+ )
318
+ return w, u
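
For reference, a dense per-chunk sketch of what fwd_wu computes (hypothetical helper names; the masks mirror the kernel's inclusive and strict triangles):

import torch

def wy_fwd_wu_reference(ag, v, A_ak, A_ab_inv):
    # ag: [BT, K]; v: [BT, V]; A_ak, A_ab_inv: [BT, BT]
    BT = ag.shape[0]
    incl = torch.tril(torch.ones(BT, BT, dtype=torch.bool, device=ag.device))
    strict = torch.tril(incl, diagonal=-1)
    inv = torch.where(incl, A_ab_inv, 0.0)          # inclusive lower triangle (diagonal kept)
    ak = torch.where(strict, A_ak, 0.0)             # strictly lower triangle
    w = inv @ ag                                    # w = A_ab_inv @ ag
    u = (inv @ ak) @ v                              # u = (A_ab_inv @ A_ak) @ v
    return w, u
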
fla/ops/generalized_delta_rule/iplr/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from .chunk import chunk_iplr_delta_rule
2
+ from .fused_recurrent import fused_recurrent_iplr_delta_rule
3
+
4
+ __all__ = [
5
+ 'chunk_iplr_delta_rule',
6
+ 'fused_recurrent_iplr_delta_rule'
7
+ ]
fla/ops/generalized_delta_rule/iplr/__pycache__/chunk.cpython-312.pyc ADDED
Binary file (27 kB).
 
fla/ops/generalized_delta_rule/iplr/chunk.py ADDED
@@ -0,0 +1,528 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from fla.ops.common.chunk_delta_h import prepare_chunk_offsets
11
+ from fla.ops.generalized_delta_rule.iplr.wy_fast import fwd_prepare_wy_repr
12
+ from fla.utils import autocast_custom_bwd, autocast_custom_fwd, check_shared_mem, input_guard, use_cuda_graph
13
+
14
+ BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
15
+
16
+
17
+ @triton.heuristics({
18
+ 'USE_INITIAL_STATE': lambda args: args['h0'] is not None,
19
+ 'STORE_FINAL_STATE': lambda args: args['ht'] is not None,
20
+ 'USE_OFFSETS': lambda args: args['offsets'] is not None,
21
+ })
22
+ @triton.autotune(
23
+ configs=[
24
+ triton.Config({}, num_warps=num_warps)
25
+ for num_warps in [2, 4, 8, 16]
26
+ ],
27
+ key=['BT', 'BK', 'BV'],
28
+ use_cuda_graph=use_cuda_graph,
29
+ )
30
+ @triton.jit(do_not_specialize=['T'])
31
+ def chunk_generalized_iplr_delta_rule_fwd_kernel_h(
32
+ k,
33
+ v,
34
+ d,
35
+ b,
36
+ u,
37
+ v_new,
38
+ h,
39
+ h0,
40
+ ht,
41
+ offsets,
42
+ chunk_offsets,
43
+ T,
44
+ H: tl.constexpr,
45
+ K: tl.constexpr,
46
+ V: tl.constexpr,
47
+ BT: tl.constexpr,
48
+ BC: tl.constexpr,
49
+ BK: tl.constexpr,
50
+ BV: tl.constexpr,
51
+ NT: tl.constexpr,
52
+ USE_INITIAL_STATE: tl.constexpr,
53
+ STORE_FINAL_STATE: tl.constexpr,
54
+ USE_OFFSETS: tl.constexpr,
55
+ HEAD_FIRST: tl.constexpr,
56
+ ):
57
+ i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
58
+ i_n, i_h = i_nh // H, i_nh % H
59
+ if USE_OFFSETS:
60
+ bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
61
+ T = eos - bos
62
+ NT = tl.cdiv(T, BT)
63
+ boh = tl.load(chunk_offsets + i_n).to(tl.int32)
64
+ else:
65
+ bos, eos = i_n * T, i_n * T + T
66
+ NT = tl.cdiv(T, BT)
67
+ boh = i_n * NT
68
+
69
+ # [BK, BV]
70
+ b_h = tl.zeros([BK, BV], dtype=tl.float32)
71
+ if USE_INITIAL_STATE:
72
+ p_h0 = tl.make_block_ptr(h0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
73
+ b_h = tl.load(p_h0, boundary_check=(0, 1)).to(tl.float32)
74
+
75
+ for i_t in range(NT):
76
+ if HEAD_FIRST:
77
+ p_h = tl.make_block_ptr(h + (i_nh * NT + i_t) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
78
+ else:
79
+ p_h = tl.make_block_ptr(h + ((boh + i_t) * H + i_h) * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
80
+ tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1))
81
+ b_hc = tl.zeros([BK, BV], dtype=tl.float32)
82
+ # we need to keep the full DK dimension in SRAM, which is a severe memory burden; subchunking alleviates it
83
+ for i_c in range(tl.cdiv(min(BT, T - i_t * BT), BC)):
84
+ if HEAD_FIRST:
85
+ p_k = tl.make_block_ptr(k + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
86
+ p_b = tl.make_block_ptr(b + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
87
+ p_d = tl.make_block_ptr(d + i_nh * T*K, (T, K), (K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
88
+ p_v = tl.make_block_ptr(v + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
89
+ p_u = tl.make_block_ptr(u + i_nh * T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
90
+ p_v_new = tl.make_block_ptr(v_new+i_nh*T*V, (T, V), (V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
91
+ else:
92
+ p_k = tl.make_block_ptr(k+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
93
+ p_b = tl.make_block_ptr(b+(bos*H+i_h)*K, (K, T), (1, H*K), (i_k * BK, i_t * BT + i_c * BC), (BK, BC), (0, 1))
94
+ p_d = tl.make_block_ptr(d+(bos*H+i_h)*K, (T, K), (H*K, 1), (i_t * BT + i_c * BC, i_k * BK), (BC, BK), (1, 0))
95
+ p_v = tl.make_block_ptr(v+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
96
+ p_u = tl.make_block_ptr(u+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t * BT + i_c * BC, i_v * BV), (BC, BV), (1, 0))
97
+ p_v_new = tl.make_block_ptr(v_new+(bos*H+i_h)*V, (T, V), (H*V, 1), (i_t*BT+i_c*BC, i_v * BV), (BC, BV), (1, 0))
98
+ # [BK, BC]
99
+ b_k = tl.load(p_k, boundary_check=(0, 1))
100
+ b_v = tl.load(p_v, boundary_check=(0, 1))
101
+ b_d = tl.load(p_d, boundary_check=(0, 1))
102
+ b_b = tl.load(p_b, boundary_check=(0, 1))
103
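+ # v_new = u + d @ h: the same correction as in the DPLR kernel, without the gated decay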
+ b_v2 = tl.dot(b_d, b_h.to(b_d.dtype)) + tl.load(p_u, boundary_check=(0, 1))
104
+ b_hc += tl.dot(b_k, b_v)
105
+ b_hc += tl.dot(b_b, b_v2.to(b_k.dtype))
106
+ tl.store(p_v_new, b_v2.to(p_v_new.dtype.element_ty), boundary_check=(0, 1))
107
+ b_h += b_hc
108
+
109
+ if STORE_FINAL_STATE:
110
+ p_ht = tl.make_block_ptr(ht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
111
+ tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
112
+
113
+
+ @triton.heuristics({
+     'USE_OFFSETS': lambda args: args['offsets'] is not None,
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages)
+         for BK in BKV_LIST
+         for BV in BKV_LIST
+         for num_warps in [2, 4, 8]
+         for num_stages in [2, 3]
+     ],
+     key=['BT'],
+     use_cuda_graph=use_cuda_graph,
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def chunk_generalized_iplr_delta_rule_fwd_kernel_o(
+     q,
+     k,
+     v,
+     u,
+     b,
+     h,
+     o,
+     offsets,
+     indices,
+     scale,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BK: tl.constexpr,
+     BV: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr,
+ ):
+     i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+     i_b, i_h = i_bh // H, i_bh % H
+
+     if USE_OFFSETS:
+         i_tg = i_t
+         i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+         NT = tl.cdiv(T, BT)
+     else:
+         NT = tl.cdiv(T, BT)
+         i_tg = i_b * NT + i_t
+         bos, eos = i_b * T, i_b * T + T
+
+     # offset calculation
+     q += (i_bh * T * K) if HEAD_FIRST else ((bos * H + i_h) * K)
+     k += (i_bh * T * K) if HEAD_FIRST else ((bos * H + i_h) * K)
+     b += (i_bh * T * K) if HEAD_FIRST else ((bos * H + i_h) * K)
+     v += (i_bh * T * V) if HEAD_FIRST else ((bos * H + i_h) * V)
+     u += (i_bh * T * V) if HEAD_FIRST else ((bos * H + i_h) * V)
+     o += (i_bh * T * V) if HEAD_FIRST else ((bos * H + i_h) * V)
+     h += ((i_bh * NT + i_t) * K * V) if HEAD_FIRST else ((i_tg * H + i_h) * K * V)
+     stride_qk = K if HEAD_FIRST else H*K
+     stride_vo = V if HEAD_FIRST else H*V
+
+     b_o = tl.zeros([BT, BV], dtype=tl.float32)
+     b_Aqk = tl.zeros([BT, BT], dtype=tl.float32)
+     b_Aqb = tl.zeros([BT, BT], dtype=tl.float32)
+
+     for i_k in range(tl.cdiv(K, BK)):
+         p_q = tl.make_block_ptr(q, (T, K), (stride_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+         p_k = tl.make_block_ptr(k, (K, T), (1, stride_qk), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0))
+         p_b = tl.make_block_ptr(b, (K, T), (1, stride_qk), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         # [BT, BK]
+         b_q = tl.load(p_q, boundary_check=(0, 1))
+         # [BK, BT]
+         b_k = tl.load(p_k, boundary_check=(0, 1))
+         b_b = tl.load(p_b, boundary_check=(0, 1))
+         # [BK, BV]
+         b_h = tl.load(p_h, boundary_check=(0, 1))
+         # [BT, BK] @ [BK, BV] -> [BT, BV]
+         b_o += tl.dot(b_q, b_h)
+         # [BT, BK] @ [BK, BT] -> [BT, BT]
+         b_Aqk += tl.dot(b_q, b_k)
+         # [BT, BK] @ [BK, BT] -> [BT, BT]
+         b_Aqb += tl.dot(b_q, b_b)
+
+     o_i = tl.arange(0, BT)
+     m_A = o_i[:, None] >= o_i[None, :]
+     b_Aqk = tl.where(m_A, b_Aqk, 0)
+     b_Aqb = tl.where(m_A, b_Aqb, 0)
+
+     p_v = tl.make_block_ptr(v, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+     p_u = tl.make_block_ptr(u, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+     p_o = tl.make_block_ptr(o, (T, V), (stride_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+     b_v = tl.load(p_v, boundary_check=(0, 1))
+     b_u = tl.load(p_u, boundary_check=(0, 1))
+     b_o = (b_o + tl.dot(b_Aqk.to(b_v.dtype), b_v) + tl.dot(b_Aqb.to(b_u.dtype), b_u)) * scale
+     tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
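For the intra-chunk output above, a dense single-head reference may help; this is a hypothetical sketch (note that the mask `m_A` is inclusive of the diagonal, hence plain `torch.tril`):

```python
import torch

def chunk_output_reference(q, k, v, u, b, h, scale):
    # q, k, b: [C, K]; v, u: [C, V]; h: [K, V] state entering the chunk
    A_qk = torch.tril(q @ k.t())   # inclusive causal mask, as in m_A
    A_qb = torch.tril(q @ b.t())
    return (q @ h + A_qk @ v + A_qb @ u) * scale
```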
+ def chunk_generalized_iplr_delta_rule_fwd_o(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     v_new: torch.Tensor,
+     b: torch.Tensor,
+     h: torch.Tensor,
+     scale: Optional[float] = None,
+     offsets: Optional[torch.LongTensor] = None,
+     indices: Optional[torch.LongTensor] = None,
+     head_first: bool = True,
+     chunk_size: int = 64
+ ) -> torch.Tensor:
+     if head_first:
+         B, H, T, K, V = *q.shape, v.shape[-1]
+     else:
+         B, T, H, K, V = *q.shape, v.shape[-1]
+     if scale is None:
+         scale = k.shape[-1] ** -0.5
+     BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+     NT = triton.cdiv(T, BT) if offsets is None else len(indices)
+
+     o = torch.empty_like(v)
+
+     def grid(meta): return (
+         triton.cdiv(V, meta['BV']),
+         NT,
+         B * H
+     )
+     chunk_generalized_iplr_delta_rule_fwd_kernel_o[grid](
+         q=q,
+         k=k,
+         v=v,
+         u=v_new,
+         b=b,
+         h=h,
+         o=o,
+         offsets=offsets,
+         indices=indices,
+         scale=scale,
+         T=T,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         HEAD_FIRST=head_first
+     )
+     return o
+
+
+ def chunk_generalized_iplr_delta_rule_fwd_h(
+     k: torch.Tensor,
+     v: torch.Tensor,
+     w: torch.Tensor,
+     u: torch.Tensor,
+     b: torch.Tensor,
+     initial_state: Optional[torch.Tensor] = None,
+     output_final_state: bool = False,
+     offsets: Optional[torch.LongTensor] = None,
+     indices: Optional[torch.LongTensor] = None,
+     head_first: bool = True,
+     chunk_size: int = 64
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     if head_first:
+         B, H, T, K, V = *k.shape, u.shape[-1]
+     else:
+         B, T, H, K, V = *k.shape, u.shape[-1]
+     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+     # N: the actual number of sequences in the batch, with either equal or variable lengths
+     if offsets is None:
+         N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+     else:
+         N, NT, chunk_offsets = len(offsets) - 1, len(indices), prepare_chunk_offsets(offsets, BT)
+
+     BK = triton.next_power_of_2(K)
+     assert BK <= 256, "current kernel does not support head dimension larger than 256."
+     # H100 can afford larger block sizes
+     if check_shared_mem('hopper', k.device.index):
+         BV = 64
+         BC = 64 if K <= 128 else 32
+     elif check_shared_mem('ampere', k.device.index):  # A100
+         BV = 32
+         BC = 32
+     else:
+         BV = 16
+         BC = 16
+
+     BC = min(BT, BC)
+     NK = triton.cdiv(K, BK)
+     NV = triton.cdiv(V, BV)
+
+     assert NK == 1, 'NK > 1 is not supported because it involves time-consuming synchronization'
+
+     if head_first:
+         h = k.new_empty(B, H, NT, K, V)
+     else:
+         h = k.new_empty(B, NT, H, K, V)
+     final_state = k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None
+
+     v_new = torch.empty_like(u)
+     grid = (NK, NV, N * H)
+
+     chunk_generalized_iplr_delta_rule_fwd_kernel_h[grid](
+         k=k,
+         v=v,
+         d=w,
+         b=b,
+         u=u,
+         v_new=v_new,
+         h=h,
+         h0=initial_state,
+         ht=final_state,
+         offsets=offsets,
+         chunk_offsets=chunk_offsets,
+         T=T,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         BC=BC,
+         BK=BK,
+         BV=BV,
+         NT=NT,
+         HEAD_FIRST=head_first
+     )
+     return h, v_new, final_state
+
+
+ def chunk_generalized_iplr_delta_rule_fwd(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     a: torch.Tensor,
+     b: torch.Tensor,
+     scale: float,
+     initial_state: torch.Tensor,
+     output_final_state: bool,
+     offsets: Optional[torch.LongTensor] = None,
+     indices: Optional[torch.LongTensor] = None,
+     head_first: bool = True,
+     chunk_size: int = 64
+ ):
+     T = q.shape[2] if head_first else q.shape[1]
+     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+     w, u, _ = fwd_prepare_wy_repr(
+         a=a,
+         b=b,
+         k=k,
+         v=v,
+         offsets=offsets,
+         indices=indices,
+         head_first=head_first,
+         chunk_size=BT
+     )
+
+     h, v_new, final_state = chunk_generalized_iplr_delta_rule_fwd_h(
+         k=k,
+         v=v,
+         b=b,
+         w=w,
+         u=u,
+         initial_state=initial_state,
+         output_final_state=output_final_state,
+         offsets=offsets,
+         indices=indices,
+         head_first=head_first,
+         chunk_size=BT
+     )
+     o = chunk_generalized_iplr_delta_rule_fwd_o(
+         q=q,
+         k=k,
+         v=v,
+         v_new=v_new,
+         b=b,
+         h=h,
+         scale=scale,
+         offsets=offsets,
+         indices=indices,
+         head_first=head_first,
+         chunk_size=BT
+     )
+     return o, final_state
+
+
+ class ChunkGeneralizedIPLRDeltaRuleFunction(torch.autograd.Function):
+
+     @staticmethod
+     @input_guard
+     @autocast_custom_fwd
+     def forward(
+         ctx,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         a: torch.Tensor,
+         b: torch.Tensor,
+         scale: float,
+         initial_state: torch.Tensor,
+         output_final_state: bool,
+         offsets: Optional[torch.LongTensor] = None,
+         head_first: bool = True
+     ):
+         chunk_size = 64
+
+         # 2-d indices denoting the offsets of chunks in each sequence
+         # for example, if the passed `offsets` is [0, 100, 356] and `chunk_size` is 64,
+         # then there are 2 and 4 chunks in the 1st and 2nd sequences respectively, and `indices` will be
+         # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]]
+         indices = None
+         if offsets is not None:
+             indices = torch.cat([torch.arange(n) for n in triton.cdiv(offsets[1:] - offsets[:-1], chunk_size).tolist()])
+             indices = torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(offsets)
+
+         o, final_state = chunk_generalized_iplr_delta_rule_fwd(
+             q=q,
+             k=k,
+             v=v,
+             a=a,
+             b=b,
+             scale=scale,
+             initial_state=initial_state,
+             output_final_state=output_final_state,
+             offsets=offsets,
+             indices=indices,
+             head_first=head_first,
+             chunk_size=chunk_size
+         )
+         return o.to(q.dtype), final_state
+
+     @staticmethod
+     @input_guard
+     @autocast_custom_bwd
+     def backward(
+         ctx,
+         do: torch.Tensor,
+         dht: torch.Tensor
+     ):
+         raise NotImplementedError(
+             "Backward pass for ChunkGeneralizedIPLRDeltaRuleFunction is not implemented yet. "
+             "Stay tuned!"
+         )
+
+
+ @torch.compiler.disable
+ def chunk_iplr_delta_rule(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     v: torch.Tensor,
+     a: torch.Tensor,
+     b: torch.Tensor,
+     scale: Optional[float] = None,
+     initial_state: Optional[torch.Tensor] = None,
+     output_final_state: bool = False,
+     cu_seqlens: Optional[torch.LongTensor] = None,
+     head_first: bool = True
+ ):
+     r"""
+     Args:
+         q (torch.Tensor):
+             queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
+         k (torch.Tensor):
+             keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
+         v (torch.Tensor):
+             values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
+         a (torch.Tensor):
+             the `alpha` term of the identity-plus-low-rank (IPLR) transition,
+             of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
+         b (torch.Tensor):
+             the `beta` term of the identity-plus-low-rank (IPLR) transition,
+             of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
+         scale (Optional[float]):
+             Scale factor for attention scores.
+             If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+         initial_state (Optional[torch.Tensor]):
+             Initial state of shape `[N, H, K, V]` for `N` input sequences.
+             For equal-length input sequences, `N` equals the batch size `B`.
+             Default: `None`.
+         output_final_state (Optional[bool]):
+             Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
+         cu_seqlens (torch.LongTensor):
+             Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+             consistent with the FlashAttention API.
+         head_first (Optional[bool]):
+             Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
+             Default: `True`.
+
+     Returns:
+         o (torch.Tensor):
+             Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
+         final_state (torch.Tensor):
+             Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
+     """
+     assert q.dtype == k.dtype == v.dtype
+     assert q.dtype != torch.float32, "ChunkGeneralizedIPLRDeltaRuleFunction does not support float32. Please use bfloat16."
+
+     if cu_seqlens is not None:
+         if q.shape[0] != 1:
+             raise ValueError(f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`. "
+                              f"Please flatten variable-length inputs before processing.")
+         if head_first:
+             raise RuntimeError("Sequences with variable lengths are not supported for head-first mode.")
+         if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
+             raise ValueError(f"The number of initial states is expected to be equal to the number of input sequences, "
+                              f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}.")
+     scale = k.shape[-1] ** -0.5 if scale is None else scale
+     o, final_state = ChunkGeneralizedIPLRDeltaRuleFunction.apply(
+         q,
+         k,
+         v,
+         a,
+         b,
+         scale,
+         initial_state,
+         output_final_state,
+         cu_seqlens,
+         head_first
+     )
+     return o, final_state
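A hedged usage sketch of the entry point above; the shapes, device, and import path (assuming the package re-exports the function) are illustrative only, and `a`/`b` are random here purely for shape illustration:

```python
import torch
from fla.ops.generalized_delta_rule import chunk_iplr_delta_rule  # assumed re-export

B, H, T, K, V = 2, 4, 256, 64, 64
q, k, a, b = (torch.randn(B, H, T, K, dtype=torch.bfloat16, device='cuda') for _ in range(4))
v = torch.randn(B, H, T, V, dtype=torch.bfloat16, device='cuda')
o, final_state = chunk_iplr_delta_rule(q, k, v, a, b, output_final_state=True, head_first=True)
```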
fla/ops/generalized_delta_rule/iplr/naive.py ADDED
@@ -0,0 +1,69 @@
+ # -*- coding: utf-8 -*-
+
+ import torch
+ from einops import rearrange
+
+
+ # S_t = S_t @ (I + alpha_t beta_t^T) + v_t k_t^T
+ # q, k, alpha, beta [B, H, L, D_K]
+ # v [B, H, L, D_V]
+ def iplr_recurrence(q, k, v, alpha, beta, initial_state=None, output_final_state=True):
+     orig_dtype = q.dtype
+     b, h, l, d_k = q.shape
+     q, k, v, alpha, beta = map(lambda x: x.float(), [q, k, v, alpha, beta])
+     d_v = v.shape[-1]
+     o = torch.zeros_like(v)
+     S = torch.zeros(b, h, d_k, d_v).to(v)
+     q = q * (d_k ** -0.5)
+
+     if initial_state is not None:
+         S += initial_state
+
+     for i in range(l):
+         _k = k[:, :, i]
+         _q = q[:, :, i]
+         _v = v[:, :, i]
+         _alpha = alpha[:, :, i]
+         _beta = beta[:, :, i]
+         _kv = _k[..., None] * _v[..., None, :] + (S.clone() * _alpha[..., None]).sum(-2, keepdim=True) * _beta[..., None]
+         S = S + _kv
+         o[:, :, i] = torch.einsum('bhd,bhdm->bhm', _q, S)
+     S = None if output_final_state is False else S
+     return o.to(orig_dtype), S
+
+
+ def iplr_chunkwise(q, k, v, alpha, beta, initial_state=None, output_final_state=True, chunk_size=32):
+     b, h, l, d_k = q.shape
+     d_v = v.shape[-1]
+     q = q * (d_k ** -0.5)
+     assert l % chunk_size == 0
+
+     S = k.new_zeros(b, h, d_k, d_v)
+     if initial_state is not None:
+         S += initial_state
+
+     # note that the diagonal is masked as well
+     mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=q.device), diagonal=0)
+     q, k, v, alpha, beta = map(lambda x: rearrange(x, 'b h (n c) d -> b h n c d', c=chunk_size), [q, k, v, alpha, beta])
+
+     v2 = (alpha @ k.transpose(-1, -2)).masked_fill_(mask, 0) @ v
+     attn = (alpha @ beta.transpose(-1, -2)).masked_fill(mask, 0)
+     for i in range(1, chunk_size):
+         attn[..., i, :i] = attn[..., i, :i] + (attn[..., i, :, None].clone() * attn[..., :, :i].clone()).sum(-2)
+
+     attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=q.device)
+     u = attn @ v2
+     w = attn @ alpha
+     o = torch.zeros_like(v)
+     mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=q.device), diagonal=1)
+     for i in range(0, l // chunk_size):
+         q_i, k_i, v_i, u_i, w_i, beta_i = q[:, :, i], k[:, :, i], v[:, :, i], u[:, :, i], w[:, :, i], beta[:, :, i]
+         o_1 = (q_i @ k_i.transpose(-1, -2)).masked_fill_(mask, 0) @ v_i
+         v2_i = u_i + w_i @ S
+         o_2 = (q_i @ beta_i.transpose(-1, -2)).masked_fill_(mask, 0) @ v2_i
+         o_3 = q_i @ S
+         o[:, :, i] = o_1 + o_2 + o_3
+         S = S + k_i.transpose(-1, -2) @ v_i + beta_i.transpose(-1, -2) @ v2_i
+     S = None if output_final_state is False else S
+     return rearrange(o, 'b h n c d -> b h (n c) d'), S
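The two references above should agree up to floating-point tolerance; a hypothetical sanity check, with `alpha`/`beta` scaled down to keep the recurrence well-conditioned:

```python
import torch

B, H, L, DK, DV = 1, 2, 64, 16, 16
torch.manual_seed(0)
q, k = torch.randn(B, H, L, DK), torch.randn(B, H, L, DK)
alpha, beta = 0.1 * torch.randn(B, H, L, DK), 0.1 * torch.randn(B, H, L, DK)
v = torch.randn(B, H, L, DV)
o1, S1 = iplr_recurrence(q, k, v, alpha, beta)
o2, S2 = iplr_chunkwise(q, k, v, alpha, beta, chunk_size=32)
assert torch.allclose(o1, o2, atol=1e-3) and torch.allclose(S1, S2, atol=1e-3)
```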
fla/ops/generalized_delta_rule/iplr/wy_fast.py ADDED
@@ -0,0 +1,338 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+ from typing import Optional, Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+
+ from fla.utils import check_shared_mem, is_nvidia_hopper
+
+ NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]
+
+
+ @triton.heuristics({
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({}, num_warps=num_warps)
+         for num_warps in [1, 2, 4, 8, 16]
+     ],
+     key=['BK']
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def fwd_prepare_wy_repr_kernel_chunk32(
+     a,
+     b,
+     A,
+     offsets,
+     indices,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     BT: tl.constexpr,
+     BK: tl.constexpr,
+     BC: tl.constexpr,  # dummy placeholder
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr,
+ ):
+     i_t, i_bh = tl.program_id(0), tl.program_id(1)
+     i_b, i_h = i_bh // H, i_bh % H
+     if USE_OFFSETS:
+         i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+     else:
+         bos, eos = i_b * T, i_b * T + T
+
+     b_A = tl.zeros([BT, BT], dtype=tl.float32)
+     for i_k in range(tl.cdiv(K, BK)):
+         if HEAD_FIRST:
+             p_a = tl.make_block_ptr(a + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+             p_b = tl.make_block_ptr(b + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         else:
+             p_a = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+             p_b = tl.make_block_ptr(b + (bos * H + i_h) * K, (K, T), (1, K*H), (i_k * BK, i_t * BT), (BK, BT), (0, 1))
+         b_a = tl.load(p_a, boundary_check=(0, 1))
+         b_b = tl.load(p_b, boundary_check=(0, 1))
+         b_A += tl.dot(b_a, b_b)
+
+     b_A = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_A, 0)
+     for i in range(1, BT):
+         mask = tl.arange(0, BT) == i
+         b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
+         b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) * (tl.arange(0, BT) < i)
+         b_A = tl.where(mask[:, None], b_a, b_A)
+     b_A += tl.arange(0, BT)[:, None] == tl.arange(0, BT)[None, :]
+
+     if HEAD_FIRST:
+         p_A = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
+     else:
+         p_A = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
+     tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
+
+
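The row-by-row loop in the kernel above is a forward substitution; as a dense reference (a hypothetical single-chunk sketch, not part of the diff), it produces the inverse of `I - strictly_lower(a @ b^T)`:

```python
import torch

def wy_inverse_reference(a, b):
    # a, b: [C, K] for one chunk; returns the [C, C] block the kernel stores into A
    C = a.shape[0]
    strict_lower = torch.tril(a @ b.t(), diagonal=-1)
    return torch.linalg.inv(torch.eye(C) - strict_lower)
```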
+ @triton.heuristics({
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({}, num_warps=num_warps)
+         for num_warps in [1, 2, 4, 8, 16]
+     ],
+     key=['BK']
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def fwd_prepare_wy_repr_kernel_chunk64(
+     a,
+     b,
+     A,
+     offsets,
+     indices,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     BT: tl.constexpr,
+     BK: tl.constexpr,
+     BC: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr
+ ):
+     i_t, i_bh = tl.program_id(0), tl.program_id(1)
+     i_b, i_h = i_bh // H, i_bh % H
+     if USE_OFFSETS:
+         i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+     else:
+         bos, eos = i_b * T, i_b * T + T
+
+     b_A = tl.zeros([BC, BC], dtype=tl.float32)
+     b_A2 = tl.zeros([BC, BC], dtype=tl.float32)
+     b_A3 = tl.zeros([BC, BC], dtype=tl.float32)
+
+     for i_k in range(tl.cdiv(K, BK)):
+         if HEAD_FIRST:
+             p_a1 = tl.make_block_ptr(a + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0))
+             p_a2 = tl.make_block_ptr(a + i_bh * T*K, (T, K), (K, 1), (i_t * BT + BC, i_k * BK), (BC, BK), (1, 0))
+             p_b1 = tl.make_block_ptr(b + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BC), (0, 1))
+             p_b2 = tl.make_block_ptr(b + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + BC), (BK, BC), (0, 1))
+         else:
+             p_a1 = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BC, BK), (1, 0))
+             p_a2 = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT + BC, i_k * BK), (BC, BK), (1, 0))
+             p_b1 = tl.make_block_ptr(b + (bos * H + i_h) * K, (K, T), (1, K*H), (i_k * BK, i_t * BT), (BK, BC), (0, 1))
+             p_b2 = tl.make_block_ptr(b + (bos * H + i_h) * K, (K, T), (1, K*H), (i_k * BK, i_t * BT + BC), (BK, BC), (0, 1))
+         b_a1 = tl.load(p_a1, boundary_check=(0, 1))
+         b_a2 = tl.load(p_a2, boundary_check=(0, 1))
+         b_b1 = tl.load(p_b1, boundary_check=(0, 1))
+         b_b2 = tl.load(p_b2, boundary_check=(0, 1))
+         b_A += tl.dot(b_a1, b_b1, allow_tf32=False)
+         b_A2 += tl.dot(b_a2, b_b2, allow_tf32=False)
+         b_A3 += tl.dot(b_a2, b_b1, allow_tf32=False)
+
+     b_A = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A, 0)
+     b_A2 = tl.where(tl.arange(0, BC)[:, None] > tl.arange(0, BC)[None, :], b_A2, 0)
+
+     for i in range(1, BC):
+         mask = tl.arange(0, BC) == i
+         b_a = tl.sum(tl.where(mask[:, None], b_A, 0), 0)
+         b_a2 = tl.sum(tl.where(mask[:, None], b_A2, 0), 0)
+         b_a = b_a + tl.sum(b_a[:, None] * b_A, 0) * (tl.arange(0, BC) < i)
+         b_a2 = b_a2 + tl.sum(b_a2[:, None] * b_A2, 0) * (tl.arange(0, BC) < i)
+         b_A = tl.where(mask[:, None], b_a, b_A)
+         b_A2 = tl.where(mask[:, None], b_a2, b_A2)
+
+     # blockwise computation of the lower triangular matrix's inverse,
+     # i.e., [A11, 0; A21, A22]^-1 = [A11^-1, 0; -A22^-1 A21 A11^-1, A22^-1]
+     b_A += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
+     b_A2 += tl.arange(0, BC)[:, None] == tl.arange(0, BC)[None, :]
+     b_A3 = tl.dot(tl.dot(b_A2, b_A3, allow_tf32=False), b_A, allow_tf32=False)
+
+     if HEAD_FIRST:
+         p_A1 = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
+         p_A2 = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
+         p_A3 = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
+         p_A4 = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, BC), (BC, BC), (1, 0))
+     else:
+         p_A1 = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BC, BC), (1, 0))
+         p_A2 = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, BC), (BC, BC), (1, 0))
+         p_A3 = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT + BC, 0), (BC, BC), (1, 0))
+         p_A4 = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, BC), (BC, BC), (1, 0))
+     tl.store(p_A1, b_A.to(p_A1.dtype.element_ty), boundary_check=(0, 1))
+     tl.store(p_A2, b_A2.to(p_A2.dtype.element_ty), boundary_check=(0, 1))
+     tl.store(p_A3, b_A3.to(p_A3.dtype.element_ty), boundary_check=(0, 1))
+     # zero out the upper-right block to keep the stored inverse causal
+     tl.store(p_A4, tl.zeros([BC, BC], dtype=tl.float32).to(p_A4.dtype.element_ty), boundary_check=(0, 1))
+
+
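The sign in the block formula works out because the matrix actually being inverted is `I - A`, whose lower-left block is `-A21`; a hypothetical dense check of the identity the kernel relies on:

```python
import torch

C = 16
A = torch.tril(0.1 * torch.randn(2 * C, 2 * C), diagonal=-1)
full_inv = torch.linalg.inv(torch.eye(2 * C) - A)    # inverse over the whole chunk
T1 = torch.linalg.inv(torch.eye(C) - A[:C, :C])      # b_A after the identity is added
T2 = torch.linalg.inv(torch.eye(C) - A[C:, C:])      # b_A2 after the identity is added
# the lower-left block equals T2 @ A21 @ T1, matching b_A3 = dot(dot(b_A2, b_A3), b_A)
assert torch.allclose(full_inv[C:, :C], T2 @ A[C:, :C] @ T1, atol=1e-5)
```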
+ @triton.heuristics({
+     'USE_OFFSETS': lambda args: args['offsets'] is not None
+ })
+ @triton.autotune(
+     configs=[
+         triton.Config({}, num_warps=num_warps)
+         for num_warps in NUM_WARPS
+     ],
+     key=['BT', 'BK', 'BV']
+ )
+ @triton.jit(do_not_specialize=['T'])
+ def fwd_wu_kernel(
+     w,
+     u,
+     a,
+     k,
+     v,
+     A,
+     offsets,
+     indices,
+     T,
+     H: tl.constexpr,
+     K: tl.constexpr,
+     V: tl.constexpr,
+     BT: tl.constexpr,
+     BK: tl.constexpr,
+     BV: tl.constexpr,
+     USE_OFFSETS: tl.constexpr,
+     HEAD_FIRST: tl.constexpr
+ ):
+     i_t, i_bh = tl.program_id(0), tl.program_id(1)
+     i_b, i_h = i_bh // H, i_bh % H
+     if USE_OFFSETS:
+         i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32)
+         bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32)
+         T = eos - bos
+     else:
+         bos, eos = i_b * T, i_b * T + T
+
+     if HEAD_FIRST:
+         p_A = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
+     else:
+         p_A = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (H*BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
+
+     b_A = tl.load(p_A, boundary_check=(0, 1))
+     b_Aak = tl.zeros([BT, BT], dtype=tl.float32)
+
+     for i_k in range(tl.cdiv(K, BK)):
+         if HEAD_FIRST:
+             p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+             p_a = tl.make_block_ptr(a + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+             p_w = tl.make_block_ptr(w + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+         else:
+             p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+             p_a = tl.make_block_ptr(a + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+             p_w = tl.make_block_ptr(w + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+         b_k = tl.load(p_k, boundary_check=(0, 1))
+         b_a = tl.load(p_a, boundary_check=(0, 1))
+         b_w = tl.dot(b_A, b_a)
+         b_Aak += tl.dot(b_a, tl.trans(b_k))
+         tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))
+
+     b_Aak = tl.where(tl.arange(0, BT)[:, None] > tl.arange(0, BT)[None, :], b_Aak, 0)
+     b_Aak = b_Aak.to(k.dtype.element_ty)
+
+     for i_v in range(tl.cdiv(V, BV)):
+         if HEAD_FIRST:
+             p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+             p_u = tl.make_block_ptr(u + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+         else:
+             p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+             p_u = tl.make_block_ptr(u + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+         b_v = tl.load(p_v, boundary_check=(0, 1))
+         b_v = tl.dot(b_Aak, b_v).to(v.dtype.element_ty)
+         b_u = tl.dot(b_A, b_v)
+         tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
+
+
+ def fwd_prepare_wy_repr(
+     a: torch.Tensor,
+     b: torch.Tensor,
+     v: torch.Tensor,
+     k: torch.Tensor,
+     offsets: Optional[torch.LongTensor],
+     indices: Optional[torch.LongTensor],
+     head_first: bool = True,
+     chunk_size: int = 64
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     if head_first:
+         B, H, T, K = a.shape
+     else:
+         B, T, H, K = a.shape
+     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+     NT = triton.cdiv(T, BT) if offsets is None else len(indices)
+     BC = min(BT, 32)
+     BK = min(triton.next_power_of_2(K), 64)
+
+     A = torch.empty(B, *((H, T) if head_first else (T, H)), BT, device=a.device, dtype=a.dtype)
+     fwd_fn = fwd_prepare_wy_repr_kernel_chunk64 if BT == 64 else fwd_prepare_wy_repr_kernel_chunk32
+
+     fwd_fn[(NT, B * H)](
+         a=a,
+         b=b,
+         A=A,
+         offsets=offsets,
+         indices=indices,
+         T=T,
+         H=H,
+         K=K,
+         BT=BT,
+         BK=BK,
+         BC=BC,
+         HEAD_FIRST=head_first
+     )
+     w, u = fwd_wu(
+         a=a,
+         v=v,
+         k=k,
+         A=A,
+         offsets=offsets,
+         indices=indices,
+         head_first=head_first,
+         chunk_size=chunk_size
+     )
+     return w, u, A
+
+
+ def fwd_wu(
+     a: torch.Tensor,
+     v: torch.Tensor,
+     k: torch.Tensor,
+     A: torch.Tensor,
+     offsets: Optional[torch.LongTensor],
+     indices: Optional[torch.LongTensor],
+     head_first: bool,
+     chunk_size: int
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     if head_first:
+         B, H, T, K, V = *a.shape, v.shape[-1]
+     else:
+         B, T, H, K, V = *a.shape, v.shape[-1]
+     BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
+     NT = triton.cdiv(T, BT) if offsets is None else len(indices)
+     CONST_TILING = 64 if check_shared_mem() else 32
+     BK = min(triton.next_power_of_2(K), CONST_TILING)
+     BV = min(triton.next_power_of_2(V), CONST_TILING)
+
+     u = torch.empty_like(v)
+     w = torch.empty_like(a)
+     fwd_wu_kernel[(NT, B*H)](
+         a=a,
+         v=v,
+         w=w,
+         u=u,
+         A=A,
+         k=k,
+         offsets=offsets,
+         indices=indices,
+         T=T,
+         H=H,
+         K=K,
+         V=V,
+         BT=BT,
+         BK=BK,
+         BV=BV,
+         HEAD_FIRST=head_first
+     )
+     return w, u
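In dense terms, `fwd_wu` computes `w = T @ a` and `u = T @ (strictly_lower(a @ k^T) @ v)` per chunk, with `T` the chunk-local inverse stored in `A` by `fwd_prepare_wy_repr`; a minimal single-head sketch under assumed shapes:

```python
import torch

def wu_reference(a, k, v, T):
    # a, k: [C, K]; v: [C, V]; T: [C, C] chunk-local inverse from fwd_prepare_wy_repr
    w = T @ a
    u = T @ (torch.tril(a @ k.t(), diagonal=-1) @ v)
    return w, u
```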