Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- fla/models/abc/configuration_abc.py +91 -0
- fla/models/forgetting_transformer/__init__.py +16 -0
- fla/models/gla/__init__.py +13 -0
- fla/models/nsa/__init__.py +15 -0
- fla/models/transformer_top/configuration_transformer.py +76 -0
- fla/modules/__pycache__/activations.cpython-312.pyc +0 -0
- fla/modules/__pycache__/convolution.cpython-312.pyc +0 -0
- fla/modules/__pycache__/fused_linear_cross_entropy.cpython-312.pyc +0 -0
- fla/modules/__pycache__/layernorm.cpython-312.pyc +0 -0
- fla/ops/common/__pycache__/chunk_scaled_dot_kkt.cpython-312.pyc +0 -0
- fla/ops/delta_rule/__pycache__/__init__.cpython-312.pyc +0 -0
- fla/ops/delta_rule/__pycache__/chunk.cpython-312.pyc +0 -0
- fla/ops/delta_rule/__pycache__/fused_chunk.cpython-312.pyc +0 -0
- fla/ops/gated_delta_rule/__pycache__/chunk.cpython-312.pyc +0 -0
- fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_A_bwd.cpython-312.pyc +0 -0
- fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_o_bwd.cpython-312.pyc +0 -0
- fla/ops/generalized_delta_rule/dplr/chunk.py +388 -0
- fla/ops/generalized_delta_rule/iplr/__pycache__/fused_recurrent.cpython-312.pyc +0 -0
- fla/ops/generalized_delta_rule/iplr/__pycache__/wy_fast.cpython-312.pyc +0 -0
- fla/ops/gsa/__pycache__/chunk.cpython-312.pyc +0 -0
- fla/ops/hgrn/__pycache__/chunk.cpython-312.pyc +0 -0
- fla/ops/hgrn/__pycache__/fused_recurrent.cpython-312.pyc +0 -0
- fla/ops/ttt/__pycache__/chunk.cpython-312.pyc +0 -0
- fla/ops/utils/__pycache__/__init__.cpython-312.pyc +0 -0
- logs/none_enyj3lod/attempt_0/3/stderr.log +0 -0
- profile_trace/iteration_17408/rank4_trace.json +0 -0
- profile_trace/iteration_18944/rank2_trace.json +0 -0
- profile_trace/iteration_25088/rank3_trace.json +0 -0
- profile_trace/iteration_25088/rank7_trace.json +0 -0
- profile_trace/iteration_33280/rank6_trace.json +0 -0
- profile_trace/iteration_34816/rank5_trace.json +0 -0
- profile_trace/iteration_38912/rank1_trace.json +0 -0
- profile_trace/iteration_38912/rank2_trace.json +0 -0
- profile_trace/iteration_7680/rank0_trace.json +0 -0
- profile_trace/iteration_7680/rank4_trace.json +0 -0
- torchtitan/components/dataloader.py +92 -0
- torchtitan/components/float8.py +150 -0
- torchtitan/components/optimizer.py +303 -0
- torchtitan/datasets/__pycache__/hf_datasets.cpython-312.pyc +0 -0
- torchtitan/datasets/hf_datasets.py +173 -0
- torchtitan/datasets/tokenizer/__pycache__/tiktoken.cpython-312.pyc +0 -0
- torchtitan/datasets/tokenizer/tiktoken.py +190 -0
- torchtitan/distributed/__pycache__/__init__.cpython-312.pyc +0 -0
- torchtitan/distributed/__pycache__/utils.cpython-312.pyc +0 -0
- torchtitan/experiments/deepseek_v3/inference.sh +15 -0
- torchtitan/experiments/deepseek_v3/model_config.py +204 -0
- torchtitan/experiments/flux/README.md +23 -0
- torchtitan/experiments/flux/__pycache__/parallelize_flux.cpython-312.pyc +0 -0
- torchtitan/experiments/flux/flux_argparser.py +42 -0
- torchtitan/experiments/flux/loss.py +27 -0
fla/models/abc/configuration_abc.py
ADDED
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-

from typing import Dict, Optional

from transformers.configuration_utils import PretrainedConfig


class ABCConfig(PretrainedConfig):

    model_type = 'abc'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        hidden_size: int = 2048,
        gate_low_rank_dim: int = 16,
        clamp_min: float = -32,
        clamp_max: float = 32,
        hidden_ratio: Optional[int] = 4,
        intermediate_size: Optional[int] = None,
        num_hidden_layers: int = 24,
        num_heads: int = 4,
        num_slots: Optional[int] = 64,
        use_short_conv: bool = False,
        conv_size: int = 4,
        expand_k: float = 0.5,
        expand_v: float = 1,
        hidden_act: str = "swish",
        max_position_embeddings: int = 2048,
        elementwise_affine: Optional[bool] = True,
        norm_eps: float = 1e-6,
        use_rope: bool = True,
        attn: Optional[Dict] = None,
        use_cache: bool = True,
        pad_token_id: int = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        initializer_range: float = 0.006,
        fuse_norm: bool = True,
        fuse_swiglu: bool = True,
        fuse_cross_entropy: bool = True,
        vocab_size: int = 32000,
        **kwargs
    ):
        self.hidden_size = hidden_size
        self.gate_low_rank_dim = gate_low_rank_dim
        self.clamp_min = clamp_min
        self.clamp_max = clamp_max
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.num_slots = num_slots
        self.use_short_conv = use_short_conv
        self.conv_size = conv_size
        self.expand_k = expand_k
        self.expand_v = expand_v
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.elementwise_affine = elementwise_affine
        self.norm_eps = norm_eps
        self.use_rope = use_rope
        self.attn = attn
        self.use_cache = use_cache
        self.initializer_range = initializer_range

        self.fuse_norm = fuse_norm
        self.fuse_swiglu = fuse_swiglu
        self.fuse_cross_entropy = fuse_cross_entropy
        self.vocab_size = vocab_size

        if attn is not None:
            if not isinstance(attn, Dict):
                raise ValueError("attn must be a dictionary")
            if 'layers' not in attn:
                raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
            if 'num_heads' not in attn:
                raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
            attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads'])
            attn['qkv_bias'] = attn.get('qkv_bias', False)
            attn['window_size'] = attn.get('window_size', None)
            attn['rope_theta'] = attn.get('rope_theta', 10000.)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
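
Aside: a minimal usage sketch for the config above, assuming the `fla` package is importable; the sizes and the hybrid-attention dict below are illustrative values, not part of this commit.

from fla.models.abc.configuration_abc import ABCConfig

# Illustrative small config; `attn` must carry `layers` and `num_heads`,
# and the constructor backfills `num_kv_heads`, `qkv_bias`, `window_size`
# and `rope_theta` with defaults.
config = ABCConfig(
    hidden_size=512,
    num_hidden_layers=4,
    num_heads=4,
    attn={'layers': [1, 3], 'num_heads': 8},
)
assert config.attn['num_kv_heads'] == 8  # defaulted from attn['num_heads']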
fla/models/forgetting_transformer/__init__.py
ADDED
@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-

from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from fla.models.forgetting_transformer.configuration_forgetting_transformer import ForgettingTransformerConfig
from fla.models.forgetting_transformer.modeling_forgetting_transformer import (
    ForgettingTransformerForCausalLM,
    ForgettingTransformerModel
)

AutoConfig.register(ForgettingTransformerConfig.model_type, ForgettingTransformerConfig)
AutoModel.register(ForgettingTransformerConfig, ForgettingTransformerModel)
AutoModelForCausalLM.register(ForgettingTransformerConfig, ForgettingTransformerForCausalLM)


__all__ = ['ForgettingTransformerConfig', 'ForgettingTransformerForCausalLM', 'ForgettingTransformerModel']
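
These three `register` calls are what let the generic `transformers` Auto classes resolve the model by its `model_type` string (the same pattern is used by the `gla` and `nsa` packages below). A hedged sketch of the effect, with tiny illustrative sizes:

from transformers import AutoConfig, AutoModelForCausalLM

import fla.models.forgetting_transformer  # noqa: F401 -- runs the registrations above

# After registration, the Auto classes can build the model from its
# model_type alone.
config = AutoConfig.for_model('forgetting_transformer', num_hidden_layers=2, hidden_size=256)
model = AutoModelForCausalLM.from_config(config)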
fla/models/gla/__init__.py
ADDED
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-

from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from fla.models.gla.configuration_gla import GLAConfig
from fla.models.gla.modeling_gla import GLAForCausalLM, GLAModel

AutoConfig.register(GLAConfig.model_type, GLAConfig)
AutoModel.register(GLAConfig, GLAModel)
AutoModelForCausalLM.register(GLAConfig, GLAForCausalLM)


__all__ = ['GLAConfig', 'GLAForCausalLM', 'GLAModel']
fla/models/nsa/__init__.py
ADDED
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from fla.models.nsa.configuration_nsa import NSAConfig
from fla.models.nsa.modeling_nsa import NSAForCausalLM, NSAModel

AutoConfig.register(NSAConfig.model_type, NSAConfig)
AutoModel.register(NSAConfig, NSAModel)
AutoModelForCausalLM.register(NSAConfig, NSAForCausalLM)


__all__ = [
    'NSAConfig', 'NSAModel', 'NSAForCausalLM',
]
fla/models/transformer_top/configuration_transformer.py
ADDED
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-

from typing import Optional

from transformers.configuration_utils import PretrainedConfig


class TOPTransformerConfig(PretrainedConfig):

    model_type = 'top_transformer'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        hidden_size: int = 2048,
        num_hidden_layers: int = 24,
        num_heads: int = 32,
        num_kv_heads: int = None,
        qkv_bias: bool = False,
        qk_norm: bool = False,
        window_size: Optional[int] = None,
        rope_theta: Optional[float] = 10000.,
        max_position_embeddings: int = 2048,
        hidden_ratio: Optional[int] = 4,
        intermediate_size: Optional[int] = None,
        hidden_act: str = "swish",
        initializer_range: float = 0.006,
        elementwise_affine: Optional[bool] = True,
        norm_eps: float = 1e-6,
        use_cache: bool = True,
        pad_token_id: int = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        fuse_norm: bool = True,
        fuse_swiglu: bool = True,
        fuse_cross_entropy: bool = True,
        vocab_size: int = 32000,
        use_top_loss: bool = False,
        top_window_size: Optional[int] = None,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.qkv_bias = qkv_bias
        self.qk_norm = qk_norm
        self.window_size = window_size
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act

        self.initializer_range = initializer_range
        self.elementwise_affine = elementwise_affine
        self.norm_eps = norm_eps
        self.use_cache = use_cache

        self.fuse_norm = fuse_norm
        self.fuse_swiglu = fuse_swiglu
        self.fuse_cross_entropy = fuse_cross_entropy
        self.vocab_size = vocab_size

        self.use_top_loss = use_top_loss
        self.top_window_size = top_window_size if top_window_size is not None else max_position_embeddings

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
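
One behavior worth noting in the config above: `top_window_size` falls back to `max_position_embeddings` when omitted. A short sketch (values illustrative, not part of this commit):

from fla.models.transformer_top.configuration_transformer import TOPTransformerConfig

config = TOPTransformerConfig(max_position_embeddings=4096, use_top_loss=True)
assert config.top_window_size == 4096  # defaulted from max_position_embeddings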
fla/modules/__pycache__/activations.cpython-312.pyc
ADDED
Binary file (23 kB)

fla/modules/__pycache__/convolution.cpython-312.pyc
ADDED
Binary file (21 kB)

fla/modules/__pycache__/fused_linear_cross_entropy.cpython-312.pyc
ADDED
Binary file (20.6 kB)

fla/modules/__pycache__/layernorm.cpython-312.pyc
ADDED
Binary file (43.4 kB)

fla/ops/common/__pycache__/chunk_scaled_dot_kkt.cpython-312.pyc
ADDED
Binary file (6.74 kB)

fla/ops/delta_rule/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (361 Bytes)

fla/ops/delta_rule/__pycache__/chunk.cpython-312.pyc
ADDED
Binary file (13.3 kB)

fla/ops/delta_rule/__pycache__/fused_chunk.cpython-312.pyc
ADDED
Binary file (392 Bytes)

fla/ops/gated_delta_rule/__pycache__/chunk.cpython-312.pyc
ADDED
Binary file (14.4 kB)

fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_A_bwd.cpython-312.pyc
ADDED
Binary file (30.6 kB)

fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_o_bwd.cpython-312.pyc
ADDED
Binary file (28 kB)
fla/ops/generalized_delta_rule/dplr/chunk.py
ADDED
@@ -0,0 +1,388 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

from typing import Optional

import torch
import triton

from fla.ops.common.utils import prepare_chunk_indices
from fla.ops.generalized_delta_rule.dplr.chunk_A_bwd import chunk_dplr_bwd_dqk_intra
from fla.ops.generalized_delta_rule.dplr.chunk_A_fwd import chunk_fwd_intra_dplr_fn
from fla.ops.generalized_delta_rule.dplr.chunk_h_bwd import chunk_dplr_bwd_dhu
from fla.ops.generalized_delta_rule.dplr.chunk_h_fwd import chunk_dplr_fwd_h
from fla.ops.generalized_delta_rule.dplr.chunk_o_bwd import chunk_dplr_bwd_dAu, chunk_dplr_bwd_dv, chunk_dplr_bwd_o
from fla.ops.generalized_delta_rule.dplr.chunk_o_fwd import chunk_dplr_fwd_o
from fla.ops.generalized_delta_rule.dplr.wy_fast_bwd import chunk_dplr_bwd_wy
from fla.ops.generalized_delta_rule.dplr.wy_fast_fwd import fwd_prepare_wy_repr
from fla.ops.rwkv6.chunk import chunk_rwkv6_fwd_cumsum
from fla.utils import autocast_custom_bwd, autocast_custom_fwd, input_guard


def chunk_dplr_fwd(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    gk: torch.Tensor,
    scale: float,
    initial_state: torch.Tensor,
    output_final_state: bool,
    offsets: Optional[torch.LongTensor] = None,
    indices: Optional[torch.LongTensor] = None,
    head_first: bool = True,
    chunk_size: int = 64
):
    T = q.shape[2] if head_first else q.shape[1]
    BT = min(chunk_size, max(triton.next_power_of_2(T), 16))
    gi, ge = chunk_rwkv6_fwd_cumsum(gk, BT, offsets=offsets, indices=indices, head_first=head_first)

    A_ab, A_qk, A_ak, A_qb, qg, kg, ag, bg = chunk_fwd_intra_dplr_fn(
        q=q,
        k=k,
        a=a,
        b=b,
        gi=gi,
        ge=ge,
        scale=scale,
        offsets=offsets,
        indices=indices,
        chunk_size=BT,
        head_first=head_first
    )
    del ge

    # A_ab, A_ak, gi, ge: torch.float32
    # A_qk, A_qb, qg, kg, ag, bg: q.dtype, e.g. bf16
    w, u, _ = fwd_prepare_wy_repr(
        ag=ag,
        A_ab=A_ab,
        A_ak=A_ak,
        v=v,
        offsets=offsets,
        indices=indices,
        head_first=head_first,
        chunk_size=BT
    )
    del A_ab, A_ak
    h, v_new, final_state = chunk_dplr_fwd_h(
        kg=kg,
        bg=bg,
        v=v,
        w=w,
        u=u,
        gk=gi,
        initial_state=initial_state,
        output_final_state=output_final_state,
        offsets=offsets,
        indices=indices,
        head_first=head_first,
        chunk_size=BT
    )
    del u, kg, bg, gi

    o = chunk_dplr_fwd_o(
        qg=qg,
        v=v,
        v_new=v_new,
        A_qk=A_qk,
        A_qb=A_qb,
        h=h,
        offsets=offsets,
        indices=indices,
        head_first=head_first,
        chunk_size=BT
    )
    del v_new, h, A_qk, A_qb

    return o, final_state


class ChunkDPLRDeltaRuleFunction(torch.autograd.Function):

    @staticmethod
    @input_guard
    @autocast_custom_fwd
    def forward(
        ctx,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        a: torch.Tensor,
        b: torch.Tensor,
        gk: torch.Tensor,
        scale: float,
        initial_state: torch.Tensor,
        output_final_state: bool,
        offsets: Optional[torch.LongTensor] = None,
        head_first: bool = True
    ):
        chunk_size = 16

        # 2-d indices denoting the offsets of chunks in each sequence
        # for example, if the passed `offsets` is [0, 100, 356] and `chunk_size` is 64,
        # then there are 2 and 4 chunks in the 1st and 2nd sequences respectively, and `indices` will be
        # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]]
        indices = prepare_chunk_indices(offsets, chunk_size) if offsets is not None else None

        o, final_state = chunk_dplr_fwd(
            q=q,
            k=k,
            v=v,
            a=a,
            b=b,
            gk=gk,
            scale=scale,
            initial_state=initial_state,
            output_final_state=output_final_state,
            offsets=offsets,
            indices=indices,
            head_first=head_first,
            chunk_size=chunk_size
        )
        ctx.save_for_backward(q, k, v, a, b, gk, initial_state)
        ctx.head_first = head_first
        ctx.offsets = offsets
        ctx.indices = indices
        ctx.scale = scale
        ctx.chunk_size = chunk_size
        return o.to(q.dtype), final_state

    @staticmethod
    @input_guard
    @autocast_custom_bwd
    def backward(
        ctx,
        do: torch.Tensor,
        dht: torch.Tensor
    ):
        q, k, v, a, b, gk, initial_state = ctx.saved_tensors
        BT = ctx.chunk_size
        head_first = ctx.head_first
        offsets = ctx.offsets
        indices = ctx.indices
        scale = ctx.scale

        # ******* recompute the forward intermediates here; caching them for backward would exhaust GPU memory *******
        gi, ge = chunk_rwkv6_fwd_cumsum(gk, BT, offsets=offsets, indices=indices, head_first=head_first)

        A_ab, A_qk, A_ak, A_qb, qg, kg, ag, bg = chunk_fwd_intra_dplr_fn(
            q=q,
            k=k,
            a=a,
            b=b,
            gi=gi,
            ge=ge,
            scale=scale,
            offsets=offsets,
            indices=indices,
            chunk_size=BT,
            head_first=head_first
        )
        w, u, A_ab_inv = fwd_prepare_wy_repr(
            ag=ag,
            A_ab=A_ab,
            A_ak=A_ak,
            v=v,
            offsets=offsets,
            indices=indices,
            head_first=head_first,
            chunk_size=BT
        )
        del A_ab
        h, v_new, _ = chunk_dplr_fwd_h(
            kg=kg,
            bg=bg,
            v=v,
            w=w,
            u=u,
            gk=gi,
            initial_state=initial_state,
            offsets=offsets,
            indices=indices,
            head_first=head_first,
            chunk_size=BT
        )
        del u
        # ******* end of recomputation *******
        # A_ak, A_ab_inv, gi, ge: torch.float32
        # A_qk, A_qb, qg, kg, ag, bg, v_new: q.dtype, e.g. bf16

        dv_new_intra, dA_qk, dA_qb = chunk_dplr_bwd_dAu(
            v=v,
            v_new=v_new,
            do=do,
            A_qb=A_qb,
            scale=scale,
            offsets=offsets,
            indices=indices,
            head_first=head_first,
            chunk_size=BT
        )

        dh, dh0, dv_new = chunk_dplr_bwd_dhu(
            qg=qg,
            bg=bg,
            w=w,
            gk=gi,
            h0=initial_state,
            dht=dht,
            do=do,
            dv=dv_new_intra,
            offsets=offsets,
            indices=indices,
            head_first=head_first,
            chunk_size=BT
        )

        dv = chunk_dplr_bwd_dv(
            A_qk=A_qk,
            kg=kg,
            do=do,
            dh=dh,
            offsets=offsets,
            indices=indices,
            head_first=head_first,
            chunk_size=BT
        )
        del A_qk

        dqg, dkg, dw, dbg, dgk_last = chunk_dplr_bwd_o(
            k=kg,
            b=bg,
            v=v,
            v_new=v_new,
            do=do,
            h=h,
            dh=dh,
            dv=dv_new,
            w=w,
            gk=gi,
            offsets=offsets,
            indices=indices,
            chunk_size=BT,
            scale=scale,
            head_first=head_first,
        )
        del v_new

        dA_ab, dA_ak, dv, dag = chunk_dplr_bwd_wy(
            A_ab_inv=A_ab_inv,
            A_ak=A_ak,
            v=v,
            ag=ag,
            dw=dw,
            du=dv_new,
            dv0=dv,
            offsets=offsets,
            indices=indices,
            head_first=head_first,
            chunk_size=BT
        )
        del A_ak

        dq, dk, da, db, dgk = chunk_dplr_bwd_dqk_intra(
            q=q,
            k=k,
            a=a,
            b=b,
            gi=gi,
            ge=ge,
            dAqk=dA_qk,
            dAqb=dA_qb,
            dAak=dA_ak,
            dAab=dA_ab,
            dgk_last=dgk_last,
            dqg=dqg,
            dkg=dkg,
            dag=dag,
            dbg=dbg,
            chunk_size=BT,
            scale=scale,
            head_first=head_first,
            offsets=offsets,
            indices=indices
        )

        return dq.to(q), dk.to(k), dv.to(v), da.to(a), db.to(b), dgk.to(gk), None, dh0, None, None, None


@torch.compiler.disable
def chunk_dplr_delta_rule(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    gk: torch.Tensor,
    scale: Optional[float] = None,
    initial_state: Optional[torch.Tensor] = None,
    output_final_state: bool = False,
    cu_seqlens: Optional[torch.LongTensor] = None,
    head_first: bool = False
):
    r"""
    Args:
        q (torch.Tensor):
            queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
        k (torch.Tensor):
            keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
        v (torch.Tensor):
            values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
        a (torch.Tensor):
            activations of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
        b (torch.Tensor):
            betas of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`.
        gk (torch.Tensor):
            gk of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]`. decay term in log space!
        scale (Optional[float]):
            Scale factor for attention scores.
            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
        initial_state (Optional[torch.Tensor]):
            Initial state of shape `[N, H, K, V]` for `N` input sequences.
            For equal-length input sequences, `N` equals the batch size `B`.
            Default: `None`.
        output_final_state (Optional[bool]):
            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
        cu_seqlens (torch.LongTensor):
            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
            consistent with the FlashAttention API.
        head_first (Optional[bool]):
            Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
            Default: `False`.

    Returns:
        o (torch.Tensor):
            Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`.
        final_state (torch.Tensor):
            Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
    """
    assert q.dtype == k.dtype == v.dtype
    # assert q.dtype != torch.float32, "ChunkDeltaRuleFunction does not support float32. Please use bfloat16."
    # gk = gk.float()

    if cu_seqlens is not None:
        if q.shape[0] != 1:
            raise ValueError(f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`. "
                             f"Please flatten variable-length inputs before processing.")
        if head_first:
            raise RuntimeError("Sequences with variable lengths are not supported for head-first mode")
        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
            raise ValueError(f"The number of initial states is expected to be equal to the number of input sequences, "
                             f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}.")
    scale = k.shape[-1] ** -0.5 if scale is None else scale
    o, final_state = ChunkDPLRDeltaRuleFunction.apply(
        q,
        k,
        v,
        a,
        b,
        gk,
        scale,
        initial_state,
        output_final_state,
        cu_seqlens,
        head_first
    )
    return o, final_state
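
A hedged usage sketch for the public entry point above, following the shapes documented in its docstring; the dtype, sizes, and CUDA device are illustrative assumptions (the Triton kernels require a GPU), not part of this commit.

import torch

from fla.ops.generalized_delta_rule.dplr.chunk import chunk_dplr_delta_rule

B, T, H, K, V = 2, 128, 4, 64, 64
dtype, device = torch.bfloat16, 'cuda'

q = torch.randn(B, T, H, K, dtype=dtype, device=device)
k = torch.randn(B, T, H, K, dtype=dtype, device=device)
v = torch.randn(B, T, H, V, dtype=dtype, device=device)
a = torch.randn(B, T, H, K, dtype=dtype, device=device)
b = torch.randn(B, T, H, K, dtype=dtype, device=device)
# decay term must be in log space, i.e. log of values in (0, 1)
gk = torch.randn(B, T, H, K, dtype=dtype, device=device).sigmoid().log()

o, final_state = chunk_dplr_delta_rule(
    q, k, v, a, b, gk,
    output_final_state=True,
    head_first=False,  # inputs are laid out as [B, T, H, ...]
)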
fla/ops/generalized_delta_rule/iplr/__pycache__/fused_recurrent.cpython-312.pyc
ADDED
Binary file (27.4 kB)

fla/ops/generalized_delta_rule/iplr/__pycache__/wy_fast.cpython-312.pyc
ADDED
Binary file (23.1 kB)

fla/ops/gsa/__pycache__/chunk.cpython-312.pyc
ADDED
Binary file (69.4 kB)

fla/ops/hgrn/__pycache__/chunk.cpython-312.pyc
ADDED
Binary file (16.2 kB)

fla/ops/hgrn/__pycache__/fused_recurrent.cpython-312.pyc
ADDED
Binary file (14.3 kB)

fla/ops/ttt/__pycache__/chunk.cpython-312.pyc
ADDED
Binary file (88.1 kB)

fla/ops/utils/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (1.12 kB)

logs/none_enyj3lod/attempt_0/3/stderr.log
ADDED
The diff for this file is too large to render.

profile_trace/iteration_17408/rank4_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_18944/rank2_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_25088/rank3_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_25088/rank7_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_33280/rank6_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_34816/rank5_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_38912/rank1_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_38912/rank2_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_7680/rank0_trace.json
ADDED
The diff for this file is too large to render.

profile_trace/iteration_7680/rank4_trace.json
ADDED
The diff for this file is too large to render.
torchtitan/components/dataloader.py
ADDED
@@ -0,0 +1,92 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) Meta Platforms, Inc. All Rights Reserved.

import pickle
from abc import ABC, abstractmethod
from collections.abc import Callable
from typing import Any

from torch.distributed.checkpoint.stateful import Stateful
from torch.utils.data import IterableDataset
from torchdata.stateful_dataloader import StatefulDataLoader
from torchtitan.tools.logging import logger


class BaseDataLoader(Stateful, ABC):
    """Base class for all dataloaders.

    This is used to enforce that all dataloaders have the methods defined in ``Stateful``,
    ``state_dict()`` and ``load_state_dict()``.
    """

    @abstractmethod
    def __iter__(self):
        ...


class ParallelAwareDataloader(StatefulDataLoader, BaseDataLoader):
    """Dataloader that is aware of distributed data parallelism.

    This dataloader is used to load data in a distributed data parallel fashion. It also
    utilizes ``torchdata.stateful_dataloader.StatefulDataLoader`` to implement the necessary
    methods such as ``__iter__``.

    Args:
        dataset (IterableDataset): The dataset to iterate over.
        dp_rank: Data parallelism rank for this dataloader.
        dp_world_size: The world size of the data parallelism.
        batch_size: The batch size to use for each iteration.
        collate_fn: Optional function to collate samples in a batch.
    """

    dp_rank: int
    dp_world_size: int
    batch_size: int

    def __init__(
        self,
        dataset: IterableDataset,
        dp_rank: int,
        dp_world_size: int,
        batch_size: int,
        collate_fn: Callable | None = None,
    ):
        self.dp_world_size = dp_world_size
        self.dp_rank = dp_rank
        self.batch_size = batch_size
        super().__init__(dataset, batch_size, collate_fn=collate_fn)
        self._rank_id = f"dp_rank_{dp_rank}"

    def state_dict(self) -> dict[str, Any]:
        # Store state only for dp rank to avoid replicating the same state across other dimensions.
        return {
            # We don't have to use pickle as DCP will serialize the state_dict. However,
            # we have to keep this for backward compatibility.
            self._rank_id: pickle.dumps(super().state_dict()),
            "world_size": self.dp_world_size,
        }

    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
        # State being empty is valid.
        if not state_dict:
            return

        if self._rank_id not in state_dict:
            logger.warning(
                f"DataLoader state is empty for dp rank {self.dp_rank}, "
                f"expected key {self._rank_id}"
            )
            return

        assert self.dp_world_size == state_dict["world_size"], (
            "dp_degree is inconsistent before and after checkpoint, "
            "dataloader resharding is not supported yet."
        )
        # We don't have to use pickle as DCP will serialize the state_dict. However, we have to
        # keep this for backward compatibility.
        super().load_state_dict(pickle.loads(state_dict[self._rank_id]))
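
A minimal sketch of constructing and checkpointing the dataloader above; the toy dataset and rank values are illustrative assumptions, not part of this commit.

from torch.utils.data import IterableDataset

from torchtitan.components.dataloader import ParallelAwareDataloader


class ToyDataset(IterableDataset):  # hypothetical stand-in for a real dataset
    def __iter__(self):
        yield from range(8)


loader = ParallelAwareDataloader(ToyDataset(), dp_rank=0, dp_world_size=2, batch_size=4)
# The state is keyed by dp rank ("dp_rank_0") plus the dp world size, so a
# restore under a different dp degree trips the assertion above.
state = loader.state_dict()
loader.load_state_dict(state)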
torchtitan/components/float8.py
ADDED
@@ -0,0 +1,150 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# [Note] Getting the 'torchao' package:
# This script requires the 'torchao' package to function correctly.
# Please ensure you have this package installed from the appropriate repository.
# You can obtain it from https://github.com/pytorch/ao by following the
# installation instructions.

# Note: Performance
# Float8 experimental is intended to be run under `torch.compile` for competitive performance

import torch
import torch.nn as nn

from torchtitan.config_manager import JobConfig
from torchtitan.distributed import ParallelDims
from torchtitan.protocols.model_converter import (
    ModelConverter,
    register_model_converter,
)
from torchtitan.tools.logging import logger


def _is_sm89_or_later():
    # Float8 is only supported on SM89 or later (H100+ GPUs)
    return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)


class Float8Converter(ModelConverter):
    def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
        self.enabled = False

        float8_config = job_config.float8
        if not _is_sm89_or_later():
            logger.warning(
                "Failed to swap to Float8Linear because float8 is only supported on SM89 or later",
            )
            return
        try:
            from torchao.float8 import Float8LinearConfig
        except ImportError as e:
            raise ImportError(
                "torchao is not installed. Please install it to use float8 linear layers."
            ) from e

        if float8_config.recipe_name is not None and not hasattr(
            Float8LinearConfig, "from_recipe_name"
        ):
            logger.warning(
                "Failed to swap to Float8Linear with recipe lookup because the torchao version "
                "is too old, please install torchao v0.9.0 or later and try again",
            )
            return

        self.enabled = True
        self.filter_fqns = float8_config.filter_fqns

        if float8_config.recipe_name is not None:
            assert (
                not float8_config.enable_fsdp_float8_all_gather
            ), "using `float8_config.enable_fsdp_float8_all_gather` together with `float8_config.recipe_name` is not supported"
            assert (
                not float8_config.force_recompute_fp8_weight_in_bwd
            ), "using `float8_config.force_recompute_fp8_weight_in_bwd` together with `float8_config.recipe_name` is not supported"
            self.config = Float8LinearConfig.from_recipe_name(float8_config.recipe_name)
            self.precompute_scale = False
            logger.info(
                f"Float8 training active with recipe {float8_config.recipe_name}"
            )

        else:
            # Mutates the model inplace replacing instances of torch.nn.Linear with Float8Linear
            enable_fsdp_float8_all_gather = (
                parallel_dims.dp_shard_enabled
                and float8_config.enable_fsdp_float8_all_gather
            )
            self.config = Float8LinearConfig(
                enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
                force_recompute_fp8_weight_in_bwd=float8_config.force_recompute_fp8_weight_in_bwd,
            )
            # for precompute_float8_dynamic_scale_for_fsdp
            self.precompute_scale = (
                enable_fsdp_float8_all_gather
                and float8_config.precompute_float8_dynamic_scale_for_fsdp
            )
            logger.info("Float8 tensorwise scaled training active")

    def convert(self, model: nn.Module):
        return self.convert_to_float8_training(model)

    def post_optimizer_hook(self, model: nn.Module | list[nn.Module]):
        return self.precompute_float8_dynamic_scale_for_fsdp(model)

    def convert_to_float8_training(self, model: nn.Module):
        """
        This function converts the linear layers of `model` to `Float8Linear`.
        Note that today, only dynamic tensor scaling (the default) is supported.
        This will mutate the model inplace.
        """
        if not self.enabled:
            return

        from torchao.float8 import convert_to_float8_training

        # Mutates the model inplace replacing instances of nn.Linear with Float8Linear
        convert_to_float8_training(
            model,
            config=self.config,
            module_filter_fn=self._module_filter_fn,
        )
        logger.info(
            "Swapped to Float8Linear layers with enable_fsdp_float8_all_gather="
            f"{self.config.enable_fsdp_float8_all_gather}"
        )

    def _module_filter_fn(self, mod: nn.Module, fqn: str) -> bool:
        if not isinstance(mod, nn.Linear):
            return False

        # All dims must be divisible by 16 due to float8 tensorcore hardware requirements.
        dims_multiples_of_16 = (
            mod.weight.shape[0] % 16 == 0 and mod.weight.shape[1] % 16 == 0
        )

        # If the fqn matches any filtered fqn, then we should not convert this module.
        is_filtered_fqn = any(filtered_fqn in fqn for filtered_fqn in self.filter_fqns)

        return dims_multiples_of_16 and not is_filtered_fqn

    def precompute_float8_dynamic_scale_for_fsdp(
        self, model: nn.Module | list[nn.Module]
    ):
        if not self.enabled:
            return

        if not self.precompute_scale:
            return

        from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp

        models = [model] if isinstance(model, nn.Module) else model
        for m in models:
            precompute_float8_dynamic_scale_for_fsdp(m)


register_model_converter(Float8Converter, "float8")
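
The divisible-by-16 rule in `_module_filter_fn` reflects float8 tensorcore shape constraints. Below is a standalone restatement of that predicate for illustration; `should_convert` is a hypothetical helper, not part of the commit.

import torch.nn as nn


def should_convert(mod: nn.Module, fqn: str, filter_fqns: list[str]) -> bool:
    # Convert only Linear layers whose weight dims are multiples of 16
    # and whose fully-qualified name is not explicitly filtered out.
    if not isinstance(mod, nn.Linear):
        return False
    dims_ok = mod.weight.shape[0] % 16 == 0 and mod.weight.shape[1] % 16 == 0
    return dims_ok and not any(f in fqn for f in filter_fqns)


assert should_convert(nn.Linear(4096, 1024), "layers.0.ffn.w1", ["output"])
assert not should_convert(nn.Linear(4096, 1000), "lm_head", [])  # 1000 % 16 != 0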
torchtitan/components/optimizer.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import functools
|
| 8 |
+
from typing import Any, Generic, Iterator, TypeVar
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn as nn
|
| 12 |
+
from torch.distributed.checkpoint.state_dict import (
|
| 13 |
+
get_optimizer_state_dict,
|
| 14 |
+
set_optimizer_state_dict,
|
| 15 |
+
StateDictOptions,
|
| 16 |
+
)
|
| 17 |
+
from torch.distributed.checkpoint.stateful import Stateful
|
| 18 |
+
from torch.optim import Optimizer
|
| 19 |
+
|
| 20 |
+
from torchtitan.components.ft import FTManager, has_torchft
|
| 21 |
+
from torchtitan.config_manager import JobConfig
|
| 22 |
+
|
| 23 |
+
__all__ = [
|
| 24 |
+
"OptimizersContainer",
|
| 25 |
+
"build_optimizers",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
if has_torchft:
|
| 30 |
+
import torchft as ft
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
T = TypeVar("T", bound=Optimizer)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class OptimizersContainer(Optimizer, Stateful, Generic[T]):
|
| 37 |
+
"""A container for multiple optimizers.
|
| 38 |
+
|
| 39 |
+
This class is used to wrap multiple optimizers into a single object that can be
|
| 40 |
+
used to reduce the complexity of the training loop. This mimics the behavior of
|
| 41 |
+
``torch.optim.Optimizer``. This class currently only supports ``Adam`` and ``AdamW``.
|
| 42 |
+
|
| 43 |
+
**Note**
|
| 44 |
+
Users who want to customize the optimizer behavior can inherit from this class and
|
| 45 |
+
extend the functionality as needed. The following methods must follow the same signature
|
| 46 |
+
as ``torch.optim.Optimizer`` class: ``step()``, ``zero_grad()``, ``state_dict()``,
|
| 47 |
+
``load_state_dict()``.
|
| 48 |
+
|
| 49 |
+
**Limitations**
|
| 50 |
+
This class assumes that all the optimizers are the same type and have the same
|
| 51 |
+
configurations. With this assumption, TorchTitan can support lr scheduler resharding
|
| 52 |
+
(e.g., loading a checkpoint with a different number of GPUs and/or different
|
| 53 |
+
parallelization strategy). Note that ``get_optimizer_state_dict`` already enables the
|
| 54 |
+
resharding for the optimizer state but not for the lr scheduler state, hence the limitation.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
model_parts (List[nn.Module]): List of model parts to be optimized.
|
| 58 |
+
optimizer_kwargs (Dict[str, Any]): Keyword arguments for the optimizers.
|
| 59 |
+
name (str): Name of the optimizers.
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
optimizers: list[T]
|
| 63 |
+
model_parts: list[nn.Module]
|
| 64 |
+
|
| 65 |
+
def __init__(
|
| 66 |
+
self,
|
| 67 |
+
model_parts: list[nn.Module],
|
| 68 |
+
optimizer_cls: type[T],
|
| 69 |
+
optimizer_kwargs: dict[str, Any],
|
| 70 |
+
) -> None:
|
| 71 |
+
all_params = []
|
| 72 |
+
self.optimizers = []
|
| 73 |
+
self.model_parts = model_parts
|
| 74 |
+
for model in self.model_parts:
|
| 75 |
+
params = [p for p in model.parameters() if p.requires_grad]
|
| 76 |
+
self.optimizers.append(optimizer_cls(params, **optimizer_kwargs))
|
| 77 |
+
all_params.extend(params)
|
| 78 |
+
self._validate_length(len(self.model_parts))
|
| 79 |
+
self._post_init(all_params, optimizer_kwargs)
|
| 80 |
+
|
| 81 |
+
def __iter__(self) -> Iterator[T]:
|
| 82 |
+
return iter(self.optimizers)
|
| 83 |
+
|
| 84 |
+
def __len__(self) -> int:
|
| 85 |
+
return len(self.optimizers)
|
| 86 |
+
|
| 87 |
+
def step(self, *args, **kwargs) -> None:
|
| 88 |
+
for optimizer in self.optimizers:
|
| 89 |
+
optimizer.step(*args, **kwargs)
|
| 90 |
+
|
| 91 |
+
def zero_grad(self, *args, **kwargs) -> None:
|
| 92 |
+
for optimizer in self.optimizers:
|
| 93 |
+
optimizer.zero_grad(*args, **kwargs)
|
| 94 |
+
|
| 95 |
+
def state_dict(self) -> dict[str, Any]:
|
| 96 |
+
func = functools.partial(
|
| 97 |
+
get_optimizer_state_dict,
|
| 98 |
+
options=StateDictOptions(flatten_optimizer_state_dict=True),
|
| 99 |
+
)
|
| 100 |
+
return {
|
| 101 |
+
k: v
|
| 102 |
+
for sd in map(func, self.model_parts, self.optimizers)
|
| 103 |
+
for k, v in sd.items()
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
def load_state_dict(self, state_dict: dict[str, Any]) -> None:
|
| 107 |
+
func = functools.partial(
|
| 108 |
+
set_optimizer_state_dict,
|
| 109 |
+
optim_state_dict=state_dict,
|
| 110 |
+
options=StateDictOptions(flatten_optimizer_state_dict=True),
|
| 111 |
+
)
|
| 112 |
+
list(map(func, self.model_parts, self.optimizers))
|
| 113 |
+
|
| 114 |
+
def _validate_length(self, expected_length: int) -> None:
|
| 115 |
+
assert expected_length == len(self.optimizers), (
|
| 116 |
+
"Must pass one optimizer per model part or per param if "
|
| 117 |
+
"using OptimizersInBackwardContainer."
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
def _post_init(
|
| 121 |
+
self, all_params: list[nn.Parameter], optimizer_kwargs: dict[str, Any]
|
| 122 |
+
) -> None:
|
| 123 |
+
# We need to call Optimizer.__init__() to initialize some necessary optimizer
|
| 124 |
+
# functionality such as hooks.
|
| 125 |
+
Optimizer.__init__(self, all_params, optimizer_kwargs)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
class OptimizersInBackwardContainer(OptimizersContainer):
|
| 129 |
+
"""OptimizersContainer for executing ``optim.step()`` in backward pass.
|
| 130 |
+
|
| 131 |
+
This class extend ``OptimizersContainer`` to support optimizer step in
|
| 132 |
+
backward pass. ``step()`` and ``zero_grad()`` are no-op in this class.
|
| 133 |
+
Instead, ``register_post_accumulate_grad_hook`` is used to register a hook to
|
| 134 |
+
execute these methods when the gradient is accumulated.
|
| 135 |
+
"""
|
| 136 |
+
|
| 137 |
+
def __init__(
|
| 138 |
+
self,
|
| 139 |
+
model_parts: list[nn.Module],
|
| 140 |
+
optimizer_cls: type[T],
|
| 141 |
+
optimizer_kwargs: dict[str, Any],
|
| 142 |
+
) -> None:
|
| 143 |
+
all_params = []
|
| 144 |
+
self.model_parts = model_parts
|
| 145 |
+
|
| 146 |
+
optim_dict = {}
|
| 147 |
+
for model in self.model_parts:
|
| 148 |
+
for p in model.parameters():
|
| 149 |
+
if p.requires_grad:
|
| 150 |
+
optim_dict[p] = optimizer_cls([p], **optimizer_kwargs)
|
| 151 |
+
all_params.append(p)
|
| 152 |
+
|
| 153 |
+
def optim_hook(param) -> None:
|
| 154 |
+
optim_dict[param].step()
|
| 155 |
+
optim_dict[param].zero_grad()
|
| 156 |
+
|
| 157 |
+
for model in self.model_parts:
|
| 158 |
+
for param in model.parameters():
|
| 159 |
+
if param.requires_grad:
|
| 160 |
+
param.register_post_accumulate_grad_hook(optim_hook)
|
| 161 |
+
|
| 162 |
+
self.optimizers = list(optim_dict.values())
|
| 163 |
+
|
| 164 |
+
self._validate_length(
|
| 165 |
+
sum(len(list(model.parameters())) for model in self.model_parts)
|
| 166 |
+
)
|
| 167 |
+
self._post_init(all_params, optimizer_kwargs)
|
| 168 |
+
|
| 169 |
+
def step(self) -> None:
|
| 170 |
+
pass
|
| 171 |
+
|
| 172 |
+
def zero_grad(self) -> None:
|
| 173 |
+
pass
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class FTOptimizersContainer(OptimizersContainer):
|
| 177 |
+
def __init__(
|
| 178 |
+
self,
|
| 179 |
+
model_parts: list[nn.Module],
|
| 180 |
+
optimizer_cls: type[T],
|
| 181 |
+
optimizer_kwargs: dict[str, Any],
|
| 182 |
+
ft_manager: "ft.Manager",
|
| 183 |
+
) -> None:
|
| 184 |
+
super().__init__(model_parts, optimizer_cls, optimizer_kwargs)
|
| 185 |
+
|
| 186 |
+
# Force to initialize the optimizer state so that `optim.step()`
|
| 187 |
+
        # won't be called by state_dict() and load_state_dict().
        _ = {
            k: v
            for sd in map(get_optimizer_state_dict, model_parts, self.optimizers)
            for k, v in sd.items()
        }
        self.cache_state_dict: dict[str, Any] = {}
        self._ft_optimizer = ft.Optimizer(ft_manager, self)
        self._call_from_ft: bool = False

    def init_cache_state_dict(self) -> None:
        self.cache_state_dict = super().state_dict()

    def state_dict(self) -> dict[str, Any]:
        return self.cache_state_dict

    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
        # We have to invalidate the `cache_state_dict` because the optimizer uses
        # assign instead of copy when doing `load_state_dict()`. Without
        # invalidating the `cache_state_dict`, there will be memory leakage.
        self.cache_state_dict = {}
        super().load_state_dict(state_dict)
        self.init_cache_state_dict()

    def step(self, *args, **kwargs) -> None:
        """Call the correct step() depending on the caller.

        TorchFT's OptimizerWrapper.step() is designed to be called only once
        per train step per ft.Manager, regardless of how many optimizers are used.
        Hence we need to dispatch the call appropriately.
        """
        if self._call_from_ft:
            super().step(*args, **kwargs)
        else:
            self._call_from_ft = True
            self._ft_optimizer.step(*args, **kwargs)
            self._call_from_ft = False

    def zero_grad(self, *args, **kwargs) -> None:
        """Call the correct zero_grad() depending on the caller.

        See the comment in ``step()``.
        """
        if self._call_from_ft:
            super().zero_grad(*args, **kwargs)
        else:
            self._call_from_ft = True
            self._ft_optimizer.zero_grad(*args, **kwargs)
            self._call_from_ft = False


def build_optimizers(
    model_parts: list[nn.Module],
    job_config: JobConfig,
    ft_manager: FTManager,
) -> OptimizersContainer:
    """Create an OptimizersContainer for the given model parts and job config.

    This function creates an ``OptimizersContainer`` for the given model parts.
    ``job_config`` should define the correct optimizer name and parameters.
    This function currently supports creating ``OptimizersContainer`` and
    ``OptimizersInBackwardContainer``.

    **Note**
    Users who want to customize the optimizer behavior can create their own
    ``OptimizersContainer`` subclass and ``build_optimizers``. Passing the
    customized ``build_optimizers`` to ``TrainSpec`` will create the customized
    ``OptimizersContainer``.

    Args:
        model_parts (List[nn.Module]): List of model parts to be optimized.
        job_config (JobConfig): Job config containing the optimizer name and parameters.
    """
    optim_in_bwd = job_config.optimizer.early_step_in_backward
    if optim_in_bwd and job_config.parallelism.pipeline_parallel_degree > 1:
        raise NotImplementedError(
            "Optimizers in backward is not supported with pipeline parallelism."
        )
    name = job_config.optimizer.name
    lr = job_config.optimizer.lr
    eps = job_config.optimizer.eps

    optim_implementation = job_config.optimizer.implementation
    assert optim_implementation in ["fused", "foreach", "for-loop"]

    fused = optim_implementation == "fused"
    foreach = optim_implementation == "foreach"

    optimizer_kwargs = {
        "lr": lr,
        "eps": eps,
        "betas": (0.9, 0.95),
        "weight_decay": 0.1,
        "fused": fused,
        "foreach": foreach,
    }

    optimizer_classes = {
        "Adam": torch.optim.Adam,
        "AdamW": torch.optim.AdamW,
    }
    if name not in optimizer_classes:
        raise NotImplementedError(f"Optimizer {name} not added.")
    optimizer_cls = optimizer_classes[name]

    if optim_in_bwd and ft_manager.enabled:
        raise ValueError("TorchFT is not supported with optimizers in backward.")
    elif optim_in_bwd:
        return OptimizersInBackwardContainer(
            model_parts, optimizer_cls, optimizer_kwargs
        )
    elif ft_manager.enabled:
        return FTOptimizersContainer(
            model_parts, optimizer_cls, optimizer_kwargs, ft_manager.manager
        )
    else:
        return OptimizersContainer(model_parts, optimizer_cls, optimizer_kwargs)
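The re-entrant dispatch above is the subtle part of `FTOptimizersContainer`: `step()` hands control to TorchFT's wrapper, which calls back into the same container with `_call_from_ft` set so the second entry runs the real optimizer step. A minimal self-contained sketch of that round trip, with a hypothetical `FTStub` standing in for TorchFT's `ft.Optimizer`:

```python
# Sketch of the re-entrant step() dispatch pattern used by FTOptimizersContainer.
# FTStub is a hypothetical stand-in for ft.Optimizer: it does its bookkeeping,
# then calls back into the wrapped container exactly once.
class FTStub:
    def __init__(self, container):
        self._container = container

    def step(self):
        print("ft wrapper: pre-step bookkeeping")
        self._container.step()  # re-enters the container with _call_from_ft=True


class Container:
    def __init__(self):
        self._ft_optimizer = FTStub(self)
        self._call_from_ft = False

    def step(self):
        if self._call_from_ft:
            print("container: running the real optimizer step")
        else:
            self._call_from_ft = True
            self._ft_optimizer.step()
            self._call_from_ft = False


Container().step()
# ft wrapper: pre-step bookkeeping
# container: running the real optimizer step
```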
torchtitan/datasets/__pycache__/hf_datasets.cpython-312.pyc
ADDED
Binary file (7.04 kB).

torchtitan/datasets/hf_datasets.py
ADDED
@@ -0,0 +1,173 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from typing import Any, Callable

import torch

from datasets import Dataset, load_dataset
from datasets.distributed import split_dataset_by_node
from torch.distributed.checkpoint.stateful import Stateful
from torch.utils.data import IterableDataset

from torchtitan.components.dataloader import ParallelAwareDataloader
from torchtitan.components.tokenizer import Tokenizer
from torchtitan.config_manager import JobConfig
from torchtitan.tools.logging import logger


def _load_c4_dataset(dataset_path: str):
    """Load the C4 dataset with the default configuration."""
    return load_dataset(dataset_path, name="en", split="train", streaming=True)


def _process_c4_text(sample: dict[str, Any]) -> str:
    """Process a C4 dataset sample into text."""
    return sample["text"]


@dataclass
class DatasetConfig:
    path: str
    loader: Callable
    text_processor: Callable


# Add your dataset here - more information at docs/datasets.md
DATASETS = {
    "c4": DatasetConfig(
        path="allenai/c4",
        loader=_load_c4_dataset,
        text_processor=_process_c4_text,
    ),
    "c4_test": DatasetConfig(
        path="tests/assets/c4_test",
        loader=lambda path: load_dataset(path, split="train"),
        text_processor=_process_c4_text,
    ),
}


def _validate_dataset(
    dataset_name: str, dataset_path: str | None = None
) -> tuple[str, Callable, Callable]:
    """Validate the dataset name and path."""
    if dataset_name not in DATASETS:
        raise ValueError(
            f"Dataset {dataset_name} is not supported. "
            f"Supported datasets are: {list(DATASETS.keys())}"
        )

    config = DATASETS[dataset_name]
    path = dataset_path or config.path
    logger.info(f"Preparing {dataset_name} dataset from {path}")
    return path, config.loader, config.text_processor


class HuggingFaceDataset(IterableDataset, Stateful):
    def __init__(
        self,
        dataset_name: str,
        dataset_path: str | None,
        tokenizer: Tokenizer,
        seq_len: int = 2048,
        dp_rank: int = 0,
        dp_world_size: int = 1,
        infinite: bool = False,
    ) -> None:
        # Force lowercase for consistent comparison
        dataset_name = dataset_name.lower()

        path, dataset_loader, text_processor = _validate_dataset(
            dataset_name, dataset_path
        )
        ds = dataset_loader(path)

        self.dataset_name = dataset_name
        self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
        self._tokenizer = tokenizer
        self.seq_len = seq_len
        self.infinite = infinite
        self._text_processor = text_processor

        # Variables for checkpointing
        self._sample_idx = 0
        self._all_tokens: list[int] = []

    def _get_data_iter(self):
        if isinstance(self._data, Dataset) and self._sample_idx == len(self._data):
            return iter([])

        it = iter(self._data)
        for _ in range(self._sample_idx):
            next(it)
        return it

    def __iter__(self):
        max_buffer_token_len = 1 + self.seq_len

        while True:
            for sample in self._get_data_iter():
                # Use the dataset-specific text processor
                sample_text = self._text_processor(sample)
                sample_tokens = self._tokenizer.encode(sample_text, bos=True, eos=True)
                self._all_tokens.extend(sample_tokens)
                self._sample_idx += 1

                while len(self._all_tokens) >= max_buffer_token_len:
                    x = torch.LongTensor(self._all_tokens[:max_buffer_token_len])
                    # update tokens to the remaining tokens
                    self._all_tokens = self._all_tokens[max_buffer_token_len:]
                    input = x[:-1]
                    label = x[1:]
                    yield {"input": input}, label

            if not self.infinite:
                logger.warning(f"Dataset {self.dataset_name} has run out of data")
                break
            else:
                # Reset offset for the next iteration
                self._sample_idx = 0
                logger.warning(f"Dataset {self.dataset_name} is being re-looped")

    def load_state_dict(self, state_dict):
        self._sample_idx = state_dict["sample_idx"]
        self._all_tokens = state_dict["token_buffer"]

    def state_dict(self):
        return {"token_buffer": self._all_tokens, "sample_idx": self._sample_idx}


def build_hf_dataloader(
    dp_world_size: int,
    dp_rank: int,
    tokenizer: Tokenizer,
    job_config: JobConfig,
    infinite: bool = True,
) -> ParallelAwareDataloader:
    """Build a data loader for HuggingFace datasets."""
    dataset_name = job_config.training.dataset
    dataset_path = job_config.training.dataset_path
    batch_size = job_config.training.batch_size
    seq_len = job_config.training.seq_len

    hf_ds = HuggingFaceDataset(
        dataset_name=dataset_name,
        dataset_path=dataset_path,
        tokenizer=tokenizer,
        seq_len=seq_len,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        infinite=infinite,
    )

    return ParallelAwareDataloader(
        dataset=hf_ds,
        dp_rank=dp_rank,
        dp_world_size=dp_world_size,
        batch_size=batch_size,
    )
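The `__iter__` loop above packs tokenized samples into fixed windows of `seq_len + 1` tokens, then shifts by one to form input/label pairs; leftover tokens stay buffered (and are checkpointed via `state_dict`). A minimal sketch of that packing logic, using a made-up token stream:

```python
import torch

seq_len = 4  # toy value; torchtitan reads this from job_config.training.seq_len
max_buffer_token_len = 1 + seq_len
buffer: list[int] = []

# A made-up token stream standing in for per-sample tokenizer.encode() output.
for sample_tokens in [[1, 2, 3], [4, 5, 6, 7], [8, 9]]:
    buffer.extend(sample_tokens)
    while len(buffer) >= max_buffer_token_len:
        x = torch.LongTensor(buffer[:max_buffer_token_len])
        buffer = buffer[max_buffer_token_len:]  # keep the remainder for the next window
        print("input:", x[:-1].tolist(), "label:", x[1:].tolist())
# input: [1, 2, 3, 4] label: [2, 3, 4, 5]  (tokens 6..9 remain buffered)
```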
torchtitan/datasets/tokenizer/__pycache__/tiktoken.cpython-312.pyc
ADDED
Binary file (7.73 kB).

torchtitan/datasets/tokenizer/tiktoken.py
ADDED
@@ -0,0 +1,190 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

import os
from collections.abc import Collection, Iterator, Sequence, Set as AbstractSet
from pathlib import Path
from typing import cast, Literal

import tiktoken
from tiktoken.load import load_tiktoken_bpe

from torchtitan.components.tokenizer import Tokenizer
from torchtitan.config_manager import JobConfig
from torchtitan.tools.logging import logger


class TikTokenizer(Tokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.

    Args:
        model_path (str): The path to the Tiktoken model file.
    """

    special_tokens: dict[str, int]

    num_reserved_special_tokens = 256

    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501, B950

    def __init__(self, model_path: str):
        super().__init__()
        assert os.path.exists(
            model_path
        ), f"The tokenizer path does not exist: {model_path}"
        assert os.path.isfile(model_path), model_path

        mergeable_ranks = load_tiktoken_bpe(model_path)
        num_base_tokens = len(mergeable_ranks)
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, self.num_reserved_special_tokens - 5)
        ]
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )

        self._n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
        self.eos_id: int = self.special_tokens["<|end_of_text|>"]
        self.pad_id: int = -1
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }
        logger.info(
            f"TikTokenizer built: #words {self.n_words}, BOS ID {self.bos_id}, EOS ID {self.eos_id}"
        )

    def encode(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
        allowed_special: Literal["all"] | AbstractSet[str] | None = None,
        disallowed_special: Literal["all"] | Collection[str] | None = None,
    ) -> list[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            s (str): The input string to be encoded.
            bos (bool): Whether to prepend the beginning-of-sequence token.
            eos (bool): Whether to append the end-of-sequence token.
            allowed_special ("all"|set[str]): special tokens allowed in the string
            disallowed_special ("all"|Collection[str]): special tokens that raise an error when found in the string

        Returns:
            list[int]: A list of token IDs.

        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
          to special tokens to be encoded as natural text (instead of raising
          an error).
        - Setting `allowed_special` to "all" will cause all text corresponding
          to special tokens to be encoded as special tokens.
        """
        assert type(s) is str
        allowed_special = allowed_special or set()
        disallowed_special = disallowed_special or ()

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        substrs = (
            substr
            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )
        t: list[int] = []
        for substr in substrs:
            t.extend(
                self.model.encode(
                    substr,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: Sequence[int]) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            t (Sequence[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
        return self.model.decode(cast(list[int], t))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than
        `max_consecutive_slice_len` consecutive whitespace or consecutive
        non-whitespace characters.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]


def build_tiktoken_tokenizer(job_config: JobConfig) -> TikTokenizer:
    return TikTokenizer(job_config.model.tokenizer_path)
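Assuming a local tiktoken BPE file compatible with the Llama 3 special-token layout above (the `tokenizer.model` path below is hypothetical), a round trip through `encode`/`decode` looks like this:

```python
# Hedged usage sketch; "tokenizer.model" is a hypothetical local path to a
# tiktoken BPE file matching the special-token layout defined in TikTokenizer.
tokenizer = TikTokenizer("tokenizer.model")

ids = tokenizer.encode("hello world", bos=True, eos=True)
assert ids[0] == tokenizer.bos_id and ids[-1] == tokenizer.eos_id
print(tokenizer.decode(ids))  # special tokens are rendered back as text
```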
torchtitan/distributed/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (255 Bytes).

torchtitan/distributed/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (14.9 kB).

torchtitan/experiments/deepseek_v3/inference.sh
ADDED
@@ -0,0 +1,15 @@
#!/usr/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

NGPU=${NGPU:-"4"}

# Get the prompt from command line argument or use a default
prompt="${1:-What is 2+2?}"

# Run the model with the prompt
torchrun --standalone --nproc-per-node ${NGPU} generate.py "$prompt"
torchtitan/experiments/deepseek_v3/model_config.py
ADDED
@@ -0,0 +1,204 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field


@dataclass
class ModelArgs:
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration similar to that of DeepSeek-V3.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV3Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of next-n predict layers in the DeepSeekV3 model.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts; None means a dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts; None means a dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for routed experts.
        topk_method (`str`, *optional*, defaults to `greedy`):
            Top-k method used in the routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token (for each token, the selected experts are guaranteed to be within
            `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts; None means a dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in the shallow layers (embed -> dense -> dense -> ... -> dense -> moe -> moe -> ... -> lm_head;
            the first `first_k_dense_replace` layers are the dense ones).
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key/value heads used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group's key and value head should be constructed
            by mean-pooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If not specified, defaults to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/value attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    """

    vocab_size: int = 129280
    hidden_size: int = 7168
    intermediate_size: int = 18432
    moe_intermediate_size: int = 2048
    num_hidden_layers: int = 61
    num_nextn_predict_layers: int = 1
    num_attention_heads: int = 128
    num_key_value_heads: int = 128
    n_shared_experts: int = 1
    n_routed_experts: int = 256
    ep_size: int = 1
    routed_scaling_factor: float = 2.5
    kv_lora_rank: int = 512
    q_lora_rank: int = 1536
    qk_rope_head_dim: int = 64
    v_head_dim: int = 128
    qk_nope_head_dim: int = 128
    topk_method: str = "noaux_tc"
    n_group: int = 8
    topk_group: int = 4
    num_experts_per_tok: int = 8
    moe_layer_freq: int = 1
    first_k_dense_replace: int = 3
    norm_topk_prob: bool = True
    scoring_func: str = "sigmoid"
    aux_loss_alpha: float = 0.001
    seq_aux: bool = True
    hidden_act: str = "silu"
    max_position_embeddings: int = 163840
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    rope_theta: float = 10000.0
    rope_scaling: dict = field(
        default_factory=lambda: {
            "beta_fast": 32,
            "beta_slow": 1,
            "factor": 40,
            "mscale": 1.0,
            "mscale_all_dim": 1.0,
            "original_max_position_embeddings": 4096,
            "type": "yarn",
        }
    )
    attention_bias: bool = False
    attention_dropout: float = 0.0
    pad_token_id = None
    # Added for symmetric memory
    max_seq_len: int = 4096
    dtype: str = "bfloat16"
    # Added for pipeline parallel
    num_stages: int = 1
    stage_idx: int = 0


# This is the configuration for deepseek-ai/DeepSeek-V2-Lite.
deepseek_v2_lite_config = ModelArgs(
    vocab_size=102400,
    hidden_size=2048,
    intermediate_size=10944,
    moe_intermediate_size=1408,
    num_hidden_layers=27,
    num_attention_heads=16,
    num_key_value_heads=16,
    n_shared_experts=2,
    n_routed_experts=64,
    routed_scaling_factor=1.0,
    kv_lora_rank=512,
    q_lora_rank=None,
    qk_rope_head_dim=64,
    v_head_dim=128,
    qk_nope_head_dim=128,
    topk_method="greedy",
    n_group=1,
    topk_group=1,
    num_experts_per_tok=6,
    first_k_dense_replace=1,
    norm_topk_prob=False,
    scoring_func="softmax",
    max_position_embeddings=4096,
    rope_scaling={
        "beta_fast": 32,
        "beta_slow": 1,
        "factor": 40,
        "mscale": 0.707,
        "mscale_all_dim": 0.707,
        "original_max_position_embeddings": 4096,
        "type": "yarn",
    },
)


# Model configuration registry
# Key is the model distribution ID on HuggingFace Hub
deepseek_config_registry = {
    "deepseek-ai/DeepSeek-V2-Lite": deepseek_v2_lite_config,
    "deepseek-ai/DeepSeek-V2-Lite-Chat": deepseek_v2_lite_config,
    "deepseek-ai/deepseek-v3": ModelArgs(),
}
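Since `deepseek_config_registry` keys on the HuggingFace distribution ID, selecting a configuration is a plain dictionary lookup. A minimal sketch:

```python
# Resolve a config from the registry defined above; the key is the
# HuggingFace Hub distribution ID.
args = deepseek_config_registry["deepseek-ai/DeepSeek-V2-Lite"]
print(args.hidden_size, args.n_routed_experts)  # 2048 64
```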
torchtitan/experiments/flux/README.md
ADDED
@@ -0,0 +1,23 @@
# FLUX model in torchtitan

## Overview

## Usage
First, download the autoencoder model from HuggingFace with your own access token:
```bash
python torchtitan/experiments/flux/scripts/download_autoencoder.py --repo_id black-forest-labs/FLUX.1-dev --ae_path ae.safetensors --hf_token <your_access_token>
```
This step downloads the autoencoder model from HuggingFace and saves it to `torchtitan/experiments/flux/assets/autoencoder/ae.safetensors`.

Run the following command to train the model on a single GPU:
```bash
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node=1 torchtitan/experiments/flux/train.py --job.config_file torchtitan/experiments/flux/train_configs/debug_model.toml
```

## TODO
- [ ] Support for multiple GPUs is coming soon (FSDP, etc.)
- [ ] Implement test cases in CI for the FLUX model. Add more unit tests (e.g., a unit test for the preprocessor)
- [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc.)
- [ ] Support for distributed checkpointing and loading
- [ ] Implement an init_weights() function to initialize the model weights
- [ ] Implement the num_flops_per_token calculation in the get_nparams_and_flops() function
torchtitan/experiments/flux/__pycache__/parallelize_flux.cpython-312.pyc
ADDED
Binary file (648 Bytes).

torchtitan/experiments/flux/flux_argparser.py
ADDED
@@ -0,0 +1,42 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse

import torch


def extend_parser(parser: argparse.ArgumentParser) -> None:
    parser.add_argument(
        "--training.guidance",
        type=float,
        default=3.5,
        help="guidance value used for guidance distillation",
    )
    parser.add_argument(
        "--encoder.t5_encoder",
        type=str,
        default="google/t5-v1_1-small",
        help="T5 encoder to use, HuggingFace model name.",
    )
    parser.add_argument(
        "--encoder.clip_encoder",
        type=str,
        default="openai/clip-vit-large-patch14",
        help="Clip encoder to use, HuggingFace model name.",
    )
    parser.add_argument(
        "--encoder.encoder_dtype",
        type=torch.dtype,
        default=torch.bfloat16,
        help="Which dtype to load for autoencoder.",
    )
    parser.add_argument(
        "--encoder.max_t5_encoding_len",
        type=int,
        default=512,
        help="Maximum length of the T5 encoding.",
    )
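On its own, `extend_parser` just registers the FLUX-specific flags on an existing parser; the dotted flag names become attribute names that contain dots. An illustrative standalone use (not how `JobConfig` wires it in):

```python
import argparse

parser = argparse.ArgumentParser()
extend_parser(parser)  # registers the FLUX flags defined above
args = parser.parse_args(["--training.guidance", "4.0"])
# The dest contains a dot, so plain attribute access won't work; use getattr:
print(getattr(args, "training.guidance"))  # 4.0
```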
torchtitan/experiments/flux/loss.py
ADDED
@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Callable, TypeAlias

import torch

from torchtitan.config_manager import JobConfig
from torchtitan.tools.logging import logger

LossFunction: TypeAlias = Callable[..., torch.Tensor]


def mse_loss(pred: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Common MSE loss function for training Transformer models."""
    return torch.nn.functional.mse_loss(pred.float(), labels.float().detach())


def build_mse_loss(job_config: JobConfig):
    loss_fn = mse_loss
    if job_config.training.compile:
        logger.info("Compiling the loss function with torch.compile")
        loss_fn = torch.compile(loss_fn)
    return loss_fn
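A quick sketch of calling `mse_loss` directly, with placeholder tensors:

```python
import torch

pred = torch.randn(2, 8)
labels = torch.randn(2, 8)
# labels are detached inside mse_loss, so no gradient flows to them
print(mse_loss(pred, labels).item())
```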