Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes. See raw diff.
- fla/layers/__init__.py +44 -0
- fla/models/abc/__pycache__/configuration_abc.cpython-312.pyc +0 -0
- fla/models/abc/configuration_abc.py +91 -0
- fla/models/delta_net/__pycache__/__init__.cpython-312.pyc +0 -0
- fla/models/gated_deltaproduct/__pycache__/configuration_gated_deltaproduct.cpython-312.pyc +0 -0
- fla/models/gla/__pycache__/modeling_gla.cpython-312.pyc +0 -0
- fla/models/gsa/__pycache__/__init__.cpython-312.pyc +0 -0
- fla/models/gsa/__pycache__/configuration_gsa.cpython-312.pyc +0 -0
- fla/models/gsa/__pycache__/modeling_gsa.cpython-312.pyc +0 -0
- fla/models/hgrn/__pycache__/modeling_hgrn.cpython-312.pyc +0 -0
- fla/models/hgrn2/__pycache__/modeling_hgrn2.cpython-312.pyc +0 -0
- fla/models/lightnet/__pycache__/configuration_lightnet.cpython-312.pyc +0 -0
- fla/models/linear_attn/__init__.py +12 -0
- fla/models/mamba/__pycache__/__init__.cpython-312.pyc +0 -0
- fla/models/mamba/__pycache__/modeling_mamba.cpython-312.pyc +0 -0
- fla/models/mamba/modeling_mamba.py +843 -0
- fla/models/nsa/__init__.py +15 -0
- fla/models/nsa/__pycache__/__init__.cpython-312.pyc +0 -0
- fla/models/retnet/__pycache__/__init__.cpython-312.pyc +0 -0
- fla/models/rwkv6/__init__.py +13 -0
- fla/models/rwkv6/configuration_rwkv6.py +82 -0
- fla/models/rwkv7/__pycache__/configuration_rwkv7.cpython-312.pyc +0 -0
- fla/models/rwkv7/modeling_rwkv7.py +505 -0
- fla/models/samba/__init__.py +13 -0
- fla/models/samba/configuration_samba.py +92 -0
- fla/models/samba/modeling_samba.py +413 -0
- fla/models/transformer_dsmtp/__pycache__/__init__.cpython-312.pyc +0 -0
- fla/models/transformer_dsmtp/__pycache__/configuration_transformer.cpython-312.pyc +0 -0
- fla/models/transformer_mtp/__pycache__/configuration_transformer.cpython-312.pyc +0 -0
- fla/models/transformer_top/__pycache__/configuration_transformer.cpython-312.pyc +0 -0
- fla/ops/common/__pycache__/chunk_delta_h.cpython-312.pyc +0 -0
- flame/__pycache__/__init__.cpython-312.pyc +0 -0
- flame/models/__init__.py +0 -0
- flame/models/parallelize_fla.py +550 -0
- flame/tools/__init__.py +0 -0
- flame/tools/utils.py +41 -0
- flame/utils/checkpoint.py +50 -0
- flame/utils/convert_hf_to_dcp.py +34 -0
- flame/utils/hf_utils.py +77 -0
- torchtitan/components/__pycache__/dataloader.cpython-312.pyc +0 -0
- torchtitan/components/__pycache__/ft.cpython-312.pyc +0 -0
- torchtitan/components/__pycache__/loss.cpython-312.pyc +0 -0
- torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc +0 -0
- torchtitan/components/__pycache__/tokenizer.cpython-312.pyc +0 -0
- torchtitan/datasets/__pycache__/hf_datasets.cpython-312.pyc +0 -0
- torchtitan/datasets/tokenizer/__pycache__/tiktoken.cpython-312.pyc +0 -0
- torchtitan/datasets/tokenizer/tiktoken.py +190 -0
- torchtitan/distributed/__pycache__/__init__.cpython-312.pyc +0 -0
- torchtitan/distributed/__pycache__/parallel_dims.cpython-312.pyc +0 -0
- torchtitan/distributed/__pycache__/pipeline.cpython-312.pyc +0 -0
fla/layers/__init__.py
ADDED
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from .abc import ABCAttention
+from .attn import Attention
+from .based import BasedLinearAttention
+from .bitattn import BitAttention
+from .delta_net import DeltaNet
+from .forgetting_attn import ForgettingAttention
+from .gated_deltanet import GatedDeltaNet
+from .gated_deltaproduct import GatedDeltaProduct
+from .gla import GatedLinearAttention
+from .gsa import GatedSlotAttention
+from .hgrn import HGRNAttention
+from .hgrn2 import HGRN2Attention
+from .lightnet import LightNetAttention
+from .linear_attn import LinearAttention
+from .multiscale_retention import MultiScaleRetention
+from .nsa import NativeSparseAttention
+from .rebased import ReBasedLinearAttention
+from .rwkv6 import RWKV6Attention
+from .rwkv7 import RWKV7Attention
+
+__all__ = [
+    'ABCAttention',
+    'Attention',
+    'BasedLinearAttention',
+    'BitAttention',
+    'DeltaNet',
+    'ForgettingAttention',
+    'GatedDeltaNet',
+    'GatedDeltaProduct',
+    'GatedLinearAttention',
+    'GatedSlotAttention',
+    'HGRNAttention',
+    'HGRN2Attention',
+    'LightNetAttention',
+    'LinearAttention',
+    'MultiScaleRetention',
+    'NativeSparseAttention',
+    'ReBasedLinearAttention',
+    'RWKV6Attention',
+    'RWKV7Attention',
+]
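
For orientation, a minimal usage sketch of one of the exported layers follows; it is not part of the uploaded diff. It assumes the fla package (and the Triton/CUDA kernels most layers rely on) is installed, and the constructor arguments and return convention shown are assumptions based on typical fla layers rather than anything stated in this diff.

```python
# Hypothetical sketch: instantiating one layer exported by fla/layers/__init__.py.
# hidden_size/num_heads and the (output, attentions, past_key_values) return
# convention are assumptions, not taken from this diff.
import torch

from fla.layers import GatedLinearAttention

layer = GatedLinearAttention(hidden_size=512, num_heads=4).cuda()
x = torch.randn(2, 128, 512, device='cuda')   # (batch, seq_len, hidden_size)
o, *_ = layer(x)                              # output keeps the input shape
print(o.shape)                                # torch.Size([2, 128, 512])
```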
fla/models/abc/__pycache__/configuration_abc.cpython-312.pyc
ADDED
Binary file (3.6 kB)
fla/models/abc/configuration_abc.py
ADDED
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+from typing import Dict, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class ABCConfig(PretrainedConfig):
+
+    model_type = 'abc'
+    keys_to_ignore_at_inference = ['past_key_values']
+
+    def __init__(
+        self,
+        hidden_size: int = 2048,
+        gate_low_rank_dim: int = 16,
+        clamp_min: float = -32,
+        clamp_max: float = 32,
+        hidden_ratio: Optional[int] = 4,
+        intermediate_size: Optional[int] = None,
+        num_hidden_layers: int = 24,
+        num_heads: int = 4,
+        num_slots: Optional[int] = 64,
+        use_short_conv: bool = False,
+        conv_size: int = 4,
+        exapnd_k: float = 0.5,
+        exapnd_v: float = 1,
+        hidden_act: str = "swish",
+        max_position_embeddings: int = 2048,
+        elementwise_affine: Optional[bool] = True,
+        norm_eps: float = 1e-6,
+        use_rope: bool = True,
+        attn: Optional[Dict] = None,
+        use_cache: bool = True,
+        pad_token_id: int = None,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        tie_word_embeddings: bool = False,
+        initializer_range: float = 0.006,
+        fuse_norm: bool = True,
+        fuse_swiglu: bool = True,
+        fuse_cross_entropy: bool = True,
+        vocab_size: int = 32000,
+        **kwargs
+    ):
+        self.hidden_size = hidden_size
+        self.gate_low_rank_dim = gate_low_rank_dim
+        self.clamp_min = clamp_min
+        self.clamp_max = clamp_max
+        self.hidden_ratio = hidden_ratio
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_heads = num_heads
+        self.num_slots = num_slots
+        self.use_short_conv = use_short_conv
+        self.conv_size = conv_size
+        self.expand_k = exapnd_k
+        self.expand_v = exapnd_v
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.elementwise_affine = elementwise_affine
+        self.norm_eps = norm_eps
+        self.use_rope = use_rope
+        self.attn = attn
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+
+        self.fuse_norm = fuse_norm
+        self.fuse_swiglu = fuse_swiglu
+        self.fuse_cross_entropy = fuse_cross_entropy
+        self.vocab_size = vocab_size
+
+        if attn is not None:
+            if not isinstance(attn, Dict):
+                raise ValueError("attn must be a dictionary")
+            if 'layers' not in attn:
+                raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
+            if 'num_heads' not in attn:
+                raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
+            attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads'])
+            attn['qkv_bias'] = attn.get('qkv_bias', False)
+            attn['window_size'] = attn.get('window_size', None)
+            attn['rope_theta'] = attn.get('rope_theta', 10000.)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
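
To illustrate the hybrid-attention validation at the end of `__init__`, here is a hedged construction example (not part of the diff); the layer indices and sizes are illustrative assumptions.

```python
# Hypothetical sketch: the `attn` dict must carry 'layers' and 'num_heads';
# the remaining keys are filled in with defaults by the validation block above.
from fla.models.abc.configuration_abc import ABCConfig

config = ABCConfig(
    hidden_size=1024,
    num_hidden_layers=12,
    num_heads=8,
    attn={'layers': [3, 7, 11], 'num_heads': 8},  # layers 3, 7, 11 use standard attention
)
print(config.attn)
# {'layers': [3, 7, 11], 'num_heads': 8, 'num_kv_heads': 8,
#  'qkv_bias': False, 'window_size': None, 'rope_theta': 10000.0}
```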
fla/models/delta_net/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (697 Bytes)

fla/models/gated_deltaproduct/__pycache__/configuration_gated_deltaproduct.cpython-312.pyc
ADDED
Binary file (3.37 kB)

fla/models/gla/__pycache__/modeling_gla.cpython-312.pyc
ADDED
Binary file (18.6 kB)

fla/models/gsa/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (653 Bytes)

fla/models/gsa/__pycache__/configuration_gsa.cpython-312.pyc
ADDED
Binary file (3.84 kB)

fla/models/gsa/__pycache__/modeling_gsa.cpython-312.pyc
ADDED
Binary file (18.7 kB)

fla/models/hgrn/__pycache__/modeling_hgrn.cpython-312.pyc
ADDED
Binary file (18.8 kB)

fla/models/hgrn2/__pycache__/modeling_hgrn2.cpython-312.pyc
ADDED
Binary file (18.9 kB)

fla/models/lightnet/__pycache__/configuration_lightnet.cpython-312.pyc
ADDED
Binary file (3.36 kB)
fla/models/linear_attn/__init__.py
ADDED
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+
+from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+
+from fla.models.linear_attn.configuration_linear_attn import LinearAttentionConfig
+from fla.models.linear_attn.modeling_linear_attn import LinearAttentionForCausalLM, LinearAttentionModel
+
+AutoConfig.register(LinearAttentionConfig.model_type, LinearAttentionConfig)
+AutoModel.register(LinearAttentionConfig, LinearAttentionModel)
+AutoModelForCausalLM.register(LinearAttentionConfig, LinearAttentionForCausalLM)
+
+__all__ = ['LinearAttentionConfig', 'LinearAttentionForCausalLM', 'LinearAttentionModel']
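
Because the module registers its classes with the transformers Auto* factories at import time, they can then be resolved through the standard `from_config` path. A hedged sketch follows (not part of the diff; the config fields passed are illustrative assumptions).

```python
# Hypothetical sketch: importing the package runs the register() calls above,
# after which the Auto* factories dispatch on LinearAttentionConfig.
from transformers import AutoModelForCausalLM

from fla.models.linear_attn import LinearAttentionConfig

config = LinearAttentionConfig(hidden_size=512, num_hidden_layers=2)  # illustrative sizes
model = AutoModelForCausalLM.from_config(config)
print(type(model).__name__)  # LinearAttentionForCausalLM
```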
fla/models/mamba/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (713 Bytes)

fla/models/mamba/__pycache__/modeling_mamba.cpython-312.pyc
ADDED
Binary file (41.5 kB)
fla/models/mamba/modeling_mamba.py
ADDED
|
@@ -0,0 +1,843 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2024 state-spaces/mamba org and HuggingFace Inc. team.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""PyTorch MAMBA model."""
|
| 16 |
+
|
| 17 |
+
import math
|
| 18 |
+
import warnings
|
| 19 |
+
from dataclasses import dataclass
|
| 20 |
+
from typing import Any, Dict, Optional, Tuple, Union
|
| 21 |
+
|
| 22 |
+
import torch
|
| 23 |
+
import torch.utils.checkpoint
|
| 24 |
+
from torch import nn
|
| 25 |
+
from transformers.activations import ACT2FN
|
| 26 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 27 |
+
from transformers.generation import GenerationMixin
|
| 28 |
+
from transformers.modeling_utils import PreTrainedModel
|
| 29 |
+
from transformers.utils import ModelOutput, logging
|
| 30 |
+
from transformers.utils.deprecation import deprecate_kwarg
|
| 31 |
+
|
| 32 |
+
from fla.models.mamba.configuration_mamba import MambaConfig
|
| 33 |
+
from fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss, RMSNorm
|
| 34 |
+
|
| 35 |
+
logger = logging.get_logger(__name__)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
with warnings.catch_warnings():
|
| 39 |
+
warnings.simplefilter('ignore')
|
| 40 |
+
try:
|
| 41 |
+
from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
|
| 42 |
+
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
|
| 43 |
+
except ImportError:
|
| 44 |
+
selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
|
| 45 |
+
|
| 46 |
+
try:
|
| 47 |
+
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
|
| 48 |
+
except ImportError:
|
| 49 |
+
causal_conv1d_update, causal_conv1d_fn = None, None
|
| 50 |
+
is_fast_path_available = all((
|
| 51 |
+
selective_state_update,
|
| 52 |
+
selective_scan_fn,
|
| 53 |
+
causal_conv1d_fn,
|
| 54 |
+
causal_conv1d_update,
|
| 55 |
+
mamba_inner_fn
|
| 56 |
+
))
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class MambaCache:
|
| 60 |
+
"""
|
| 61 |
+
Cache for mamba model which does not have attention mechanism and key value states.
|
| 62 |
+
|
| 63 |
+
Arguments:
|
| 64 |
+
config (`PretrainedConfig):
|
| 65 |
+
The configuration file defining the shape-related attributes required to initialize the static cache.
|
| 66 |
+
batch_size (`int`):
|
| 67 |
+
The batch size with which the model will be used. Note that a new instance must be instantiated if a
|
| 68 |
+
smaller batch size is used.
|
| 69 |
+
dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
|
| 70 |
+
The default `dtype` to use when initializing the layer.
|
| 71 |
+
device (`torch.device` or `str`, *optional*):
|
| 72 |
+
The device on which the cache should be initialized. Should be the same as the layer.
|
| 73 |
+
|
| 74 |
+
Attributes:
|
| 75 |
+
dtype: (`torch.dtype`):
|
| 76 |
+
The default `dtype` used to initializing the cache.
|
| 77 |
+
intermediate_size: (`int`):
|
| 78 |
+
Model's intermediate_size taken from config.
|
| 79 |
+
ssm_state_size: (`int`):
|
| 80 |
+
Model's state_size taken from config.
|
| 81 |
+
conv_kernel_size: (`int`):
|
| 82 |
+
Model's convolution kernel size taken from config
|
| 83 |
+
conv_states: (`torch.Tensor`):
|
| 84 |
+
A tensor of shape `[layer_idx, batch_size, intermediate_size, conv_kernel_size]` that holds convolutional states.
|
| 85 |
+
ssm_states: (`torch.Tensor`):
|
| 86 |
+
A tensor of shape `[layer_idx, batch_size, intermediate_size, ssm_state_size]` that holds ssm states
|
| 87 |
+
|
| 88 |
+
Example:
|
| 89 |
+
|
| 90 |
+
```python
|
| 91 |
+
>>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache
|
| 92 |
+
|
| 93 |
+
>>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
|
| 94 |
+
>>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
|
| 95 |
+
|
| 96 |
+
>>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")
|
| 97 |
+
|
| 98 |
+
>>> # Prepare a cache class and pass it to model's forward
|
| 99 |
+
>>> past_key_values = MambaCache(config=model.config, batch_size=1, device=model.device, dtype=model.dtype)
|
| 100 |
+
>>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
|
| 101 |
+
>>> outputs.past_key_values
|
| 102 |
+
MambaCache()
|
| 103 |
+
```
|
| 104 |
+
"""
|
| 105 |
+
|
| 106 |
+
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
|
| 107 |
+
def __init__(
|
| 108 |
+
self,
|
| 109 |
+
config: PretrainedConfig,
|
| 110 |
+
batch_size: int = None,
|
| 111 |
+
dtype: torch.dtype = torch.float16,
|
| 112 |
+
device: Optional[Union[torch.device, str]] = None,
|
| 113 |
+
max_batch_size: Optional[int] = None,
|
| 114 |
+
):
|
| 115 |
+
if max_batch_size is not None:
|
| 116 |
+
logger.warning_once(
|
| 117 |
+
f"The 'max_batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in "
|
| 118 |
+
"v4.46. Use the more precisely named 'batch_size' argument instead."
|
| 119 |
+
)
|
| 120 |
+
self.dtype = dtype
|
| 121 |
+
self.batch_size = batch_size or max_batch_size
|
| 122 |
+
self.intermediate_size = config.intermediate_size
|
| 123 |
+
self.ssm_state_size = config.state_size
|
| 124 |
+
self.conv_kernel_size = config.conv_kernel
|
| 125 |
+
|
| 126 |
+
self.conv_states: torch.Tensor = torch.zeros(
|
| 127 |
+
config.num_hidden_layers,
|
| 128 |
+
self.batch_size,
|
| 129 |
+
self.intermediate_size,
|
| 130 |
+
self.conv_kernel_size,
|
| 131 |
+
device=device,
|
| 132 |
+
dtype=dtype,
|
| 133 |
+
)
|
| 134 |
+
self.ssm_states: torch.Tensor = torch.zeros(
|
| 135 |
+
config.num_hidden_layers,
|
| 136 |
+
self.batch_size,
|
| 137 |
+
self.intermediate_size,
|
| 138 |
+
self.ssm_state_size,
|
| 139 |
+
device=device,
|
| 140 |
+
dtype=dtype,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
torch._dynamo.mark_static_address(self.conv_states)
|
| 144 |
+
torch._dynamo.mark_static_address(self.ssm_states)
|
| 145 |
+
|
| 146 |
+
def update_conv_state(
|
| 147 |
+
self, layer_idx: int, new_conv_state: torch.Tensor, cache_position: torch.LongTensor
|
| 148 |
+
) -> torch.Tensor:
|
| 149 |
+
conv_state = self.conv_states[layer_idx]
|
| 150 |
+
cache_position = cache_position.clamp(0, self.conv_kernel_size - 1)
|
| 151 |
+
|
| 152 |
+
conv_state = conv_state.roll(shifts=-1, dims=-1)
|
| 153 |
+
conv_state[:, :, cache_position] = new_conv_state.to(conv_state.device)
|
| 154 |
+
self.conv_states[layer_idx].zero_()
|
| 155 |
+
self.conv_states[layer_idx] += conv_state
|
| 156 |
+
return self.conv_states[layer_idx]
|
| 157 |
+
|
| 158 |
+
def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor):
|
| 159 |
+
self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states.device)
|
| 160 |
+
return self.ssm_states[layer_idx]
|
| 161 |
+
|
| 162 |
+
def reset(self):
|
| 163 |
+
self.conv_states.zero_()
|
| 164 |
+
self.ssm_states.zero_()
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
class MambaMixer(nn.Module):
|
| 168 |
+
"""
|
| 169 |
+
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
|
| 170 |
+
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
|
| 171 |
+
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
|
| 172 |
+
and is why Mamba is called **selective** state spaces)
|
| 173 |
+
"""
|
| 174 |
+
|
| 175 |
+
def __init__(self, config: MambaConfig, layer_idx: int):
|
| 176 |
+
super().__init__()
|
| 177 |
+
self.config = config
|
| 178 |
+
self.hidden_size = config.hidden_size
|
| 179 |
+
self.ssm_state_size = config.state_size
|
| 180 |
+
self.conv_kernel_size = config.conv_kernel
|
| 181 |
+
self.intermediate_size = config.intermediate_size
|
| 182 |
+
self.time_step_rank = int(config.time_step_rank)
|
| 183 |
+
self.layer_idx = layer_idx
|
| 184 |
+
self.use_conv_bias = config.use_conv_bias
|
| 185 |
+
self.conv1d = nn.Conv1d(
|
| 186 |
+
in_channels=self.intermediate_size,
|
| 187 |
+
out_channels=self.intermediate_size,
|
| 188 |
+
bias=config.use_conv_bias,
|
| 189 |
+
kernel_size=config.conv_kernel,
|
| 190 |
+
groups=self.intermediate_size,
|
| 191 |
+
padding=config.conv_kernel - 1,
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
self.activation = config.hidden_act
|
| 195 |
+
self.act = ACT2FN[config.hidden_act]
|
| 196 |
+
|
| 197 |
+
# projection of the input hidden states
|
| 198 |
+
self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
|
| 199 |
+
# selective projection used to make dt, B and C input dependant
|
| 200 |
+
self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
|
| 201 |
+
# time step projection (discretization)
|
| 202 |
+
self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)
|
| 203 |
+
|
| 204 |
+
# S4D real initialization. These are not discretized!
|
| 205 |
+
# The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
|
| 206 |
+
A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
|
| 207 |
+
A = A.expand(self.intermediate_size, -1).contiguous()
|
| 208 |
+
|
| 209 |
+
self.A_log = nn.Parameter(torch.log(A))
|
| 210 |
+
self.D = nn.Parameter(torch.ones(self.intermediate_size))
|
| 211 |
+
self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
|
| 212 |
+
self.use_bias = config.use_bias
|
| 213 |
+
|
| 214 |
+
if not is_fast_path_available:
|
| 215 |
+
logger.warning_once(
|
| 216 |
+
"The fast path is not available because on of "
|
| 217 |
+
"`(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
|
| 218 |
+
" is None. Falling back to the naive implementation. "
|
| 219 |
+
"To install follow https://github.com/state-spaces/mamba/#installation and"
|
| 220 |
+
" https://github.com/Dao-AILab/causal-conv1d"
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
def cuda_kernels_forward(
|
| 224 |
+
self,
|
| 225 |
+
hidden_states: torch.Tensor,
|
| 226 |
+
cache_params: Optional[MambaCache] = None,
|
| 227 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 228 |
+
attention_mask: Optional[torch.LongTensor] = None,
|
| 229 |
+
):
|
| 230 |
+
# 1. Gated MLP's linear projection
|
| 231 |
+
projected_states = self.in_proj(hidden_states).transpose(1, 2)
|
| 232 |
+
|
| 233 |
+
if self.training and cache_params is None: # Doesn't support outputting the states -> used for training
|
| 234 |
+
contextualized_states = mamba_inner_fn(
|
| 235 |
+
projected_states,
|
| 236 |
+
self.conv1d.weight,
|
| 237 |
+
self.conv1d.bias if self.use_conv_bias else None,
|
| 238 |
+
self.x_proj.weight,
|
| 239 |
+
self.dt_proj.weight,
|
| 240 |
+
self.out_proj.weight,
|
| 241 |
+
self.out_proj.bias.float() if self.use_bias else None,
|
| 242 |
+
-torch.exp(self.A_log.float()),
|
| 243 |
+
None, # input-dependent B
|
| 244 |
+
None, # input-dependent C
|
| 245 |
+
self.D.float(),
|
| 246 |
+
delta_bias=self.dt_proj.bias.float(),
|
| 247 |
+
delta_softplus=True,
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
else:
|
| 251 |
+
hidden_states, gate = projected_states.chunk(2, dim=1)
|
| 252 |
+
|
| 253 |
+
if attention_mask is not None:
|
| 254 |
+
hidden_states = hidden_states * attention_mask.unsqueeze(1)
|
| 255 |
+
|
| 256 |
+
# 2. Convolution sequence transformation
|
| 257 |
+
conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
|
| 258 |
+
if cache_params is not None and cache_position[0] > 0:
|
| 259 |
+
hidden_states = causal_conv1d_update(
|
| 260 |
+
hidden_states.squeeze(-1),
|
| 261 |
+
cache_params.conv_states[self.layer_idx],
|
| 262 |
+
conv_weights,
|
| 263 |
+
self.conv1d.bias,
|
| 264 |
+
self.activation,
|
| 265 |
+
)
|
| 266 |
+
hidden_states = hidden_states.unsqueeze(-1)
|
| 267 |
+
else:
|
| 268 |
+
if cache_params is not None:
|
| 269 |
+
conv_states = nn.functional.pad(
|
| 270 |
+
hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)
|
| 271 |
+
)
|
| 272 |
+
cache_params.update_conv_state(self.layer_idx, conv_states, cache_position)
|
| 273 |
+
hidden_states = causal_conv1d_fn(
|
| 274 |
+
hidden_states, conv_weights, self.conv1d.bias, activation=self.activation
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
if attention_mask is not None:
|
| 278 |
+
hidden_states = hidden_states * attention_mask.unsqueeze(1)
|
| 279 |
+
|
| 280 |
+
# 3. State Space Model sequence transformation
|
| 281 |
+
# 3.a. input varying initialization of time_step, B and C
|
| 282 |
+
ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
|
| 283 |
+
time_step, B, C = torch.split(
|
| 284 |
+
ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
|
| 285 |
+
)
|
| 286 |
+
discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)
|
| 287 |
+
|
| 288 |
+
A = -torch.exp(self.A_log.float())
|
| 289 |
+
# 3.c perform the recurrence y ← SSM(A, B, C)(x)
|
| 290 |
+
time_proj_bias = self.dt_proj.bias.float() if hasattr(self.dt_proj, "bias") else None
|
| 291 |
+
if cache_params is not None and cache_position[0] > 0:
|
| 292 |
+
scan_outputs = selective_state_update(
|
| 293 |
+
cache_params.ssm_states[self.layer_idx],
|
| 294 |
+
hidden_states[..., 0],
|
| 295 |
+
discrete_time_step[..., 0],
|
| 296 |
+
A,
|
| 297 |
+
B[:, 0],
|
| 298 |
+
C[:, 0],
|
| 299 |
+
self.D,
|
| 300 |
+
gate[..., 0],
|
| 301 |
+
time_proj_bias,
|
| 302 |
+
dt_softplus=True,
|
| 303 |
+
).unsqueeze(-1)
|
| 304 |
+
else:
|
| 305 |
+
scan_outputs, ssm_state = selective_scan_fn(
|
| 306 |
+
hidden_states,
|
| 307 |
+
discrete_time_step,
|
| 308 |
+
A,
|
| 309 |
+
B.transpose(1, 2),
|
| 310 |
+
C.transpose(1, 2),
|
| 311 |
+
self.D.float(),
|
| 312 |
+
gate,
|
| 313 |
+
time_proj_bias,
|
| 314 |
+
delta_softplus=True,
|
| 315 |
+
return_last_state=True,
|
| 316 |
+
)
|
| 317 |
+
if ssm_state is not None and cache_params is not None:
|
| 318 |
+
cache_params.update_ssm_state(self.layer_idx, ssm_state)
|
| 319 |
+
|
| 320 |
+
# 4. Final linear projection
|
| 321 |
+
contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
|
| 322 |
+
return contextualized_states
|
| 323 |
+
|
| 324 |
+
def slow_forward(
|
| 325 |
+
self,
|
| 326 |
+
input_states,
|
| 327 |
+
cache_params: Optional[MambaCache] = None,
|
| 328 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 329 |
+
attention_mask: Optional[torch.LongTensor] = None
|
| 330 |
+
):
|
| 331 |
+
batch_size, seq_len, _ = input_states.shape
|
| 332 |
+
dtype = input_states.dtype
|
| 333 |
+
# 1. Gated MLP's linear projection
|
| 334 |
+
# [batch, 2 * intermediate_size, seq_len]
|
| 335 |
+
projected_states = self.in_proj(input_states).transpose(1, 2)
|
| 336 |
+
hidden_states, gate = projected_states.chunk(2, dim=1)
|
| 337 |
+
|
| 338 |
+
if attention_mask is not None:
|
| 339 |
+
hidden_states = hidden_states * attention_mask.unsqueeze(1)
|
| 340 |
+
|
| 341 |
+
# 2. Convolution sequence transformation
|
| 342 |
+
if cache_params is not None:
|
| 343 |
+
ssm_state = cache_params.ssm_states[self.layer_idx].clone()
|
| 344 |
+
ssm_state = ssm_state.to(hidden_states.device)
|
| 345 |
+
# use `cache_position.shape[0]` to check whether we are in prefill
|
| 346 |
+
# stage, it's equivalent to check `cache_position[0] == 0`, which
|
| 347 |
+
# breaks dynamo fullgraph constraints
|
| 348 |
+
if cache_position.shape[0] == self.conv_kernel_size:
|
| 349 |
+
conv_state = nn.functional.pad(
|
| 350 |
+
hidden_states,
|
| 351 |
+
(self.conv_kernel_size - hidden_states.shape[-1], 0)
|
| 352 |
+
)
|
| 353 |
+
|
| 354 |
+
cache_params.update_conv_state(self.layer_idx, conv_state, cache_position)
|
| 355 |
+
# [batch, intermediate_size, seq_len]
|
| 356 |
+
hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
|
| 357 |
+
else:
|
| 358 |
+
conv_state = cache_params.update_conv_state(self.layer_idx, hidden_states, cache_position)
|
| 359 |
+
hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
|
| 360 |
+
if self.use_conv_bias:
|
| 361 |
+
hidden_states += self.conv1d.bias
|
| 362 |
+
# [batch, intermediate_size, 1] : decoding
|
| 363 |
+
hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)
|
| 364 |
+
else:
|
| 365 |
+
ssm_state = torch.zeros(
|
| 366 |
+
(batch_size, self.intermediate_size, self.ssm_state_size),
|
| 367 |
+
device=hidden_states.device, dtype=dtype
|
| 368 |
+
)
|
| 369 |
+
# [batch, intermediate_size, seq_len]
|
| 370 |
+
hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
|
| 371 |
+
|
| 372 |
+
if attention_mask is not None:
|
| 373 |
+
hidden_states = hidden_states * attention_mask.unsqueeze(1)
|
| 374 |
+
|
| 375 |
+
# 3. State Space Model sequence transformation
|
| 376 |
+
# 3.a. Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
|
| 377 |
+
ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
|
| 378 |
+
time_step, B, C = torch.split(
|
| 379 |
+
ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
|
| 380 |
+
)
|
| 381 |
+
# [batch, seq_len, intermediate_size]
|
| 382 |
+
discrete_time_step = self.dt_proj(time_step)
|
| 383 |
+
# [batch, intermediate_size, seq_len]
|
| 384 |
+
discrete_time_step = nn.functional.softplus(discrete_time_step).transpose(1, 2)
|
| 385 |
+
|
| 386 |
+
# 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM)
|
| 387 |
+
# [intermediate_size, ssm_state_size]
|
| 388 |
+
A = -torch.exp(self.A_log.float())
|
| 389 |
+
# [batch, intermediate_size, seq_len, ssm_state_size]
|
| 390 |
+
discrete_A = torch.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None])
|
| 391 |
+
# [batch, intermediate_size, seq_len, ssm_state_size]
|
| 392 |
+
discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float()
|
| 393 |
+
deltaB_u = discrete_B * hidden_states[:, :, :, None].float()
|
| 394 |
+
|
| 395 |
+
# 3.c perform the recurrence y ← SSM(A, B, C)(x)
|
| 396 |
+
scan_outputs = []
|
| 397 |
+
for i in range(seq_len):
|
| 398 |
+
# [batch, intermediade_size, ssm_state]
|
| 399 |
+
ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]
|
| 400 |
+
# [batch, intermediade_size, 1]
|
| 401 |
+
scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1))
|
| 402 |
+
scan_outputs.append(scan_output[:, :, 0])
|
| 403 |
+
# [batch, seq_len, intermediade_size]
|
| 404 |
+
scan_output = torch.stack(scan_outputs, dim=-1)
|
| 405 |
+
scan_output = scan_output + (hidden_states * self.D[None, :, None])
|
| 406 |
+
scan_output = (scan_output * self.act(gate))
|
| 407 |
+
|
| 408 |
+
if cache_params is not None:
|
| 409 |
+
cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
|
| 410 |
+
|
| 411 |
+
# 4. Final linear projection
|
| 412 |
+
# [batch, seq_len, hidden_size]
|
| 413 |
+
contextualized_states = self.out_proj(scan_output.transpose(1, 2))
|
| 414 |
+
return contextualized_states
|
| 415 |
+
# fmt: on
|
| 416 |
+
|
| 417 |
+
def forward(
|
| 418 |
+
self,
|
| 419 |
+
hidden_states,
|
| 420 |
+
cache_params: Optional[MambaCache] = None,
|
| 421 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 422 |
+
attention_mask: Optional[torch.LongTensor] = None,
|
| 423 |
+
):
|
| 424 |
+
if is_fast_path_available and "cuda" in self.x_proj.weight.device.type:
|
| 425 |
+
return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
|
| 426 |
+
return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask)
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
class MambaBlock(nn.Module):
|
| 430 |
+
def __init__(self, config, layer_idx):
|
| 431 |
+
super().__init__()
|
| 432 |
+
self.config = config
|
| 433 |
+
self.layer_idx = layer_idx
|
| 434 |
+
self.residual_in_fp32 = config.residual_in_fp32
|
| 435 |
+
self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
|
| 436 |
+
self.mixer = MambaMixer(config, layer_idx=layer_idx)
|
| 437 |
+
|
| 438 |
+
def forward(
|
| 439 |
+
self,
|
| 440 |
+
hidden_states,
|
| 441 |
+
cache_params: Optional[MambaCache] = None,
|
| 442 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 443 |
+
attention_mask: Optional[torch.LongTensor] = None,
|
| 444 |
+
):
|
| 445 |
+
residual = hidden_states
|
| 446 |
+
hidden_states = self.norm(hidden_states)
|
| 447 |
+
if self.residual_in_fp32:
|
| 448 |
+
residual = residual.to(torch.float32)
|
| 449 |
+
|
| 450 |
+
hidden_states = self.mixer(
|
| 451 |
+
hidden_states, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask
|
| 452 |
+
)
|
| 453 |
+
hidden_states = residual + hidden_states
|
| 454 |
+
if self.residual_in_fp32:
|
| 455 |
+
hidden_states = hidden_states.to(dtype=self.norm.weight.dtype)
|
| 456 |
+
return hidden_states
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
class MambaPreTrainedModel(PreTrainedModel):
|
| 460 |
+
"""
|
| 461 |
+
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
| 462 |
+
models.
|
| 463 |
+
"""
|
| 464 |
+
|
| 465 |
+
config_class = MambaConfig
|
| 466 |
+
base_model_prefix = "backbone"
|
| 467 |
+
_no_split_modules = ["MambaBlock", "MambaMixer"]
|
| 468 |
+
supports_gradient_checkpointing = True
|
| 469 |
+
_is_stateful = True
|
| 470 |
+
|
| 471 |
+
def _init_weights(self, module):
|
| 472 |
+
"""Initialize the weights."""
|
| 473 |
+
if isinstance(module, nn.Linear):
|
| 474 |
+
nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
|
| 475 |
+
if module.bias is not None:
|
| 476 |
+
if not getattr(module.bias, "_no_reinit", False):
|
| 477 |
+
nn.init.zeros_(module.bias)
|
| 478 |
+
elif isinstance(module, MambaMixer):
|
| 479 |
+
module.A_log._no_weight_decay = True
|
| 480 |
+
module.D._no_weight_decay = True
|
| 481 |
+
|
| 482 |
+
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
|
| 483 |
+
if self.config.time_step_init_scheme == "constant":
|
| 484 |
+
nn.init.constant_(module.dt_proj.weight, dt_init_std)
|
| 485 |
+
elif self.config.time_step_init_scheme == "random":
|
| 486 |
+
nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)
|
| 487 |
+
|
| 488 |
+
dt = torch.exp(
|
| 489 |
+
torch.rand(self.config.intermediate_size)
|
| 490 |
+
* (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
|
| 491 |
+
+ math.log(self.config.time_step_min)
|
| 492 |
+
).clamp(min=self.config.time_step_floor)
|
| 493 |
+
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
| 494 |
+
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
| 495 |
+
with torch.no_grad():
|
| 496 |
+
module.dt_proj.bias.data = nn.Parameter(inv_dt.to(module.dt_proj.bias.device))
|
| 497 |
+
module.dt_proj.bias._no_reinit = True
|
| 498 |
+
elif isinstance(module, nn.Embedding):
|
| 499 |
+
nn.init.normal_(module.weight, std=self.config.initializer_range)
|
| 500 |
+
elif hasattr(module, 'reset_parameters'):
|
| 501 |
+
module.reset_parameters()
|
| 502 |
+
|
| 503 |
+
if self.config.rescale_prenorm_residual:
|
| 504 |
+
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
| 505 |
+
# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
|
| 506 |
+
# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
|
| 507 |
+
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
| 508 |
+
#
|
| 509 |
+
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
| 510 |
+
for name, p in module.named_parameters():
|
| 511 |
+
if name in ["out_proj.weight"]:
|
| 512 |
+
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
| 513 |
+
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
| 514 |
+
# We need to reinit p since this code could be called multiple times
|
| 515 |
+
# Having just p *= scale would repeatedly scale it down
|
| 516 |
+
nn.init.kaiming_uniform_(p, a=math.sqrt(5))
|
| 517 |
+
with torch.no_grad():
|
| 518 |
+
p /= math.sqrt(self.config.num_hidden_layers)
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
@dataclass
|
| 522 |
+
class MambaOutput(ModelOutput):
|
| 523 |
+
"""
|
| 524 |
+
Class for the MAMBA model outputs.
|
| 525 |
+
|
| 526 |
+
Args:
|
| 527 |
+
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
| 528 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
| 529 |
+
cache_params (`MambaCache`):
|
| 530 |
+
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
|
| 531 |
+
avoid providing the old `input_ids`.
|
| 532 |
+
|
| 533 |
+
Includes both the State space model state matrices after the selective scan, and the Convolutional states
|
| 534 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*,
|
| 535 |
+
returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 536 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
| 537 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
| 538 |
+
|
| 539 |
+
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
| 540 |
+
"""
|
| 541 |
+
|
| 542 |
+
last_hidden_state: Optional[torch.FloatTensor] = None
|
| 543 |
+
cache_params: Optional[MambaCache] = None
|
| 544 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
@dataclass
|
| 548 |
+
class MambaCausalLMOutput(ModelOutput):
|
| 549 |
+
"""
|
| 550 |
+
Base class for causal language model (or autoregressive) outputs.
|
| 551 |
+
|
| 552 |
+
Args:
|
| 553 |
+
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
| 554 |
+
Language modeling loss (for next-token prediction).
|
| 555 |
+
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
| 556 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
| 557 |
+
cache_params (`MambaCache`):
|
| 558 |
+
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
|
| 559 |
+
avoid providing the old `input_ids`.
|
| 560 |
+
|
| 561 |
+
Includes both the State space model state matrices after the selective scan, and the Convolutional states
|
| 562 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*,
|
| 563 |
+
returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 564 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
| 565 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
| 566 |
+
|
| 567 |
+
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
| 568 |
+
"""
|
| 569 |
+
|
| 570 |
+
loss: Optional[torch.FloatTensor] = None
|
| 571 |
+
logits: Optional[torch.FloatTensor] = None
|
| 572 |
+
cache_params: Optional[MambaCache] = None
|
| 573 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
class MambaModel(MambaPreTrainedModel):
|
| 577 |
+
def __init__(self, config):
|
| 578 |
+
super().__init__(config)
|
| 579 |
+
|
| 580 |
+
self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
|
| 581 |
+
self.layers = nn.ModuleList([MambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)])
|
| 582 |
+
|
| 583 |
+
self.gradient_checkpointing = False
|
| 584 |
+
self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
|
| 585 |
+
# Initialize weights and apply final processing
|
| 586 |
+
self._register_load_state_dict_pre_hook(self.load_hook)
|
| 587 |
+
self.post_init()
|
| 588 |
+
|
| 589 |
+
def load_hook(self, state_dict, prefix, *args):
|
| 590 |
+
for k in state_dict:
|
| 591 |
+
if "embedding." in k:
|
| 592 |
+
state_dict[k.replace("embedding.", "embeddings.")] = state_dict.pop(k)
|
| 593 |
+
break
|
| 594 |
+
|
| 595 |
+
def get_input_embeddings(self):
|
| 596 |
+
return self.embeddings
|
| 597 |
+
|
| 598 |
+
def set_input_embeddings(self, new_embeddings):
|
| 599 |
+
self.embeddings = new_embeddings
|
| 600 |
+
|
| 601 |
+
def forward(
|
| 602 |
+
self,
|
| 603 |
+
input_ids: Optional[torch.LongTensor] = None,
|
| 604 |
+
inputs_embeds: Optional[torch.LongTensor] = None,
|
| 605 |
+
cache_params: Optional[MambaCache] = None,
|
| 606 |
+
use_cache: Optional[bool] = None,
|
| 607 |
+
output_hidden_states: Optional[bool] = None,
|
| 608 |
+
return_dict: Optional[bool] = None,
|
| 609 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 610 |
+
attention_mask: Optional[torch.LongTensor] = None,
|
| 611 |
+
) -> Union[Tuple, MambaOutput]:
|
| 612 |
+
output_hidden_states = (
|
| 613 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 614 |
+
)
|
| 615 |
+
use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
|
| 616 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 617 |
+
|
| 618 |
+
if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor
|
| 619 |
+
raise ValueError(
|
| 620 |
+
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
|
| 621 |
+
)
|
| 622 |
+
|
| 623 |
+
if inputs_embeds is None:
|
| 624 |
+
inputs_embeds = self.embeddings(input_ids)
|
| 625 |
+
|
| 626 |
+
if self.gradient_checkpointing and self.training and use_cache:
|
| 627 |
+
use_cache = False
|
| 628 |
+
|
| 629 |
+
if use_cache:
|
| 630 |
+
if cache_params is None:
|
| 631 |
+
cache_params = MambaCache(
|
| 632 |
+
self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype
|
| 633 |
+
)
|
| 634 |
+
cache_position = torch.arange(0, self.config.conv_kernel, device=inputs_embeds.device)
|
| 635 |
+
elif cache_position is None:
|
| 636 |
+
# cases when we do manual forward instead of using `model.generate` which will initiate
|
| 637 |
+
# `cache_position` and makes sure it is not None, throw error here instead of doing some
|
| 638 |
+
# hack to conjecture the current cache position
|
| 639 |
+
raise ValueError(
|
| 640 |
+
"You have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, "
|
| 641 |
+
"you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will "
|
| 642 |
+
"be initialized for you automatically"
|
| 643 |
+
)
|
| 644 |
+
else:
|
| 645 |
+
cache_params = None
|
| 646 |
+
|
| 647 |
+
hidden_states = inputs_embeds
|
| 648 |
+
all_hidden_states = () if output_hidden_states else None
|
| 649 |
+
for mixer_block in self.layers:
|
| 650 |
+
if self.gradient_checkpointing and self.training:
|
| 651 |
+
hidden_states = self._gradient_checkpointing_func(
|
| 652 |
+
mixer_block.__call__, hidden_states, cache_params, cache_position, attention_mask
|
| 653 |
+
)
|
| 654 |
+
else:
|
| 655 |
+
hidden_states = mixer_block(
|
| 656 |
+
hidden_states,
|
| 657 |
+
cache_params=cache_params,
|
| 658 |
+
cache_position=cache_position,
|
| 659 |
+
attention_mask=attention_mask,
|
| 660 |
+
)
|
| 661 |
+
|
| 662 |
+
if output_hidden_states:
|
| 663 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 664 |
+
|
| 665 |
+
hidden_states = self.norm_f(hidden_states)
|
| 666 |
+
|
| 667 |
+
if output_hidden_states:
|
| 668 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 669 |
+
|
| 670 |
+
if not return_dict:
|
| 671 |
+
return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None)
|
| 672 |
+
|
| 673 |
+
return MambaOutput(
|
| 674 |
+
last_hidden_state=hidden_states,
|
| 675 |
+
cache_params=cache_params if use_cache else None,
|
| 676 |
+
hidden_states=all_hidden_states,
|
| 677 |
+
)
|
| 678 |
+
|
| 679 |
+
|
| 680 |
+
class MambaForCausalLM(MambaPreTrainedModel, GenerationMixin):
|
| 681 |
+
|
| 682 |
+
_tied_weights_keys = ["lm_head.weight"]
|
| 683 |
+
|
| 684 |
+
def __init__(self, config):
|
| 685 |
+
super().__init__(config)
|
| 686 |
+
self.backbone = MambaModel(config)
|
| 687 |
+
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
| 688 |
+
self.criterion = None
|
| 689 |
+
|
| 690 |
+
# Initialize weights and apply final processing
|
| 691 |
+
self.post_init()
|
| 692 |
+
|
| 693 |
+
def get_output_embeddings(self):
|
| 694 |
+
return self.lm_head
|
| 695 |
+
|
| 696 |
+
def set_output_embeddings(self, new_embeddings):
|
| 697 |
+
self.lm_head = new_embeddings
|
| 698 |
+
|
| 699 |
+
def get_input_embeddings(self):
|
| 700 |
+
return self.backbone.get_input_embeddings()
|
| 701 |
+
|
| 702 |
+
def set_input_embeddings(self, new_embeddings):
|
| 703 |
+
return self.backbone.set_input_embeddings(new_embeddings)
|
| 704 |
+
|
| 705 |
+
def _update_model_kwargs_for_generation(
|
| 706 |
+
self, outputs: ModelOutput,
|
| 707 |
+
model_kwargs: Dict[str, Any],
|
| 708 |
+
num_new_tokens: int = 1,
|
| 709 |
+
**kwargs
|
| 710 |
+
) -> Dict[str, Any]:
|
| 711 |
+
model_kwargs["cache_params"] = outputs.get("cache_params", None)
|
| 712 |
+
if (
|
| 713 |
+
model_kwargs.get("use_cache", True)
|
| 714 |
+
and "cache_position" in model_kwargs
|
| 715 |
+
and model_kwargs["cache_position"] is not None
|
| 716 |
+
):
|
| 717 |
+
model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
|
| 718 |
+
|
| 719 |
+
if "attention_mask" in model_kwargs:
|
| 720 |
+
attention_mask = model_kwargs["attention_mask"]
|
| 721 |
+
model_kwargs["attention_mask"] = torch.cat(
|
| 722 |
+
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
|
| 723 |
+
)
|
| 724 |
+
|
| 725 |
+
return model_kwargs
|
| 726 |
+
|
| 727 |
+
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
| 728 |
+
def prepare_inputs_for_generation(
|
| 729 |
+
self,
|
| 730 |
+
input_ids,
|
| 731 |
+
inputs_embeds=None,
|
| 732 |
+
use_cache=None,
|
| 733 |
+
cache_params: Optional[MambaCache] = None,
|
| 734 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 735 |
+
attention_mask: Optional[torch.LongTensor] = None,
|
| 736 |
+
logits_to_keep: Optional[int] = None,
|
| 737 |
+
**kwargs,
|
| 738 |
+
):
|
| 739 |
+
if use_cache:
|
| 740 |
+
# `cache_position` should have been initialized in `generate`
|
| 741 |
+
if cache_position is None:
|
| 742 |
+
raise ValueError(
|
| 743 |
+
"`cache_position` should not be None as it should have been initialized in "
|
| 744 |
+
"`model.generate`, you are responsible for passing in a valid `cache_position` if "
|
| 745 |
+
"you are calling `prepare_inputs_for_generation` directly with `use_cache=True`"
|
| 746 |
+
)
|
| 747 |
+
if cache_position[0] > 0:
|
| 748 |
+
input_ids = input_ids[:, -1].unsqueeze(-1)
|
| 749 |
+
|
| 750 |
+
if attention_mask is not None:
|
| 751 |
+
attention_mask = None
|
| 752 |
+
|
| 753 |
+
else:
|
| 754 |
+
# we initialize the `cache_position` to full size of `conv_states` at prefill stage
|
| 755 |
+
# considering padding will be applied when input length is shorter, and truncation
|
| 756 |
+
# will be applied when it is longer, so it will be equivalent to always have it match
|
| 757 |
+
# the length of `cache_params.conv_states`, which is `config.conv_kernel`
|
| 758 |
+
cache_position = torch.arange(0, self.config.conv_kernel, device=input_ids.device)
|
| 759 |
+
|
| 760 |
+
if inputs_embeds is not None and cache_params is None:
|
| 761 |
+
model_inputs = {"inputs_embeds": inputs_embeds}
|
| 762 |
+
else:
|
| 763 |
+
model_inputs = {"input_ids": input_ids.contiguous()}
|
| 764 |
+
|
| 765 |
+
if logits_to_keep is not None:
|
| 766 |
+
model_inputs['logits_to_keep'] = logits_to_keep
|
| 767 |
+
|
| 768 |
+
model_inputs.update({
|
| 769 |
+
            'cache_params': cache_params,
            'use_cache': use_cache,
            'cache_position': cache_position,
            'attention_mask': attention_mask,
            'logits_to_keep': logits_to_keep,
        })
        return model_inputs

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_params: Optional[MambaCache] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        logits_to_keep: Optional[int] = 0,
        **kwargs,  # for now we need this for generation
    ) -> Union[Tuple, MambaCausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        mamba_outputs = self.backbone(
            input_ids,
            cache_params=cache_params,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            use_cache=use_cache,
            cache_position=cache_position,
            attention_mask=attention_mask,
        )
        hidden_states = mamba_outputs[0]
        fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training

        loss, logits = None, None
        if not fuse_linear_and_cross_entropy or labels is None:
            logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:])
        if labels is not None:
            if getattr(self, 'criterion', None) is None:
                if fuse_linear_and_cross_entropy:
                    criterion = FusedLinearCrossEntropyLoss()
                elif self.config.fuse_cross_entropy:
                    criterion = FusedCrossEntropyLoss(inplace_backward=True)
                else:
                    criterion = nn.CrossEntropyLoss()
            else:
                criterion = self.criterion
            # Enable model parallelism
            labels = labels.to(hidden_states.device)
            labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1)
            if fuse_linear_and_cross_entropy:
                loss = criterion(hidden_states, labels, self.lm_head.weight, self.lm_head.bias)
            else:
                loss = criterion(logits.view(labels.numel(), -1), labels.view(-1))

        if not return_dict:
            output = (logits,) + mamba_outputs[1:]
            return (loss,) + output if loss is not None else output

        return MambaCausalLMOutput(
            loss=loss,
            logits=logits,
            cache_params=mamba_outputs.cache_params,
            hidden_states=mamba_outputs.hidden_states,
        )
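The fused-loss branch above never materializes the logits: `FusedLinearCrossEntropyLoss` receives the hidden states, the shifted labels, and the `lm_head` weight/bias directly. A minimal sketch of the unfused equivalent in plain PyTorch (hypothetical tensor sizes, not part of the uploaded files) shows what both branches compute:

import torch
import torch.nn.functional as F

B, T, D, V = 2, 8, 16, 100                      # hypothetical batch / seq / hidden / vocab sizes
hidden_states = torch.randn(B, T, D)
weight, bias = torch.randn(V, D), torch.randn(V)
labels = torch.randint(0, V, (B, T))

# shift labels left by one and pad the last position with ignore_index (-100)
shift_labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], -100)), 1)

# unfused path: materialize (B, T, V) logits, then cross-entropy; the fused loss
# computes the same quantity without ever allocating the logits tensor
logits = F.linear(hidden_states, weight, bias)
loss = F.cross_entropy(logits.view(-1, V), shift_labels.view(-1), ignore_index=-100)
print(loss)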
fla/models/nsa/__init__.py
ADDED
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from fla.models.nsa.configuration_nsa import NSAConfig
from fla.models.nsa.modeling_nsa import NSAForCausalLM, NSAModel

AutoConfig.register(NSAConfig.model_type, NSAConfig)
AutoModel.register(NSAConfig, NSAModel)
AutoModelForCausalLM.register(NSAConfig, NSAForCausalLM)


__all__ = [
    'NSAConfig', 'NSAModel', 'NSAForCausalLM',
]
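Because the config and model classes are registered with the Auto classes above, the NSA model can be built through the generic `transformers` factory once the package has been imported. A minimal sketch (default hyperparameters, assuming the `fla` package and its NSA kernels are installed; not part of the uploaded files):

from transformers import AutoModelForCausalLM

from fla.models.nsa import NSAConfig  # importing the package runs the registrations above

config = NSAConfig()                           # default hyperparameters
model = AutoModelForCausalLM.from_config(config)
print(type(model).__name__)                    # NSAForCausalLM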
fla/models/nsa/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (653 Bytes).

fla/models/retnet/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (678 Bytes).
fla/models/rwkv6/__init__.py
ADDED
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-

from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from fla.models.rwkv6.configuration_rwkv6 import RWKV6Config
from fla.models.rwkv6.modeling_rwkv6 import RWKV6ForCausalLM, RWKV6Model

AutoConfig.register(RWKV6Config.model_type, RWKV6Config, True)
AutoModel.register(RWKV6Config, RWKV6Model, True)
AutoModelForCausalLM.register(RWKV6Config, RWKV6ForCausalLM, True)


__all__ = ['RWKV6Config', 'RWKV6ForCausalLM', 'RWKV6Model']
fla/models/rwkv6/configuration_rwkv6.py
ADDED
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-

from typing import Dict, Optional

from transformers.configuration_utils import PretrainedConfig


class RWKV6Config(PretrainedConfig):

    model_type = 'rwkv6'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        attn_mode: str = "chunk",
        hidden_size: int = 2048,
        expand_k: float = 0.5,
        expand_v: float = 1,
        hidden_ratio: Optional[float] = 3.5,
        intermediate_size: Optional[int] = None,
        num_hidden_layers: int = 24,
        num_heads: int = 4,
        proj_low_rank_dim: int = 32,
        gate_low_rank_dim: int = 64,
        hidden_act: str = "sqrelu",
        max_position_embeddings: int = 2048,
        norm_first: bool = True,
        norm_bias: bool = True,
        norm_eps: float = 1e-5,
        attn: Optional[Dict] = None,
        use_cache: bool = True,
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        initializer_range: float = 0.006,
        fuse_norm: bool = True,
        fuse_cross_entropy: bool = True,
        vocab_size: int = 32000,
        **kwargs
    ):
        self.attn_mode = attn_mode
        self.hidden_size = hidden_size
        self.expand_k = expand_k
        self.expand_v = expand_v
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.norm_first = norm_first
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.proj_low_rank_dim = proj_low_rank_dim
        self.gate_low_rank_dim = gate_low_rank_dim
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.norm_bias = norm_bias
        self.norm_eps = norm_eps
        self.attn = attn
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.fuse_norm = fuse_norm
        self.fuse_cross_entropy = fuse_cross_entropy
        self.vocab_size = vocab_size

        if attn is not None:
            if not isinstance(attn, Dict):
                raise ValueError("attn must be a dictionary")
            if 'layers' not in attn:
                raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
            if 'num_heads' not in attn:
                raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
            attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads'])
            attn['qkv_bias'] = attn.get('qkv_bias', False)
            attn['window_size'] = attn.get('window_size', None)
            attn['rope_theta'] = attn.get('rope_theta', 10000.)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
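When a hybrid attention config is passed, `__init__` requires the `layers` and `num_heads` keys and fills the remaining attention fields with the defaults shown above. A short sketch of that behaviour (hypothetical sizes, assuming the package is importable; not part of the uploaded files):

from fla.models.rwkv6 import RWKV6Config

# 'layers' and 'num_heads' are mandatory; num_kv_heads, qkv_bias, window_size and
# rope_theta are filled in with the defaults from __init__
config = RWKV6Config(
    hidden_size=1024,
    num_hidden_layers=12,
    attn={'layers': [3, 7, 11], 'num_heads': 16},
)
print(config.attn)
# {'layers': [3, 7, 11], 'num_heads': 16, 'num_kv_heads': 16,
#  'qkv_bias': False, 'window_size': None, 'rope_theta': 10000.0}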
fla/models/rwkv7/__pycache__/configuration_rwkv7.cpython-312.pyc
ADDED
Binary file (4.24 kB).
fla/models/rwkv7/modeling_rwkv7.py
ADDED
@@ -0,0 +1,505 @@
# -*- coding: utf-8 -*-

from __future__ import annotations

import math
import warnings
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from transformers.utils.deprecation import deprecate_kwarg

from fla.layers.attn import Attention
from fla.layers.rwkv7 import RWKV7Attention
from fla.models.rwkv7.configuration_rwkv7 import RWKV7Config
from fla.models.utils import Cache
from fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss, LayerNorm
from fla.modules.activations import ACT2FN

if TYPE_CHECKING:
    from transformers.processing_utils import Unpack

logger = logging.get_logger(__name__)


class RWKV7FeedForward(nn.Module):

    def __init__(
        self,
        hidden_size: int,
        hidden_ratio: Optional[int] = None,
        intermediate_size: Optional[int] = None,
        hidden_act: str = 'sqrelu',
        layer_idx: int = None
    ) -> RWKV7FeedForward:
        super().__init__()

        self.hidden_size = hidden_size
        if hidden_ratio is None:
            hidden_ratio = 4
        if intermediate_size is None:
            intermediate_size = int(hidden_size * hidden_ratio)
            intermediate_size = 32 * ((intermediate_size + 32 - 1) // 32)
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size

        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))

        self.x_k = nn.Parameter(torch.zeros(hidden_size))

        self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.value = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = ACT2FN[hidden_act]

        self.layer_idx = layer_idx

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        state: Optional[Cache] = None
    ) -> torch.Tensor:
        if attention_mask is not None:
            x = x.mul(attention_mask[:, -x.shape[-2]:, None])
        if x.shape[1] == 1 and state is not None and state[self.layer_idx]['ffn_state'] is not None:
            shifted = state[self.layer_idx]['ffn_state'].unsqueeze(1)
        else:
            shifted = self.time_shift(x)
            if state is not None and state[self.layer_idx]['ffn_state'] is not None:
                shifted[:, 0] = state[self.layer_idx]['ffn_state'][-1]
        if state is not None:
            # no need to update the offset twice
            state.update(ffn_state=x[:, -1], layer_idx=self.layer_idx, offset=0)
        return self.value(self.act_fn(self.key(x.addcmul(shifted - x, self.x_k)))), state


class RWKV7Block(nn.Module):

    def __init__(
        self,
        config: RWKV7Config,
        layer_idx: int
    ) -> RWKV7Block:
        super().__init__()

        self.config = config
        self.layer_idx = layer_idx

        if config.norm_first and layer_idx == 0:
            self.pre_norm = (LayerNorm if config.fuse_norm else nn.LayerNorm)(
                config.hidden_size,
                bias=config.norm_bias,
                eps=config.norm_eps
            )
        self.attn_norm = (LayerNorm if config.fuse_norm else nn.LayerNorm)(
            config.hidden_size,
            bias=config.norm_bias,
            eps=config.norm_eps
        )
        if config.attn is not None and layer_idx in config.attn['layers']:
            self.attn = Attention(
                hidden_size=config.hidden_size,
                num_heads=config.attn['num_heads'],
                num_kv_heads=config.attn['num_kv_heads'],
                qkv_bias=config.attn['qkv_bias'],
                window_size=config.attn['window_size'],
                rope_theta=config.attn['rope_theta'],
                max_position_embeddings=config.max_position_embeddings,
                layer_idx=layer_idx
            )
        else:
            self.attn = RWKV7Attention(
                mode=config.attn_mode,
                hidden_size=config.hidden_size,
                head_dim=config.head_dim,
                num_heads=config.num_heads,
                decay_low_rank_dim=config.decay_low_rank_dim,
                gate_low_rank_dim=config.gate_low_rank_dim,
                a_low_rank_dim=config.a_low_rank_dim,
                v_low_rank_dim=config.v_low_rank_dim,
                norm_eps=config.norm_eps,
                fuse_norm=config.fuse_norm,
                layer_idx=layer_idx,
                value_dim=config.value_dim[layer_idx]
            )
        self.ffn_norm = (LayerNorm if config.fuse_norm else nn.LayerNorm)(
            config.hidden_size,
            bias=config.norm_bias,
            eps=config.norm_eps
        )
        self.ffn = RWKV7FeedForward(
            hidden_size=config.hidden_size,
            hidden_ratio=config.hidden_ratio,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            layer_idx=layer_idx
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        v_first: torch.Tensor = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = self.pre_norm(hidden_states) if hasattr(self, 'pre_norm') else hidden_states
        hidden_states = self.attn_norm(residual)
        hidden_states, attentions, past_key_values, v_first = self.attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            v_first=v_first,
            **kwargs
        )
        if self.config.fuse_norm:
            hidden_states, residual = self.ffn_norm(hidden_states, residual, True)
        else:
            hidden_states = residual + hidden_states
            residual = hidden_states
            hidden_states = self.ffn_norm(hidden_states)
        hidden_states, past_key_values = self.ffn(hidden_states, attention_mask, past_key_values)
        hidden_states = residual + hidden_states

        outputs = (hidden_states, attentions, past_key_values, v_first)

        return outputs


class RWKV7PreTrainedModel(PreTrainedModel):

    config_class = RWKV7Config
    base_model_prefix = 'model'
    supports_gradient_checkpointing = True
    _no_split_modules = ['RWKV7Block']
    _supports_cache_class = True
    _skip_keys_device_placement = ["past_key_values"]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(
        self,
        module: nn.Module,
        rescale_prenorm_residual: bool = True,
        num_residuals_per_layer: int = 2,
    ):
        warnings.warn(
            "RWKV-7 employs a carefully designed initialization strategy tailored to its architecture. "
            "The detailed initialization scheme is currently not implemented here but can be found in the "
            "official code repository. We emphasize that using the recommended initialization is essential "
            "for replicating the results in the RWKV-7 paper. Deviations from the prescribed initialization "
            "may lead to performance degradation.\n"
            "Alternatively, please generate initial weights from the official RWKV code repository, and "
            "convert the PyTorch checkpoint into the FLA supported format."
        )
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Parameter):
            nn.init.normal_(module, mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
        elif hasattr(module, 'reset_parameters'):
            module.reset_parameters()

        if rescale_prenorm_residual:
            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
            # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
            # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
            # > -- GPT-2 :: https://openai.com/blog/better-language-models/
            #
            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
            p = None
            if hasattr(module, 'o_proj'):
                p = module.o_proj.weight
            elif hasattr(module, 'down_proj'):
                p = module.down_proj.weight
            if p is not None:
                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
                # We need to reinit p since this code could be called multiple times
                # Having just p *= scale would repeatedly scale it down
                nn.init.kaiming_uniform_(p, a=math.sqrt(5))
                with torch.no_grad():
                    p /= math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers)


class RWKV7Model(RWKV7PreTrainedModel):

    def __init__(self, config: RWKV7Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([RWKV7Block(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
        self.norm = (LayerNorm if config.fuse_norm else nn.LayerNorm)(
            config.hidden_size,
            bias=config.norm_bias,
            eps=config.norm_eps
        )

        self.gradient_checkpointing = False

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings

    def set_input_embeddings(self, value):
        self.embeddings = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,  # noqa
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[Dict]
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        if output_attentions:
            warnings.warn("`RWKV7Model` does not `output_attentions` now, setting it to `False`.")
            output_attentions = False
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embeddings(input_ids)
        hidden_states = inputs_embeds

        if use_cache and not isinstance(past_key_values, Cache):
            past_key_values = Cache.from_legacy_cache(past_key_values)

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
            use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_attns = () if output_attentions else None

        v_first = torch.zeros_like(hidden_states)
        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                hidden_states, attentions, past_key_values, v_first = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    attention_mask,
                    past_key_values,
                    use_cache,
                    output_attentions,
                    v_first,
                    **kwargs
                )
            else:
                hidden_states, attentions, past_key_values, v_first = layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    past_key_values=past_key_values,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    v_first=v_first,
                    **kwargs
                )

            if output_attentions:
                all_attns += (attentions,)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if not return_dict:
            return tuple(i for i in [hidden_states, past_key_values, all_hidden_states, all_attns] if i is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attns
        )


class RWKV7ForCausalLM(RWKV7PreTrainedModel, GenerationMixin):

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = RWKV7Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.criterion = None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embeddings

    def set_input_embeddings(self, value):
        self.model.embeddings = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def generate(self, *args, **kwargs):
        try:
            return super().generate(*args, **kwargs)
        except AttributeError as exception:
            if 'past_key_values' in str(exception):
                raise AttributeError(
                    f"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`, "
                    f"which is not supported for {self.__class__.__name__}. "
                    f"Try another generation strategy instead. "
                    f"For the available generation strategies, check this doc: "
                    f"https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies"
                )
            else:
                raise exception

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: bool = True,
        logits_to_keep: Optional[int] = None,
        **kwargs
    ):
        # only last token for `input_ids` if the `past_key_values` is not empty.
        if past_key_values is not None and len(past_key_values) > 0:
            input_ids = input_ids[:, -1:]
        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and len(past_key_values) == 0:
            model_inputs = {'inputs_embeds': inputs_embeds}
        else:
            # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
            # recompiles graphs as the stride of the inputs is a guard.
            # Ref: https://github.com/huggingface/transformers/pull/29114
            # TODO: use `next_tokens` directly instead.
            model_inputs = {'input_ids': input_ids.contiguous()}

        if logits_to_keep is not None:
            model_inputs['logits_to_keep'] = logits_to_keep

        model_inputs.update({
            'past_key_values': past_key_values,
            'use_cache': use_cache,
            'attention_mask': attention_mask,
        })
        return model_inputs

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        labels: Optional[torch.LongTensor] = None,
        shift_labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Optional[int] = 0,
        **kwargs: Unpack[Dict]
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs
        )

        hidden_states = outputs[0]
        fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training

        loss, logits = None, None
        has_labels = (labels is not None) or (shift_labels is not None)
        if not (fuse_linear_and_cross_entropy and has_labels):
            logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:])
        if has_labels:
            if getattr(self, 'criterion', None) is None:
                if fuse_linear_and_cross_entropy:
                    criterion = FusedLinearCrossEntropyLoss()
                elif self.config.fuse_cross_entropy:
                    criterion = FusedCrossEntropyLoss(inplace_backward=True)
                else:
                    criterion = nn.CrossEntropyLoss()
            else:
                criterion = self.criterion

            # shift_labels: See https://github.com/huggingface/transformers/pull/36607/files.
            if shift_labels is None:
                shift_labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1)
            shift_labels = shift_labels.to(hidden_states.device)

            if fuse_linear_and_cross_entropy:
                loss = criterion(hidden_states, shift_labels, self.lm_head.weight, self.lm_head.bias)
            else:
                loss = criterion(logits.view(shift_labels.numel(), -1), shift_labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
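The `time_shift` module in `RWKV7FeedForward` is just `nn.ZeroPad2d((0, 0, 1, -1))`: it pads one step at the start of the time axis and drops the last step, so every token mixes with the previous token's features; during single-token decoding the same shift is reproduced from the cached `ffn_state`. A small standalone sketch in plain PyTorch (not part of the uploaded files):

import torch
import torch.nn as nn

time_shift = nn.ZeroPad2d((0, 0, 1, -1))   # pad 1 row at the top of the time axis, drop the last row

x = torch.arange(1, 7, dtype=torch.float32).view(1, 3, 2)  # (batch, seq_len, hidden)
print(x[0])
# tensor([[1., 2.],
#         [3., 4.],
#         [5., 6.]])
print(time_shift(x)[0])                    # token t now sees the features of token t-1
# tensor([[0., 0.],
#         [1., 2.],
#         [3., 4.]])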
fla/models/samba/__init__.py
ADDED
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-

from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

from fla.models.samba.configuration_samba import SambaConfig
from fla.models.samba.modeling_samba import SambaBlock, SambaForCausalLM, SambaModel

AutoConfig.register(SambaConfig.model_type, SambaConfig, True)
AutoModel.register(SambaConfig, SambaModel, True)
AutoModelForCausalLM.register(SambaConfig, SambaForCausalLM, True)


__all__ = ['SambaConfig', 'SambaForCausalLM', 'SambaModel', 'SambaBlock']
fla/models/samba/configuration_samba.py
ADDED
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-

import math
from typing import Dict, Optional

from transformers.configuration_utils import PretrainedConfig


class SambaConfig(PretrainedConfig):

    model_type = "samba"

    def __init__(
        self,
        hidden_size: int = 2304,
        state_size: int = 16,
        num_hidden_layers: int = 18,
        norm_eps: float = 1e-5,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        expand: int = 2,
        conv_kernel: int = 4,
        use_bias: bool = False,
        use_conv_bias: bool = True,
        hidden_act: str = "swish",
        initializer_range: float = 0.02,
        residual_in_fp32: bool = False,
        time_step_rank: str = "auto",
        time_step_scale: float = 1.0,
        time_step_min: float = 0.001,
        time_step_max: float = 0.1,
        time_step_init_scheme: str = "random",
        time_step_floor: float = 1e-4,
        max_position_embeddings: int = 2048,
        attn: Optional[Dict] = {
            'layers': (1, 3, 5, 7, 9, 11, 13, 15, 17),
            'num_heads': 18,
            'num_kv_heads': 18,
            'qkv_bias': False,
            'window_size': 2048,
            'rope_theta': 10000.
        },
        hidden_ratio: Optional[int] = 4,
        rescale_prenorm_residual: bool = False,
        use_cache: bool = True,
        fuse_norm: bool = True,
        fuse_swiglu: bool = True,
        fuse_cross_entropy: bool = True,
        vocab_size: int = 32000,
        tie_word_embeddings: bool = False,
        **kwargs,
    ):
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.num_hidden_layers = num_hidden_layers
        self.norm_eps = norm_eps
        self.conv_kernel = conv_kernel
        self.expand = expand
        self.intermediate_size = int(expand * self.hidden_size)
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.use_bias = use_bias
        self.use_conv_bias = use_conv_bias
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank
        self.time_step_scale = time_step_scale
        self.time_step_min = time_step_min
        self.time_step_max = time_step_max
        self.time_step_init_scheme = time_step_init_scheme
        self.time_step_floor = time_step_floor
        self.max_position_embeddings = max_position_embeddings
        self.attn = attn
        self.hidden_ratio = hidden_ratio
        self.rescale_prenorm_residual = rescale_prenorm_residual
        self.residual_in_fp32 = residual_in_fp32
        self.use_cache = use_cache

        self.fuse_norm = fuse_norm
        self.fuse_swiglu = fuse_swiglu
        self.fuse_cross_entropy = fuse_cross_entropy
        self.vocab_size = vocab_size

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs
        )
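With `time_step_rank='auto'` the config derives the dt-projection rank as `ceil(hidden_size / 16)`, the SSM inner width is `expand * hidden_size`, and the default `attn['layers']` puts an attention mixer on every odd-indexed layer of the 18-layer stack. A short sketch of the derived values (assuming the package is importable; not part of the uploaded files):

from fla.models.samba import SambaConfig

config = SambaConfig(hidden_size=2304, expand=2)
print(config.time_step_rank)      # ceil(2304 / 16) = 144
print(config.intermediate_size)   # 2 * 2304 = 4608
print(config.attn['layers'])      # (1, 3, 5, 7, 9, 11, 13, 15, 17)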
fla/models/samba/modeling_samba.py
ADDED
|
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import math
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.utils.checkpoint
|
| 11 |
+
from torch import nn
|
| 12 |
+
from transformers.generation import GenerationMixin
|
| 13 |
+
from transformers.modeling_utils import PreTrainedModel
|
| 14 |
+
from transformers.utils import ModelOutput, logging
|
| 15 |
+
from transformers.utils.deprecation import deprecate_kwarg
|
| 16 |
+
|
| 17 |
+
from fla.layers.attn import Attention
|
| 18 |
+
from fla.models.mamba.modeling_mamba import MambaCache, MambaMixer
|
| 19 |
+
from fla.models.samba.configuration_samba import SambaConfig
|
| 20 |
+
from fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss
|
| 21 |
+
from fla.modules import GatedMLP as SambaMLP
|
| 22 |
+
from fla.modules import RMSNorm
|
| 23 |
+
|
| 24 |
+
if TYPE_CHECKING:
|
| 25 |
+
from transformers.processing_utils import Unpack
|
| 26 |
+
|
| 27 |
+
logger = logging.get_logger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class SambaBlock(nn.Module):
|
| 31 |
+
def __init__(self, config, layer_idx):
|
| 32 |
+
super().__init__()
|
| 33 |
+
|
| 34 |
+
self.config = config
|
| 35 |
+
self.layer_idx = layer_idx
|
| 36 |
+
|
| 37 |
+
self.mixer_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps)
|
| 38 |
+
if config.attn is not None and layer_idx in config.attn['layers']:
|
| 39 |
+
self.mixer = Attention(
|
| 40 |
+
hidden_size=config.hidden_size,
|
| 41 |
+
num_heads=config.attn['num_heads'],
|
| 42 |
+
num_kv_heads=config.attn['num_kv_heads'],
|
| 43 |
+
qkv_bias=config.attn['qkv_bias'],
|
| 44 |
+
window_size=config.attn['window_size'],
|
| 45 |
+
rope_theta=config.attn['rope_theta'],
|
| 46 |
+
max_position_embeddings=config.max_position_embeddings,
|
| 47 |
+
layer_idx=layer_idx
|
| 48 |
+
)
|
| 49 |
+
else:
|
| 50 |
+
self.mixer = MambaMixer(config, layer_idx=layer_idx)
|
| 51 |
+
self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps)
|
| 52 |
+
self.mlp = SambaMLP(
|
| 53 |
+
hidden_size=config.hidden_size,
|
| 54 |
+
hidden_ratio=config.hidden_ratio,
|
| 55 |
+
hidden_act=config.hidden_act,
|
| 56 |
+
fuse_swiglu=config.fuse_swiglu
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
def forward(
|
| 60 |
+
self,
|
| 61 |
+
hidden_states: torch.Tensor,
|
| 62 |
+
cache_params: Optional[Tuple[torch.Tensor]] = None,
|
| 63 |
+
**kwargs: Unpack[Dict]
|
| 64 |
+
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
| 65 |
+
|
| 66 |
+
residual = hidden_states
|
| 67 |
+
hidden_states = self.mixer_norm(hidden_states)
|
| 68 |
+
if isinstance(self.mixer, MambaMixer):
|
| 69 |
+
hidden_states = self.mixer(hidden_states, cache_params=cache_params, **kwargs)
|
| 70 |
+
else:
|
| 71 |
+
hidden_states, _, cache_params = self.mixer(hidden_states=hidden_states, past_key_values=cache_params, **kwargs)
|
| 72 |
+
if self.config.fuse_norm:
|
| 73 |
+
hidden_states, residual = self.mlp_norm(hidden_states, residual, True)
|
| 74 |
+
else:
|
| 75 |
+
hidden_states = residual + hidden_states
|
| 76 |
+
residual = hidden_states
|
| 77 |
+
hidden_states = self.mlp_norm(hidden_states)
|
| 78 |
+
hidden_states = self.mlp(hidden_states, **kwargs)
|
| 79 |
+
hidden_states = residual + hidden_states
|
| 80 |
+
return hidden_states
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class SambaPreTrainedModel(PreTrainedModel):
|
| 84 |
+
"""
|
| 85 |
+
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
| 86 |
+
models.
|
| 87 |
+
"""
|
| 88 |
+
|
| 89 |
+
config_class = SambaConfig
|
| 90 |
+
base_model_prefix = "backbone"
|
| 91 |
+
_no_split_modules = ["SambaBlock"]
|
| 92 |
+
supports_gradient_checkpointing = True
|
| 93 |
+
|
| 94 |
+
def _init_weights(self, module):
|
| 95 |
+
"""Initialize the weights."""
|
| 96 |
+
if isinstance(module, nn.Linear):
|
| 97 |
+
nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
|
| 98 |
+
if module.bias is not None:
|
| 99 |
+
if not getattr(module.bias, "_no_reinit", False):
|
| 100 |
+
nn.init.zeros_(module.bias)
|
| 101 |
+
elif isinstance(module, MambaMixer):
|
| 102 |
+
module.A_log._no_weight_decay = True
|
| 103 |
+
module.D._no_weight_decay = True
|
| 104 |
+
|
| 105 |
+
dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
|
| 106 |
+
if self.config.time_step_init_scheme == "constant":
|
| 107 |
+
nn.init.constant_(module.dt_proj.weight, dt_init_std)
|
| 108 |
+
elif self.config.time_step_init_scheme == "random":
|
| 109 |
+
nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)
|
| 110 |
+
|
| 111 |
+
dt = torch.exp(
|
| 112 |
+
torch.rand(self.config.intermediate_size)
|
| 113 |
+
* (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
|
| 114 |
+
+ math.log(self.config.time_step_min)
|
| 115 |
+
).clamp(min=self.config.time_step_floor)
|
| 116 |
+
# # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
| 117 |
+
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
| 118 |
+
with torch.no_grad():
|
| 119 |
+
module.dt_proj.bias.data = nn.Parameter(inv_dt.to(module.dt_proj.bias.device))
|
| 120 |
+
module.dt_proj.bias._no_reinit = True
|
| 121 |
+
elif isinstance(module, nn.Embedding):
|
| 122 |
+
nn.init.normal_(module.weight, std=self.config.initializer_range)
|
| 123 |
+
elif hasattr(module, 'reset_parameters'):
|
| 124 |
+
module.reset_parameters()
|
| 125 |
+
|
| 126 |
+
if self.config.rescale_prenorm_residual:
|
| 127 |
+
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
| 128 |
+
# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
|
| 129 |
+
# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
|
| 130 |
+
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
| 131 |
+
#
|
| 132 |
+
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
| 133 |
+
for name, p in module.named_parameters():
|
| 134 |
+
if name in ["out_proj.weight"]:
|
| 135 |
+
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
| 136 |
+
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
| 137 |
+
# We need to reinit p since this code could be called multiple times
|
| 138 |
+
# Having just p *= scale would repeatedly scale it down
|
| 139 |
+
nn.init.kaiming_uniform_(p, a=math.sqrt(5))
|
| 140 |
+
with torch.no_grad():
|
| 141 |
+
p /= math.sqrt(self.config.num_layers)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@dataclass
|
| 145 |
+
class SambaOutput(ModelOutput):
|
| 146 |
+
"""
|
| 147 |
+
Class for the Samba model outputs.
|
| 148 |
+
|
| 149 |
+
Args:
|
| 150 |
+
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
| 151 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
| 152 |
+
cache_params (`MambaCache`):
|
| 153 |
+
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
|
| 154 |
+
avoid providing the old `input_ids`.
|
| 155 |
+
|
| 156 |
+
Includes both the State space model state matrices after the selective scan, and the Convolutional states
|
| 157 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*,
|
| 158 |
+
returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 159 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
| 160 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
| 161 |
+
|
| 162 |
+
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
| 163 |
+
"""
|
| 164 |
+
|
| 165 |
+
last_hidden_state: Optional[torch.FloatTensor] = None
|
| 166 |
+
cache_params: Optional[MambaCache] = None
|
| 167 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
@dataclass
|
| 171 |
+
class SambaCausalLMOutput(ModelOutput):
|
| 172 |
+
"""
|
| 173 |
+
Base class for causal language model (or autoregressive) outputs.
|
| 174 |
+
|
| 175 |
+
Args:
|
| 176 |
+
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
| 177 |
+
Language modeling loss (for next-token prediction).
|
| 178 |
+
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
|
| 179 |
+
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
| 180 |
+
cache_params (`MambaCache`):
|
| 181 |
+
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
|
| 182 |
+
avoid providing the old `input_ids`.
|
| 183 |
+
|
| 184 |
+
Includes both the State space model state matrices after the selective scan, and the Convolutional states
|
| 185 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*,
|
| 186 |
+
returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 187 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
| 188 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
| 189 |
+
|
| 190 |
+
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
| 191 |
+
"""
|
| 192 |
+
|
| 193 |
+
loss: Optional[torch.FloatTensor] = None
|
| 194 |
+
logits: Optional[torch.FloatTensor] = None
|
| 195 |
+
cache_params: Optional[MambaCache] = None
|
| 196 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
class SambaModel(SambaPreTrainedModel):
|
| 200 |
+
def __init__(self, config):
|
| 201 |
+
super().__init__(config)
|
| 202 |
+
|
| 203 |
+
self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
|
| 204 |
+
self.layers = nn.ModuleList([SambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)])
|
| 205 |
+
|
| 206 |
+
self.gradient_checkpointing = False
|
| 207 |
+
self.norm_f = RMSNorm(config.hidden_size, eps=config.norm_eps)
|
| 208 |
+
# Initialize weights and apply final processing
|
| 209 |
+
self.post_init()
|
| 210 |
+
|
| 211 |
+
def get_input_embeddings(self):
|
| 212 |
+
return self.embeddings
|
| 213 |
+
|
| 214 |
+
def set_input_embeddings(self, new_embeddings):
|
| 215 |
+
self.embeddings = new_embeddings
|
| 216 |
+
|
| 217 |
+
def forward(
|
| 218 |
+
self,
|
| 219 |
+
input_ids: Optional[torch.LongTensor] = None,
|
| 220 |
+
inputs_embeds: Optional[torch.LongTensor] = None,
|
| 221 |
+
cache_params: Optional[MambaCache] = None,
|
| 222 |
+
use_cache: Optional[bool] = None,
|
| 223 |
+
output_hidden_states: Optional[bool] = None,
|
| 224 |
+
return_dict: Optional[bool] = None,
|
| 225 |
+
**kwargs: Unpack[Dict]
|
| 226 |
+
) -> Union[Tuple, SambaOutput]:
|
| 227 |
+
output_hidden_states = (
|
| 228 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 229 |
+
)
|
| 230 |
+
use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
|
| 231 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 232 |
+
|
| 233 |
+
if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor
|
| 234 |
+
raise ValueError(
|
| 235 |
+
"You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
if inputs_embeds is None:
|
| 239 |
+
inputs_embeds = self.embeddings(input_ids)
|
| 240 |
+
|
| 241 |
+
if self.gradient_checkpointing and self.training and use_cache:
|
| 242 |
+
use_cache = False
|
| 243 |
+
|
| 244 |
+
if cache_params is None and use_cache:
|
| 245 |
+
cache_params = MambaCache(
|
| 246 |
+
self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype
|
| 247 |
+
)
|
| 248 |
+
|
| 249 |
+
hidden_states = inputs_embeds
|
| 250 |
+
all_hidden_states = () if output_hidden_states else None
|
| 251 |
+
for mixer_block in self.layers:
|
| 252 |
+
if self.gradient_checkpointing and self.training:
|
| 253 |
+
hidden_states = self._gradient_checkpointing_func(
|
| 254 |
+
mixer_block.__call__,
|
| 255 |
+
hidden_states,
|
| 256 |
+
cache_params,
|
| 257 |
+
**kwargs
|
| 258 |
+
)
|
| 259 |
+
else:
|
| 260 |
+
hidden_states = mixer_block(
|
| 261 |
+
hidden_states,
|
| 262 |
+
cache_params=cache_params,
|
| 263 |
+
**kwargs
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
if output_hidden_states:
|
| 267 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 268 |
+
|
| 269 |
+
if use_cache:
|
| 270 |
+
cache_params.seqlen_offset += inputs_embeds.shape[1]
|
| 271 |
+
|
| 272 |
+
hidden_states = self.norm_f(hidden_states)
|
| 273 |
+
|
| 274 |
+
if output_hidden_states:
|
| 275 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 276 |
+
|
| 277 |
+
if not return_dict:
|
| 278 |
+
return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None)
|
| 279 |
+
|
| 280 |
+
return SambaOutput(
|
| 281 |
+
last_hidden_state=hidden_states,
|
| 282 |
+
cache_params=cache_params if use_cache else None,
|
| 283 |
+
hidden_states=all_hidden_states,
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
class SambaForCausalLM(SambaPreTrainedModel, GenerationMixin):
|
| 288 |
+
|
| 289 |
+
_tied_weights_keys = ["lm_head.weight"]
|
| 290 |
+
|
| 291 |
+
def __init__(self, config):
|
| 292 |
+
super().__init__(config)
|
| 293 |
+
self.backbone = SambaModel(config)
|
| 294 |
+
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
| 295 |
+
self.criterion = None
|
| 296 |
+
|
| 297 |
+
# Initialize weights and apply final processing
|
| 298 |
+
self.post_init()
|
| 299 |
+
|
| 300 |
+
def get_output_embeddings(self):
|
| 301 |
+
return self.lm_head
|
| 302 |
+
|
| 303 |
+
def set_output_embeddings(self, new_embeddings):
|
| 304 |
+
self.lm_head = new_embeddings
|
| 305 |
+
|
| 306 |
+
def get_input_embeddings(self):
|
| 307 |
+
return self.backbone.get_input_embeddings()
|
| 308 |
+
|
| 309 |
+
def set_input_embeddings(self, new_embeddings):
        return self.backbone.set_input_embeddings(new_embeddings)

    def _update_model_kwargs_for_generation(
        self, outputs: ModelOutput, model_kwargs: Dict[str, Any], **kwargs
    ) -> Dict[str, Any]:
        model_kwargs["cache_params"] = outputs.get("cache_params", None)
        return model_kwargs

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    def prepare_inputs_for_generation(
        self,
        input_ids,
        cache_params: Optional[MambaCache] = None,
        inputs_embeds=None,
        attention_mask=None,
        use_cache: Optional[bool] = True,
        logits_to_keep: Optional[int] = None,
        **kwargs: Unpack[Dict]
    ):
        # only use the last token of input_ids if the state is passed along.
        if cache_params is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        if inputs_embeds is not None and cache_params is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        if logits_to_keep is not None:
            model_inputs['logits_to_keep'] = logits_to_keep

        model_inputs.update({
            'cache_params': cache_params,
            'use_cache': use_cache,
            'attention_mask': attention_mask,
            'logits_to_keep': logits_to_keep,
        })
        return model_inputs

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,  # noqa
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_params: Optional[MambaCache] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        logits_to_keep: Optional[int] = 0,
        **kwargs: Unpack[Dict]
    ) -> Union[Tuple, SambaCausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
            are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.backbone(
            input_ids,
            cache_params=cache_params,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            use_cache=use_cache,
            **kwargs
        )
        hidden_states = outputs[0]
        fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training

        loss, logits = None, None
        if not fuse_linear_and_cross_entropy or labels is None:
            logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:])
        if labels is not None:
            if getattr(self, 'criterion', None) is None:
                if fuse_linear_and_cross_entropy:
                    criterion = FusedLinearCrossEntropyLoss()
                elif self.config.fuse_cross_entropy:
                    criterion = FusedCrossEntropyLoss(inplace_backward=True)
                else:
                    criterion = nn.CrossEntropyLoss()
            else:
                criterion = self.criterion
            labels = labels.to(hidden_states.device)
            labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1)
            if fuse_linear_and_cross_entropy:
                loss = criterion(hidden_states, labels, self.lm_head.weight, self.lm_head.bias)
            else:
                loss = criterion(logits.view(labels.numel(), -1), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return SambaCausalLMOutput(
            loss=loss,
            logits=logits,
            cache_params=outputs.cache_params,
            hidden_states=outputs.hidden_states,
        )
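As a rough illustration of how the two generation hooks above fit together, the sketch below walks one greedy decoding step by hand. It is a minimal sketch, not the library's own `generate()` loop; the helper name `greedy_step` and the initial `model_kwargs` are assumptions. `prepare_inputs_for_generation` trims `input_ids` to the last token once `cache_params` exists, and `_update_model_kwargs_for_generation` carries `cache_params` across steps.

import torch

def greedy_step(model, input_ids, model_kwargs):
    # Build the per-step inputs (only the last token once a cache is present)
    model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs)
    outputs = model(**model_inputs)
    # Pick the argmax token from the last position's logits
    next_token = outputs.logits[:, -1].argmax(dim=-1, keepdim=True)
    # Carry the recurrent state forward for the next step
    model_kwargs = model._update_model_kwargs_for_generation(outputs, model_kwargs)
    return torch.cat([input_ids, next_token], dim=-1), model_kwargs

# Assumed starting kwargs for the first step (no cache yet):
# model_kwargs = {"cache_params": None, "use_cache": True, "attention_mask": None}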
fla/models/transformer_dsmtp/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (757 Bytes).

fla/models/transformer_dsmtp/__pycache__/configuration_transformer.cpython-312.pyc
ADDED
Binary file (2.61 kB).

fla/models/transformer_mtp/__pycache__/configuration_transformer.cpython-312.pyc
ADDED
Binary file (2.69 kB).

fla/models/transformer_top/__pycache__/configuration_transformer.cpython-312.pyc
ADDED
Binary file (2.8 kB).

fla/ops/common/__pycache__/chunk_delta_h.cpython-312.pyc
ADDED
Binary file (23.9 kB).

flame/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (152 Bytes).

flame/models/__init__.py
ADDED
File without changes
flame/models/parallelize_fla.py
ADDED
@@ -0,0 +1,550 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This file applies the PT-D parallelisms (except pipeline parallelism) and various
# training techniques (e.g. activation checkpointing and compile) to the Llama model.

from collections import defaultdict

import torch
import torch.nn as nn
from torch.distributed import DeviceMesh
from torch.distributed._composable.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy, fully_shard
from torch.distributed._composable.replicate import replicate
from torch.distributed._tensor import Replicate, Shard
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper as ptd_checkpoint_wrapper
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    PrepareModuleInput,
    PrepareModuleOutput,
    RowwiseParallel,
    SequenceParallel,
    parallelize_module
)

from fla.modules.fused_linear_cross_entropy import LinearLossParallel
from fla.modules.mlp import SwiGLULinearParallel
from fla.modules.parallel import PrepareModuleWeight
from torchtitan.config_manager import TORCH_DTYPE_MAP, JobConfig
from torchtitan.distributed.parallel_dims import ParallelDims
from torchtitan.tools.logging import logger


def parallelize_fla(
    model: nn.Module,
    world_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
):
    """
    Apply tensor parallelism, activation checkpointing, torch.compile, and data
    parallelism to the model.

    NOTE: The passed-in model preferably should be on meta device. Otherwise,
    the model must fit on GPU or CPU memory.
    """

    if parallel_dims.tp_enabled:
        if (
            job_config.experimental.enable_async_tensor_parallel
            and not job_config.training.compile
        ):
            raise RuntimeError("Async TP requires --training.compile")
        enable_float8_linear = "float8" in job_config.model.converters
        apply_tp(
            model,
            world_mesh["tp"],
            loss_parallel=parallel_dims.loss_parallel_enabled,
            enable_float8=enable_float8_linear,
            enable_async_tp=job_config.experimental.enable_async_tensor_parallel,
        )

    if job_config.activation_checkpoint.mode != "none":
        apply_ac(model, job_config.activation_checkpoint)

    # turn on per-block compile after AC wrapping and before FSDP
    if job_config.training.compile:
        apply_compile(model)

    if (
        parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
    ):  # apply FSDP or HSDP, potentially with Context Parallel
        if parallel_dims.dp_replicate_enabled:
            dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
        else:
            dp_mesh_dim_names = ("dp_shard_cp",)

        apply_fsdp(
            model,
            world_mesh[tuple(dp_mesh_dim_names)],
            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
            pp_enabled=parallel_dims.pp_enabled,
            cpu_offload=job_config.training.enable_cpu_offload,
            reshard_after_forward_policy=job_config.training.fsdp_reshard_after_forward,
        )

        if parallel_dims.dp_replicate_enabled:
            logger.info("Applied HSDP to the model")
        else:
            logger.info("Applied FSDP to the model")

        if parallel_dims.cp_enabled:
            logger.info("Applied Context Parallel to the model")

        if job_config.training.enable_cpu_offload:
            logger.info("Applied CPU Offloading to the model")
    elif parallel_dims.dp_replicate_enabled:
        if world_mesh.ndim > 1:
            raise RuntimeError("DDP has not supported > 1D parallelism")
        apply_ddp(
            model,
            world_mesh,
            enable_compile=job_config.training.compile,
            enable_compiled_autograd=job_config.experimental.enable_compiled_autograd,
        )


class TPPlan:
    def __init__(
        self,
        model=None,
        loss_parallel=False,
        enable_float8=False,
    ):
        self.model = model
        self.loss_parallel = loss_parallel
        self.enable_float8 = enable_float8
        self.base_model_prefix = getattr(model, "base_model_prefix", "model")

        # TODO(vkuzo): once float8 configuration supports delayed scaling,
        # add a check here to enforce supported float8 all-gather configurations
        # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there
        try:
            from torchao.float8.float8_tensor_parallel import (
                Float8ColwiseParallel,
                Float8RowwiseParallel,
                PrepareFloat8ModuleInput
            )
        except ImportError:
            Float8ColwiseParallel = None
            Float8RowwiseParallel = None
            PrepareFloat8ModuleInput = None
        if self.enable_float8 and Float8ColwiseParallel is not None:
            self.rowwise_parallel = Float8RowwiseParallel
            self.colwise_parallel = Float8ColwiseParallel
            self.prepare_module_input = PrepareFloat8ModuleInput
            self.prepare_module_output = PrepareModuleOutput
        else:
            self.rowwise_parallel = RowwiseParallel
            self.colwise_parallel = ColwiseParallel
            self.prepare_module_input = PrepareModuleInput
            self.prepare_module_output = PrepareModuleOutput

    @property
    def model_plan(self):
        plans = {
            f"{self.base_model_prefix}.embeddings": RowwiseParallel(
                input_layouts=Replicate(),
                output_layouts=Shard(1),
            ),
            f"{self.base_model_prefix}.norm": SequenceParallel(),
        }
        if self.loss_parallel:
            plans.update(
                {
                    "lm_head": ColwiseParallel(
                        input_layouts=Shard(1),
                        output_layouts=Shard(-1) if self.loss_parallel else Replicate(),
                        use_local_output=not self.loss_parallel,
                    ),
                }
            )
        else:
            plans.update(
                {
                    "lm_head": PrepareModuleWeight(layouts=Replicate()),
                    "criterion": LinearLossParallel(),
                }
            )
        return plans

    @property
    def layer_plan(self):
        return {
            "attn_norm": SequenceParallel(),
            **self.attn_plan,
            "mlp_norm": SequenceParallel(),
            **self.mlp_plan,
        }

    @property
    def attn_plan(self):
        raise NotImplementedError(
            f"TP plans for token mixing layers of {self.model.config.model_type} not implemented"
        )

    @property
    def mlp_plan(self):
        return {
            "mlp": self.prepare_module_input(
                input_layouts=(Shard(1),),
                desired_input_layouts=(Replicate(),),
            ),
            "mlp.gate_proj": self.colwise_parallel(),
            "mlp.up_proj": self.colwise_parallel(),
            "mlp.down_proj": self.rowwise_parallel(output_layouts=Shard(1)),
            "mlp.swiglu_linear": SwiGLULinearParallel(output_layouts=Shard(1)),
        }


class TransformerTPPlan(TPPlan):

    @property
    def attn_plan(self):
        return {
            "attn": self.prepare_module_input(
                input_kwarg_layouts={"hidden_states": Shard(1)},
                desired_input_kwarg_layouts={"hidden_states": Replicate()},
            ),
            "attn.q_proj": self.colwise_parallel(),
            "attn.k_proj": self.colwise_parallel(),
            "attn.v_proj": self.colwise_parallel(),
            "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)),
        }


class GLATPPlan(TPPlan):

    @property
    def attn_plan(self):
        return {
            "attn": self.prepare_module_input(
                input_kwarg_layouts={"hidden_states": Shard(1)},
                desired_input_kwarg_layouts={"hidden_states": Replicate()},
            ),
            "attn.q_proj": self.colwise_parallel(),
            "attn.k_proj": self.colwise_parallel(),
            "attn.v_proj": self.colwise_parallel(),
            "attn.g_proj": self.colwise_parallel(),
            "attn.gk_proj.0": PrepareModuleWeight(layouts=Replicate()),
            "attn.gk_proj.1": self.colwise_parallel(),
            "attn.g_norm": SequenceParallel(sequence_dim=-1),
            "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)),
        }


TP_PLAN_MAP = {"transformer": TransformerTPPlan, "gla": GLATPPlan}


def apply_tp(
    model: nn.Module,
    tp_mesh: DeviceMesh,
    loss_parallel: bool,
    enable_float8: bool,
    enable_async_tp: bool,
):
    """Apply tensor parallelism."""
    # 1. Parallelize the embedding and shard its outputs (which are the first
    #    transformer block's inputs)
    # 2. Parallelize the root norm layer over the sequence dim
    # 3. Parallelize the final linear output layer
    tp_plan = TP_PLAN_MAP[model.config.model_type](
        model, loss_parallel=loss_parallel, enable_float8=enable_float8
    )
    parallelize_module(model, tp_mesh, tp_plan.model_plan)

    blocks = get_blocks(model)
    if blocks is None:
        logger.warning("No block found for tensor parallelism")
    else:
        for _, block in enumerate(blocks):
            parallelize_module(
                module=block,
                device_mesh=tp_mesh,
                parallelize_plan=tp_plan.layer_plan,
            )

    if enable_async_tp:
        from torch.distributed._symmetric_memory import enable_symm_mem_for_group

        torch._inductor.config._micro_pipeline_tp = True
        enable_symm_mem_for_group(tp_mesh.get_group().group_name)

    logger.info(
        f"Applied {'Float8 ' if enable_float8 else ''}{'Async ' if enable_async_tp else ''}"
        "Tensor Parallelism to the model"
    )


# for selective op activation checkpointing
_save_list = {
    torch.ops.aten.mm.default,
    torch.ops.aten._scaled_dot_product_efficient_attention.default,
    torch.ops.aten._scaled_dot_product_flash_attention.default,
    torch.ops._c10d_functional.reduce_scatter_tensor.default,
    # for low precision training, it's useful to always save
    # the result of max, since the absolute maximum is
    # used to compute the scaling factor for quantization.
    torch.ops.aten.max.default,
}


def _apply_ac_to_block(module: nn.Module, ac_config):
    valid_ac_modes = ("full", "selective")
    if ac_config.mode not in valid_ac_modes:
        raise ValueError(
            f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}"
        )

    if ac_config.mode == "full":
        return ptd_checkpoint_wrapper(module, preserve_rng_state=False)

    assert ac_config.mode == "selective", f"{ac_config.mode}"
    use_op_sac = ac_config.selective_ac_option == "op"
    use_layer_sac = ac_config.selective_ac_option.isdigit()
    if not use_op_sac and not use_layer_sac:
        raise ValueError(
            f"Invalid selective AC option: {ac_config.selective_ac_option}. "
            f"Valid options: 'op' or a positive int representing layer frequency"
        )
    if use_op_sac:
        from torch.utils.checkpoint import CheckpointPolicy, create_selective_checkpoint_contexts

        def _get_custom_policy(meta):
            def _custom_policy(ctx, func, *args, **kwargs):
                mode = "recompute" if ctx.is_recompute else "forward"
                mm_count_key = f"{mode}_mm_count"
                if func == torch.ops.aten.mm.default:
                    meta[mm_count_key] += 1
                # Saves output of all compute ops, except every second mm
                to_save = func in _save_list and not (
                    func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
                )
                return (
                    CheckpointPolicy.MUST_SAVE
                    if to_save
                    else CheckpointPolicy.PREFER_RECOMPUTE
                )

            return _custom_policy

        def selective_checkpointing_context_fn():
            meta = defaultdict(int)
            return create_selective_checkpoint_contexts(_get_custom_policy(meta))

        return ptd_checkpoint_wrapper(
            module,
            context_fn=selective_checkpointing_context_fn,
            preserve_rng_state=False,
        )
    elif use_layer_sac:
        # Checkpoint every `ac_freq` of the modules passed to this function
        ac_freq = int(ac_config.selective_ac_option)
        ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0)
        ptd_checkpoint_wrapper._count += 1
        if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0:
            return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
        else:
            return module


def apply_ac(model: nn.Module, ac_config):
    """Apply activation checkpointing to the model."""
    blocks = get_blocks(model)
    if blocks is None:
        logger.warning("No block found for activation checkpointing")
        return

    for layer_id, block in blocks.named_children():
        block = _apply_ac_to_block(block, ac_config)
        blocks.register_module(layer_id, block)

    logger.info(f"Applied {ac_config.mode} activation checkpointing to the model")


def apply_compile(model: nn.Module):
    """
    Apply torch.compile to each block, which makes compilation efficient due to
    repeated structure. Alternatively one can compile the whole model (after applying DP).
    """

    blocks = get_blocks(model)
    if blocks is None:
        logger.warning("No block found for torch.compile")
    else:
        for layer_id, block in blocks.named_children():
            block = torch.compile(block)
            blocks.register_module(layer_id, block)
        logger.info("Compiling each block with torch.compile")

    real_model = get_model(model)

    logger.info("Compiling the embedding, norm, and lm_head layers with torch.compile")
    embeddings_key = get_components_name(real_model, "tok_embeddings")
    if embeddings_key is not None:
        embeddings = torch.compile(getattr(real_model, embeddings_key), fullgraph=True)
        real_model.register_module(embeddings_key, embeddings)

    norm_key = get_components_name(real_model, "norm")
    if norm_key is not None:
        norm = torch.compile(getattr(real_model, norm_key), fullgraph=True)
        real_model.register_module(norm_key, norm)

    lm_head_key = get_components_name(model, "lm_head")
    if lm_head_key is not None:
        lm_head = torch.compile(getattr(model, lm_head_key), fullgraph=True)
        model.register_module(lm_head_key, lm_head)

    logger.info("Compiling the entire model with torch.compile")
    model = torch.compile(model)


def apply_fsdp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    param_dtype: torch.dtype,
    reduce_dtype: torch.dtype,
    pp_enabled: bool,
    cpu_offload: bool = False,
    reshard_after_forward_policy: str = "default",
):
    """
    Apply data parallelism (via FSDP2) to the model.

    Args:
        model (nn.Module): The model to apply data parallelism to.
        dp_mesh (DeviceMesh): The device mesh to use for data parallelism.
        param_dtype (torch.dtype): The data type to use for model parameters.
        reduce_dtype (torch.dtype): The data type to use for reduction operations.
        pp_enabled (bool): Whether pipeline parallelism is enabled.
        cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False.
        reshard_after_forward_policy (str, optional):
            The policy to use for resharding after forward pass. Defaults to "default".
            Other options: "never", "always".
            - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios.
            - "always" will enable `reshard_after_forward` for all forward passes.
            - "never" will disable `reshard_after_forward` for all forward passes.
    """
    mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype)
    fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
    if cpu_offload:
        fsdp_config["offload_policy"] = CPUOffloadPolicy()

    blocks = get_blocks(model)
    if blocks is None:
        logger.warning("No block found for FSDP")
    else:
        total_blocks = len(blocks)
        for layer_id, block in enumerate(blocks):
            if reshard_after_forward_policy == "always":
                reshard_after_forward = True
            elif reshard_after_forward_policy == "never":
                reshard_after_forward = False
            elif reshard_after_forward_policy == "default":
                if pp_enabled:
                    # For PP, do not reshard after forward to avoid per-microbatch
                    # all-gathers, which can be expensive and non-overlapped
                    reshard_after_forward = False
                else:
                    # As an optimization, do not reshard after forward for the last
                    # transformer block since FSDP would prefetch it immediately
                    reshard_after_forward = int(layer_id) < total_blocks - 1
            else:
                raise ValueError(
                    f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}."
                )
            fully_shard(
                block,
                **fsdp_config,
                reshard_after_forward=reshard_after_forward,
            )

    fully_shard(model, **fsdp_config, reshard_after_forward=not pp_enabled)


def apply_ddp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    enable_compile: bool,
    enable_compiled_autograd: bool,
):
    if enable_compile:
        if enable_compiled_autograd:
            torch._dynamo.config.optimize_ddp = (
                "python_reducer_without_compiled_forward"
            )
        else:
            torch._dynamo.config.optimize_ddp = "ddp_optimizer"

    replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100)

    logger.info("Applied DDP to the model")


def get_model(model):
    base_model_prefix = getattr(model, "base_model_prefix", "model")
    if not hasattr(model, base_model_prefix):
        return None
    model = getattr(model, base_model_prefix)
    return model


def get_blocks(model):
    # TODO[flame]: adapt for network not using 'layers' attribute
    model = get_model(model)
    if not hasattr(model, "layers"):
        logger.warning('no "layers" in model can be found')
        return None
    return model.layers


def get_components_name(model, component_name):
    """
    We try to catch tok_embeddings, norm layers and lm_head layers.
    We do not catch the layer names in the blocks; for blocks see `get_blocks`.
    We assume the model has the following structure:
        LlamaForCausalLM:
            Model:
                embed_tokens,
                layers,
                norm,
            lm_head
    ***
    so, to search 'tok_embeddings' and 'norm' we need to pass `get_model(model)`
    and for 'lm_head' we need to pass `model`
    ***
    """

    if component_name == "tok_embeddings":
        if hasattr(model, "tok_embeddings"):
            return "tok_embeddings"
        elif hasattr(model, "embed_tokens"):
            return "embed_tokens"
        elif hasattr(model, "embeddings"):
            return "embeddings"
        else:
            logger.warning("No tok_embeddings found in model")
            return None

    elif component_name == "norm":
        if hasattr(model, "norm"):
            return "norm"
        elif hasattr(model, "norms"):
            return "norms"
        elif hasattr(model, "layernorm"):
            return "layernorm"
        else:
            logger.warning("No norm found in model")
            return None

    elif component_name == "lm_head":
        if hasattr(model, "lm_head"):
            return "lm_head"
        else:
            logger.warning("No lm_head found in model")
            return None
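To add tensor parallelism support for a new token-mixing layer, the intended extension point is a `TPPlan` subclass plus an entry in `TP_PLAN_MAP`. The sketch below is a hypothetical plan for a model whose attention module exposes only fused `qkv_proj` and `o_proj` projections; the module names and the `"my_fused_model"` key are assumptions for illustration, not part of this file.

# Hypothetical extension of the TP plans above; submodule names are assumptions.
class FusedQKVTPPlan(TPPlan):

    @property
    def attn_plan(self):
        return {
            # Redistribute the sequence-sharded hidden states to replicated before attention
            "attn": self.prepare_module_input(
                input_kwarg_layouts={"hidden_states": Shard(1)},
                desired_input_kwarg_layouts={"hidden_states": Replicate()},
            ),
            # Column-parallel fused QKV projection, row-parallel output projection
            "attn.qkv_proj": self.colwise_parallel(),
            "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)),
        }

# Register the hypothetical model type so apply_tp can look it up by config.model_type
TP_PLAN_MAP["my_fused_model"] = FusedQKVTPPlan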
flame/tools/__init__.py
ADDED
File without changes
flame/tools/utils.py
ADDED
@@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from torch import nn
from torchtitan.tools.logging import logger


def get_nparams_and_flops(model: nn.Module, model_config, seq_len: int) -> tuple[int, int]:
    nparams = sum(p.numel() for p in model.parameters())
    nparams_embedding = sum(
        sum(p.numel() for p in m.parameters())
        for m in model.children()
        if isinstance(m, nn.Embedding)
    )

    if hasattr(model_config, "num_heads"):
        num_heads = model_config.num_heads
    elif hasattr(model_config, "num_attention_heads"):
        num_heads = model_config.num_attention_heads
    else:
        num_heads = 1
        logger.warning("num_heads not found in model_config, defaulting to 1.")

    l, h, q, t = (
        model_config.num_hidden_layers,
        num_heads,
        model_config.hidden_size // num_heads,
        seq_len,
    )
    # Reasoning behind the factor of 12 for the self-attention part of the formula:
    # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
    # 2. the flash attention does 1 more matmul recomputation in the backward
    #    but recomputation should not be counted in calculating MFU (+0)
    # 3. each matmul performs 1 multiplication and 1 addition (*2)
    # 4. we follow the convention and do not account for sparsity in causal attention
    num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t

    return nparams, num_flops_per_token
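A short, hedged example of how the returned pair is commonly used to estimate MFU. The throughput and peak-FLOPS figures below are assumed constants for illustration; this module does not provide them.

# Hypothetical numbers: 989 TFLOPS is an assumed bf16 peak for the target GPU,
# and tokens_per_second is an assumed measured training throughput.
nparams, num_flops_per_token = get_nparams_and_flops(model, model_config, seq_len=4096)
tokens_per_second = 32_768
peak_flops_per_second = 989e12
mfu = num_flops_per_token * tokens_per_second / peak_flops_per_second
print(f"params: {nparams / 1e9:.2f}B, MFU: {mfu:.1%}")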
flame/utils/checkpoint.py
ADDED
@@ -0,0 +1,50 @@
import os
import glob
import re
import shutil
from torchtitan.tools.logging import logger


def cleanup_local_checkpoints(checkpoint_dir: str, keep_latest_k: int):
    """Removes older checkpoint directories locally, keeping only the latest k for both DCP and HF formats."""
    if keep_latest_k <= 0:
        return  # Keep all checkpoints

    logger.info(f"Cleaning up local checkpoints in {checkpoint_dir}, keeping latest {keep_latest_k}")

    # Cleanup DCP checkpoints (step-*)
    dcp_checkpoints = sorted(
        glob.glob(os.path.join(checkpoint_dir, "step-*")),
        key=lambda x: int(re.search(r"step-(\d+)", os.path.basename(x)).group(1)) if re.search(r"step-(\d+)", os.path.basename(x)) and not x.endswith("-hf") else -1,
        reverse=True
    )
    # Filter out HF format directories
    dcp_checkpoints = [d for d in dcp_checkpoints if not d.endswith("-hf")]

    if len(dcp_checkpoints) > keep_latest_k:
        checkpoints_to_delete = dcp_checkpoints[keep_latest_k:]
        logger.info(f"Deleting {len(checkpoints_to_delete)} old DCP checkpoints: {[os.path.basename(c) for c in checkpoints_to_delete]}")
        for ckpt_path in checkpoints_to_delete:
            if os.path.isdir(ckpt_path):  # Ensure it's a directory
                try:
                    shutil.rmtree(ckpt_path)
                except OSError as e:
                    logger.error(f"Error removing directory {ckpt_path}: {e}")

    # Cleanup HF checkpoints (step-*-hf)
    hf_checkpoints = sorted(
        glob.glob(os.path.join(checkpoint_dir, "step-*-hf")),
        key=lambda x: int(re.search(r"step-(\d+)-hf", os.path.basename(x)).group(1)) if re.search(r"step-(\d+)-hf", os.path.basename(x)) else -1,
        reverse=True
    )

    if len(hf_checkpoints) > keep_latest_k:
        checkpoints_to_delete = hf_checkpoints[keep_latest_k:]
        logger.info(f"Deleting {len(checkpoints_to_delete)} old HF checkpoints: {[os.path.basename(c) for c in checkpoints_to_delete]}")
        for ckpt_path in checkpoints_to_delete:
            if os.path.isdir(ckpt_path):  # Ensure it's a directory
                try:
                    shutil.rmtree(ckpt_path)
                except OSError as e:
                    logger.error(f"Error removing directory {ckpt_path}: {e}")
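An assumed usage sketch: calling the helper after each save keeps only the most recent `step-*` (DCP) and `step-*-hf` directories. The directory path is a placeholder; the layout is the one the glob patterns above expect.

# Assumed layout: exp/checkpoints/step-1000, exp/checkpoints/step-2000, exp/checkpoints/step-2000-hf, ...
cleanup_local_checkpoints("exp/checkpoints", keep_latest_k=2)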
flame/utils/convert_hf_to_dcp.py
ADDED
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

import argparse
from pathlib import Path

import torch
import torch.distributed.checkpoint as DCP
from transformers import AutoModelForCausalLM

import fla  # noqa
from torchtitan.tools.logging import init_logger, logger


@torch.inference_mode()
def convert_hf_weights(model: str, checkpoint: str):
    logger.info(f"Loading model from {model}")
    model = AutoModelForCausalLM.from_pretrained(model)
    state_dict = model.state_dict()

    logger.info(f"Writing to DCP at '{checkpoint}'")
    checkpoint.mkdir(parents=True, exist_ok=True)
    storage_writer = DCP.filesystem.FileSystemWriter(checkpoint, thread_count=8)
    DCP.save({"model": state_dict}, storage_writer=storage_writer)


if __name__ == "__main__":
    init_logger()
    parser = argparse.ArgumentParser(description="Convert huggingface-style model weights to DCP format.")
    parser.add_argument("--model", type=str, required=True)
    parser.add_argument("--checkpoint", type=Path, required=True)
    args = parser.parse_args()

    convert_hf_weights(args.model, args.checkpoint)
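A hedged programmatic equivalent of the CLI entry point above; the model id and output path are placeholders. Note that `checkpoint` must be a `Path`, since the function calls `mkdir` on it.

from pathlib import Path

# Placeholder model id and output directory; any HF-style causal LM registered via `fla` works.
convert_hf_weights("fla-hub/gla-1.3B-100B", Path("exp/checkpoint/step-0"))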
flame/utils/hf_utils.py
ADDED
@@ -0,0 +1,77 @@
import os
import re
from huggingface_hub import HfApi, HfFolder, logging as hf_logging, create_repo
from torchtitan.tools.logging import logger

def upload_checkpoint_to_hf(
    local_path: str,
    step: int,
    hf_repo_id_for_run: str,
    hf_keep_latest_k: int,
    upload_format: str
):
    """Uploads a checkpoint directory to HF Hub and manages retention."""
    if not os.path.isdir(local_path):
        logger.error(f"Local path for upload does not exist or is not a directory: {local_path}")
        return

    api = HfApi()
    token = HfFolder.get_token()
    if not token:
        logger.warning("Hugging Face Hub token not found. Skipping upload. Login via `huggingface-cli login` or set HF_TOKEN.")
        return

    # --- Ensure the specific repository for this run exists ---
    try:
        logger.info(f"Ensuring repository {hf_repo_id_for_run} exists...")
        # Use create_repo which handles creation only if it doesn't exist
        create_repo(repo_id=hf_repo_id_for_run, token=token, repo_type="model", exist_ok=True)
        logger.info(f"Repository {hf_repo_id_for_run} ensured.")
    except Exception as e:
        logger.error(f"Failed to create or ensure repository {hf_repo_id_for_run}: {e}", exc_info=True)
        return  # Stop if repo interaction fails

    commit_message = f"Upload {upload_format.upper()} checkpoint step {step}"
    path_in_repo = f"step-{step}"

    logger.info(f"Uploading {local_path} to {hf_repo_id_for_run}/{path_in_repo} on Hugging Face Hub...")
    try:
        api.upload_folder(
            folder_path=local_path,
            path_in_repo=path_in_repo,
            repo_id=hf_repo_id_for_run,
            repo_type="model",
            commit_message=commit_message,
            token=token,
        )
        logger.info(f"Successfully uploaded step {step} to {hf_repo_id_for_run}.")
    except Exception as e:
        logger.error(f"Failed to upload checkpoint step {step} to {hf_repo_id_for_run}: {e}", exc_info=True)
    if hf_keep_latest_k > 0:
        logger.info(f"Cleaning up old checkpoints on {hf_repo_id_for_run}, keeping latest {hf_keep_latest_k}")
        try:
            repo_files = api.list_repo_tree(hf_repo_id_for_run, repo_type="model", token=token, recursive=False)
            step_folders = [
                item.path for item in repo_files
                if item.path.startswith("step-") and item.path[5:].isdigit()
            ]

            step_folders.sort(key=lambda x: int(x.split('-')[1]), reverse=True)

            if len(step_folders) > hf_keep_latest_k:
                folders_to_delete = step_folders[hf_keep_latest_k:]
                logger.info(f"Found {len(step_folders)} checkpoints on Hub. Deleting {len(folders_to_delete)} older ones: {folders_to_delete}")
                for folder in folders_to_delete:
                    # Deleting requires repo_id, path_in_repo, and token
                    api.delete_folder(
                        repo_id=hf_repo_id_for_run,
                        path_in_repo=folder,
                        repo_type="model",
                        commit_message=f"Delete old checkpoint {folder}",
                        token=token
                    )
                logger.info("Hub cleanup complete.")
            else:
                logger.info("No old checkpoints found on Hub to delete.")
        except Exception as e:
            logger.error(f"Error during Hub checkpoint cleanup for {hf_repo_id_for_run}: {e}", exc_info=True)
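A minimal, assumed invocation of the uploader above; the repository id, local path, and step number are placeholders, and a cached Hub token (via `huggingface-cli login` or `HF_TOKEN`) is required.

# Placeholder repo id and checkpoint path; retention keeps the three newest step-* folders on the Hub.
upload_checkpoint_to_hf(
    local_path="exp/checkpoints/step-2000-hf",
    step=2000,
    hf_repo_id_for_run="your-username/flame-run-001",
    hf_keep_latest_k=3,
    upload_format="hf",
)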
torchtitan/components/__pycache__/dataloader.cpython-312.pyc
ADDED
Binary file (3.78 kB).

torchtitan/components/__pycache__/ft.cpython-312.pyc
ADDED
Binary file (6.75 kB).

torchtitan/components/__pycache__/loss.cpython-312.pyc
ADDED
Binary file (1.5 kB).

torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc
ADDED
Binary file (7.71 kB).

torchtitan/components/__pycache__/tokenizer.cpython-312.pyc
ADDED
Binary file (1.09 kB).

torchtitan/datasets/__pycache__/hf_datasets.cpython-312.pyc
ADDED
Binary file (7.03 kB).

torchtitan/datasets/tokenizer/__pycache__/tiktoken.cpython-312.pyc
ADDED
Binary file (7.73 kB).
torchtitan/datasets/tokenizer/tiktoken.py
ADDED
@@ -0,0 +1,190 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.

import os
from collections.abc import Collection, Iterator, Sequence, Set as AbstractSet
from pathlib import Path
from typing import cast, Literal

import tiktoken
from tiktoken.load import load_tiktoken_bpe

from torchtitan.components.tokenizer import Tokenizer
from torchtitan.config_manager import JobConfig
from torchtitan.tools.logging import logger


class TikTokenizer(Tokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.

    Args:
        model_path (str): The path to the Tiktoken model file.
    """

    special_tokens: dict[str, int]

    num_reserved_special_tokens = 256

    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"  # noqa: E501, B950

    def __init__(self, model_path: str):
        super().__init__()
        assert os.path.exists(
            model_path
        ), f"The tokenizer path does not exist: {model_path}"
        assert os.path.isfile(model_path), model_path

        mergeable_ranks = load_tiktoken_bpe(model_path)
        num_base_tokens = len(mergeable_ranks)
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [
            f"<|reserved_special_token_{i}|>"
            for i in range(5, self.num_reserved_special_tokens - 5)
        ]
        self.special_tokens = {
            token: num_base_tokens + i for i, token in enumerate(special_tokens)
        }
        self.model = tiktoken.Encoding(
            name=Path(model_path).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )

        self._n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens["<|begin_of_text|>"]
        self.eos_id: int = self.special_tokens["<|end_of_text|>"]
        self.pad_id: int = -1
        self.stop_tokens = {
            self.special_tokens["<|end_of_text|>"],
            self.special_tokens["<|eot_id|>"],
        }
        logger.info(
            f"TikTokenizer built: #words {self.n_words}, BOS ID {self.bos_id}, EOS ID {self.eos_id}"
        )

    def encode(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
        allowed_special: Literal["all"] | AbstractSet[str] | None = None,
        disallowed_special: Literal["all"] | Collection[str] | None = None,
    ) -> list[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            s (str): The input string to be encoded.
            bos (bool): Whether to prepend the beginning-of-sequence token.
            eos (bool): Whether to append the end-of-sequence token.
            allowed_special ("all"|set[str]): allowed special tokens in string
            disallowed_special ("all"|set[str]): special tokens that raise an error when in string

        Returns:
            list[int]: A list of token IDs.

        By default, setting disallowed_special=() encodes a string by ignoring
        special tokens. Specifically:
        - Setting `disallowed_special` to () will cause all text corresponding
          to special tokens to be encoded as natural text (instead of raising
          an error).
        - Setting `allowed_special` to "all" will treat all text corresponding
          to special tokens to be encoded as special tokens.
        """
        assert type(s) is str
        allowed_special = allowed_special or set()
        disallowed_special = disallowed_special or ()

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        substrs = (
            substr
            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )
        t: list[int] = []
        for substr in substrs:
            t.extend(
                self.model.encode(
                    substr,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )
        if bos:
            t.insert(0, self.bos_id)
        if eos:
            t.append(self.eos_id)
        return t

    def decode(self, t: Sequence[int]) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            t (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
        return self.model.decode(cast(list[int], t))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]


def build_tiktoken_tokenizer(job_config: JobConfig) -> TikTokenizer:
    return TikTokenizer(job_config.model.tokenizer_path)
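A small, hedged round-trip example for the tokenizer above; the tokenizer file path is a placeholder for a Llama-3-style tiktoken model file.

# Assumed path to a tiktoken model file shipped with the training assets.
tokenizer = TikTokenizer("assets/tokenizer/original/tokenizer.model")
ids = tokenizer.encode("hello world", bos=True, eos=True)
# decode renders special tokens as their literal strings, e.g. <|begin_of_text|> ... <|end_of_text|>
print(ids)
print(tokenizer.decode(ids))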
torchtitan/distributed/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (251 Bytes).

torchtitan/distributed/__pycache__/parallel_dims.cpython-312.pyc
ADDED
Binary file (6.11 kB).

torchtitan/distributed/__pycache__/pipeline.cpython-312.pyc
ADDED
Binary file (7.82 kB).