zaydzuhri committed
Commit 8cb4047 · verified · 1 parent: 683df89

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete changeset.
Files changed (50)
  1. fla/models/lightnet/__pycache__/modeling_lightnet.cpython-312.pyc +0 -0
  2. fla/models/retnet/__pycache__/configuration_retnet.cpython-312.pyc +0 -0
  3. flame/__pycache__/config_manager.cpython-312.pyc +0 -0
  4. flame/__pycache__/train.cpython-312.pyc +0 -0
  5. flame/components/__init__.py +0 -0
  6. flame/components/__pycache__/__init__.cpython-312.pyc +0 -0
  7. flame/components/__pycache__/checkpoint.cpython-312.pyc +0 -0
  8. flame/components/checkpoint.py +59 -0
  9. flame/models/__init__.py +0 -0
  10. flame/models/__pycache__/__init__.cpython-312.pyc +0 -0
  11. flame/models/__pycache__/parallelize_fla.cpython-312.pyc +0 -0
  12. flame/models/__pycache__/pipeline_fla.cpython-312.pyc +0 -0
  13. flame/models/activation_offloading.py +447 -0
  14. flame/models/fla.toml +67 -0
  15. flame/models/parallelize_fla.py +550 -0
  16. flame/models/pipeline_fla.py +162 -0
  17. flame/tools/__init__.py +0 -0
  18. flame/tools/__pycache__/__init__.cpython-312.pyc +0 -0
  19. flame/tools/__pycache__/utils.cpython-312.pyc +0 -0
  20. flame/tools/utils.py +41 -0
  21. flame/utils/__init__.py +0 -0
  22. flame/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  23. flame/utils/__pycache__/checkpoint.cpython-312.pyc +0 -0
  24. flame/utils/__pycache__/convert_dcp_to_hf.cpython-312.pyc +0 -0
  25. flame/utils/__pycache__/convert_hf_to_dcp.cpython-312.pyc +0 -0
  26. flame/utils/__pycache__/hf_utils.cpython-312.pyc +0 -0
  27. flame/utils/checkpoint.py +50 -0
  28. flame/utils/convert_dcp_to_hf.py +66 -0
  29. flame/utils/convert_hf_to_dcp.py +34 -0
  30. flame/utils/hf_utils.py +77 -0
  31. logs/none_ewbp5xc1/attempt_0/1/stderr.log +0 -0
  32. profile_trace/iteration_1024/rank0_trace.json +0 -0
  33. profile_trace/iteration_1024/rank1_trace.json +0 -0
  34. profile_trace/iteration_1024/rank5_trace.json +0 -0
  35. profile_trace/iteration_1024/rank6_trace.json +0 -0
  36. profile_trace/iteration_1024/rank7_trace.json +0 -0
  37. profile_trace/iteration_1536/rank4_trace.json +0 -0
  38. profile_trace/iteration_20992/rank5_trace.json +0 -0
  39. profile_trace/iteration_23552/rank6_trace.json +0 -0
  40. profile_trace/iteration_2560/rank5_trace.json +0 -0
  41. profile_trace/iteration_2560/rank7_trace.json +0 -0
  42. profile_trace/iteration_29696/rank2_trace.json +0 -0
  43. profile_trace/iteration_29696/rank6_trace.json +0 -0
  44. profile_trace/iteration_30720/rank6_trace.json +0 -0
  45. profile_trace/iteration_3584/rank0_trace.json +0 -0
  46. profile_trace/iteration_3584/rank4_trace.json +0 -0
  47. profile_trace/iteration_3584/rank5_trace.json +0 -0
  48. profile_trace/iteration_3584/rank7_trace.json +0 -0
  49. tb/20250901-0749/wandb/run-20250901_074914-top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/files/wandb-metadata.json +146 -0
  50. tb/20250901-0749/wandb/run-20250901_074914-top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/logs/debug-internal.log +17 -0
fla/models/lightnet/__pycache__/modeling_lightnet.cpython-312.pyc ADDED
Binary file (18.4 kB).
 
fla/models/retnet/__pycache__/configuration_retnet.cpython-312.pyc ADDED
Binary file (3.76 kB).
 
flame/__pycache__/config_manager.cpython-312.pyc ADDED
Binary file (36.9 kB).
 
flame/__pycache__/train.cpython-312.pyc ADDED
Binary file (38.1 kB).
 
flame/components/__init__.py ADDED
File without changes
flame/components/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (141 Bytes).
 
flame/components/__pycache__/checkpoint.cpython-312.pyc ADDED
Binary file (3.21 kB).
 
flame/components/checkpoint.py ADDED
@@ -0,0 +1,59 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field
from datetime import timedelta
from io import BytesIO
from typing import Any, Dict, List

import torch
from torch.distributed.checkpoint.stateful import Stateful


@dataclass
class TrainState(Stateful):
    step: int = 0
    skipped_step: int = 0
    token: int = 0
    elapsed: timedelta = timedelta(0)
    global_avg_losses: List[float] = field(default_factory=list)
    global_max_losses: List[float] = field(default_factory=list)
    log_steps: List[int] = field(default_factory=list)

    def state_dict(self) -> Dict[str, Any]:
        # Only checkpoint global_avg_losses and global_max_losses per log frequency
        # to avoid sync overhead in every iteration.
        global_avg_losses_bytes = BytesIO()
        torch.save(self.global_avg_losses, global_avg_losses_bytes)
        global_max_losses_bytes = BytesIO()
        torch.save(self.global_max_losses, global_max_losses_bytes)
        log_steps_bytes = BytesIO()
        torch.save(self.log_steps, log_steps_bytes)
        return {
            "step": torch.tensor(self.step, dtype=torch.int32),
            "skipped_step": torch.tensor(self.skipped_step, dtype=torch.int32),
            "token": torch.tensor(self.token, dtype=torch.int64),
            "elapsed": self.elapsed,
            "global_avg_losses": global_avg_losses_bytes,
            "global_max_losses": global_max_losses_bytes,
            "log_steps": log_steps_bytes,
        }

    def load_state_dict(self, state_dict) -> None:
        self.step = state_dict["step"].item()
        # default to a zero tensor so checkpoints saved before "skipped_step" existed
        # still load (a plain int default would have no .item())
        self.skipped_step = state_dict.get("skipped_step", torch.tensor(0)).item()
        self.token = state_dict["token"].item()
        self.elapsed = state_dict["elapsed"]
        state_dict["global_avg_losses"].seek(0)
        self.global_avg_losses = torch.load(
            state_dict["global_avg_losses"], weights_only=False
        )
        state_dict["global_max_losses"].seek(0)
        self.global_max_losses = torch.load(
            state_dict["global_max_losses"], weights_only=False
        )
        state_dict["log_steps"].seek(0)
        self.log_steps = torch.load(state_dict["log_steps"], weights_only=False)
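
For readers unfamiliar with torch.distributed.checkpoint's Stateful protocol, here is a minimal single-process sketch (not part of this commit) of how the TrainState above round-trips through its own state_dict/load_state_dict; the values are invented and no distributed checkpointer is involved:

# illustrative round-trip only; real runs hand this object to the checkpointer
state = TrainState(step=100, token=1_000_000, log_steps=[32, 64])
saved = state.state_dict()       # scalars as tensors, lists as BytesIO buffers
restored = TrainState()
restored.load_state_dict(saved)  # buffers are rewound and torch.load-ed back
assert restored.step == 100 and restored.log_steps == [32, 64]
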
flame/models/__init__.py ADDED
File without changes
flame/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (137 Bytes).
 
flame/models/__pycache__/parallelize_fla.cpython-312.pyc ADDED
Binary file (22.1 kB).
 
flame/models/__pycache__/pipeline_fla.cpython-312.pyc ADDED
Binary file (5.75 kB).
 
flame/models/activation_offloading.py ADDED
@@ -0,0 +1,447 @@
# Adapted from https://github.com/pytorch/torchtune/blob/main/torchtune/training/_activation_offloading.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import contextlib
from typing import Union
from warnings import warn

import psutil
import torch
from torch import nn
from torch.autograd.graph import saved_tensors_hooks

from torchtitan.tools.logging import logger

try:
    import torchao
    from torchao.dtypes.nf4tensor import NF4Tensor
except ImportError:
    torchao = None
    NF4Tensor = None
    logger.warning("torchao not found. ")

# from torchtune.modules import TiedLinear


class OffloadActivations(saved_tensors_hooks):
    """Context manager under which activation tensors created in the forward pass will be offloaded.

    Enable the memory efficiency technique of activation offloading, where activations bigger than
    min_offload_size bytes will be offloaded to CPU in the forward and brought back in the backward.
    This is in contrast to maintaining the activation on GPU VRAM throughout the program.

    This manager contains the option of using one additional CUDA stream to handle the communication
    between CUDA and CPU, which is intended to overlap with the default computation stream to improve
    runtime. We designed synchronization with a few heuristics for optimizing the tradeoff between
    runtime vs memory usage.

    Args:
        use_pin_memory (bool): Whether or not the offloaded Tensor will be placed in pinned
            memory on the CPU. Pinned memory allows the Tensor to be moved back onto GPU more quickly
            but is a limited resource. Default: True.

        use_streams (bool): Whether or not to use streams for performance optimization where
            the communications get overlapped with the computation. Requires a torch build
            after torch-2.5.0. Default: True.

        max_fwd_stash_size (int): The maximum size of the forward stash, or the maximum number of
            consecutive activations to keep alive during the forward pass. This number must be at
            least 1. Keeping alive more activations will potentially allow more overlap between the
            communication and compute streams at the cost of increasing memory usage. Keeping alive
            fewer activations will conserve memory, but may cause poor overlap between the streams,
            increasing runtime. Default: 5.

        min_offload_size (int): The minimum number of bytes a Tensor must be in order to qualify
            for offloading. If the tensor is too small, we do not want to waste bandwidth and resources
            moving it to CPU and back. Default: 1024 bytes.

    Raises:
        ValueError: if max_fwd_stash_size is not at least 1.

    Example:
        >>> with OffloadActivations():
        >>>     logits = model(inputs)
        >>>     loss = ...
        >>>     loss.backward()
    """

    def __init__(
        self,
        use_pin_memory: bool = True,
        use_streams: bool = True,
        max_fwd_stash_size: int = 5,
        min_offload_size: int = 1024,
    ) -> None:

        self.use_streams: bool = use_streams

        self.min_tensor_size_bytes = (
            min_offload_size  # we don't want to bother with small tensors
        )
        self.tracker = (
            {}
        )  # tensor_id => (new_tensor, if_modified) ---> track what saved/offloaded tensors are where
        self.tensor_id: int = 0
        self.is_first_forward_call = True
        self.is_first_backward_call = True
        self.is_first_forward_pass = True

        # managing cpu memory
        self.use_pin_memory: bool = use_pin_memory
        self.virtual_memory_safe_pct = (
            60  # we should not exceed this percentage of memory
        )

        self.s0 = torch.cuda.default_stream()  # comp stream

        # for streaming
        if self.use_streams:
            self.s1 = torch.cuda.Stream()  # comms stream
            self.fwd_stash = {}  # tensor_id => (activation, ev1)
            if max_fwd_stash_size < 1:
                raise ValueError(
                    f"max_fwd_stash_size should be at least 1 but is {max_fwd_stash_size}"
                )
            self.max_fwd_stash_size = max_fwd_stash_size
            self.bwd_tensor_stash = {}  # tensor_id => activation
            self.bwd_ev_stash = {}  # tensor_id => ev0
            self.curr_graph_id = None
            self.curr_autograd_node = None

        # -------- platform util functions -------- #
        def verify_sufficient_virtual_memory():
            curr_pct = get_cpu_ram_pct()
            if curr_pct > self.virtual_memory_safe_pct:
                warn(
                    f"***** WARNING: {curr_pct=}% > {self.virtual_memory_safe_pct=}% of virtual memory used"
                )

        def get_cpu_ram_pct() -> float:
            # get the percentage of memory used by the system
            return psutil.virtual_memory().percent

        def get_tensor_id() -> int:
            # create a unique id for each tensor we are managing
            self.tensor_id += 1
            return self.tensor_id

        def get_num_bytes_tensor(x: torch.Tensor) -> int:
            # get the number of bytes in a tensor, for memory management purposes
            return (
                x.element_size() * x.nelement()
            )  # x.element_size() * x._base_storage().nbytes()

        # -------- core pack / unpack work -------- #
        def pack_tensor(activation: torch.Tensor) -> int:
            # activations are passed in during forward pass - from here we take over and return a unique id
            if self.is_first_forward_call:
                assert (
                    len(self.tracker) == 0
                ), "backward pass should have cleared tracker of all tensors"

                # set training phase trackers
                self.is_first_forward_call = False
                self.is_first_backward_call = True

            # query for basic tensor info
            num_bytes = get_num_bytes_tensor(activation)
            tensor_id = get_tensor_id()

            # only offload hefty bois if they're activations on CUDA (our heuristic
            # for that is to check if they're not params or buffers)!
            if (
                activation.is_cuda
                and num_bytes >= self.min_tensor_size_bytes
                and (
                    not isinstance(activation, torch.nn.Parameter)
                    and not isinstance(activation, torch.nn.Buffer)
                )
            ):
                if self.use_streams:
                    # First, sync back and dereference previously offloaded tensors
                    # as the offloading should be done sufficiently long ago.
                    for id in [k for k in self.fwd_stash.keys()]:
                        if id <= tensor_id - self.max_fwd_stash_size:
                            _, ev = self.fwd_stash[id]
                            self.s0.wait_event(ev)
                            del self.fwd_stash[id]
                        else:
                            break

                    # Sync in, offload, and add an event to sync back later
                    self.s1.wait_stream(self.s0)

                stream = self.s1 if self.use_streams else self.s0
                with torch.cuda.stream(stream):
                    try:
                        cpu_tensor = torch.empty_like(
                            activation, pin_memory=self.use_pin_memory, device="cpu"
                        )
                    except NotImplementedError as e:
                        if (
                            isinstance(activation, NF4Tensor)
                            and torchao.__version__ < "0.6.0.dev20240917"
                        ):
                            raise RuntimeError(
                                "Offloading NF4Tensors requires torchao-0.6.0.dev20240917 or later"
                            ) from e
                        raise e
                    cpu_tensor.copy_(activation, non_blocking=True)
                    self.tracker[tensor_id] = (
                        cpu_tensor,
                        True,
                    )  # True = (in future) modified

                if self.use_streams:
                    event = self.s1.record_event()

                    # Stash to keep activation alive til s1 is done
                    self.fwd_stash[tensor_id] = (activation, event)
            else:
                self.tracker[tensor_id] = (
                    activation,
                    False,
                )  # False = not modified, tensor is as is

            return tensor_id

        def unpack_tensor_single_stream(unpack_tensor_id: int) -> torch.Tensor:
            # backward pass - we are called with the tensor_id, which
            # we will use to retrieve the saved/offloaded tensor
            if self.is_first_backward_call:
                if self.is_first_forward_pass:
                    self.is_first_forward_pass = False
                    if self.use_pin_memory:
                        verify_sufficient_virtual_memory()

                self.is_first_backward_call = False
                self.is_first_forward_call = True

            assert (
                unpack_tensor_id in self.tracker
            ), f"untracked tensor with id {unpack_tensor_id}"

            maybe_gpu_tensor, modified = self.tracker[unpack_tensor_id]
            if modified:
                gpu_tensor = maybe_gpu_tensor.to("cuda", non_blocking=True)
                maybe_gpu_tensor = gpu_tensor

            # clear tensor from tracking
            del self.tracker[unpack_tensor_id]
            return maybe_gpu_tensor

        def unpack_tensor_with_streams(unpack_tensor_id: int) -> torch.Tensor:
            # backward pass - we are called with the tensor_id, which
            # we will use to retrieve the saved/offloaded tensor
            if self.is_first_backward_call:
                self.curr_graph_id = torch._C._current_graph_task_id()

                def wait_and_del_remaining_references() -> None:
                    for id in [k for k in self.bwd_tensor_stash.keys()]:
                        event = self.bwd_ev_stash[id]
                        self.s1.wait_event(event)
                        del self.bwd_tensor_stash[id]

                # Register a callback to the end of autograd to clean everything up
                torch.autograd.variable.Variable._execution_engine.queue_callback(
                    wait_and_del_remaining_references
                )

                if self.is_first_forward_pass:
                    self.is_first_forward_pass = False
                    if self.use_pin_memory:
                        verify_sufficient_virtual_memory()

                self.is_first_backward_call = False
                self.is_first_forward_call = True

            assert (
                unpack_tensor_id in self.tracker
            ), f"untracked tensor with id {unpack_tensor_id}"

            maybe_gpu_tensor, modified = self.tracker[unpack_tensor_id]
            if modified:
                # Get data on the current autograd node
                graph_id = torch._C._current_graph_task_id()
                node = torch._C._current_autograd_node()
                prev_node_ids = []

                # If we're on a new node, mark prev node's tensors to be freed later
                if graph_id == self.curr_graph_id and self.curr_autograd_node != node:
                    self.curr_autograd_node = node
                    prev_node_ids = [id for id in self.bwd_tensor_stash.keys()]

                brought_back_from_cpu = True
                if unpack_tensor_id in self.fwd_stash:
                    maybe_gpu_tensor = self.fwd_stash[unpack_tensor_id][0]
                    brought_back_from_cpu = False
                else:
                    # Kick off the process to bring tensors back
                    with torch.cuda.stream(self.s1):
                        gpu_tensor = maybe_gpu_tensor.to("cuda", non_blocking=True)
                        maybe_gpu_tensor = gpu_tensor

                    # Tell comp stream to wait for the info to be loaded before executing
                    self.s0.wait_stream(self.s1)

                    # Stash the tensor to keep memory alive until compute stream is complete
                    self.bwd_tensor_stash[unpack_tensor_id] = maybe_gpu_tensor

                # Note: [Track views of the unpacked]
                # Why do we get the use count of the unpacked tensor here? We want an
                # initial count to compare to later, during the post-hook of the
                # backward node, when we need to decide whether we're allowed to free
                # the tensor yet. In what obscure cases must we delay freeing the
                # tensor (and thus call record_stream)?
                # 1. Any of the outputs of the backward node is a view of the unpacked
                #    tensor.
                # 2. In the case that this unpacked tensor will be used in a
                #    checkpointed region, if one of the recomputed saved tensors ends
                #    up as a view of the unpacked tensor.
                # 3. The user abuses the system somehow and manually relies on the
                #    unpacked tensor to exist after the backward node has executed.
                storage_refcount = torch._C._storage_Use_Count(
                    maybe_gpu_tensor.untyped_storage()._cdata
                )

                def hook(outputs, inputs):
                    # create events for the current node inputs/outputs if they were streamed in
                    if brought_back_from_cpu:
                        # See Note: [Track views of the unpacked]
                        # IF any of the outputs is a view of the tensor, OR if a view of
                        # the tensor has been saved as a part of checkpoint's recompute
                        # process, OR the user has abusedly incurred a reference on the
                        # unpacked tensor, THEN the tensor might be used later and we
                        # cannot presume to delete it after only the current node is
                        # done! So we use our frenemy, record_stream, to ensure the
                        # Tensor stays unmessed with until it's done getting used in the
                        # compute stream (s0 here). Note that the con here is we introduce
                        # non-deterministic (thus higher) memory usage, but this case
                        # should not happen often.
                        unpacked_tensor = self.bwd_tensor_stash[unpack_tensor_id]
                        if (
                            torch._C._storage_Use_Count(
                                unpacked_tensor.untyped_storage()._cdata
                            )
                            > storage_refcount
                        ):
                            unpacked_tensor.record_stream(self.s0)
                            del self.bwd_tensor_stash[unpack_tensor_id]
                        else:
                            event = self.s0.record_event()
                            self.bwd_ev_stash[unpack_tensor_id] = event

                    # if there are still things in the fwd_stash, get rid of them as we're in bwd now
                    for id in [k for k in self.fwd_stash.keys()]:
                        _, ev = self.fwd_stash[id]
                        self.s0.wait_event(ev)
                        del self.fwd_stash[id]

                    # wait on prev node's events and del those
                    for id in prev_node_ids:
                        event = self.bwd_ev_stash[id]
                        self.s1.wait_event(event)
                        del self.bwd_tensor_stash[id]

                    return outputs

                node.register_hook(hook)

            # clear tensor from tracking
            del self.tracker[unpack_tensor_id]
            return maybe_gpu_tensor

        unpack_tensor = (
            unpack_tensor_with_streams
            if self.use_streams
            else unpack_tensor_single_stream
        )
        super().__init__(pack_tensor, unpack_tensor)


class NoOpManager(saved_tensors_hooks):
    """
    A saved_tensors_hook manager used to disable any other saved_tensors_hook manager
    applied before. This relies on the behavior that only the most recently registered
    saved_tensors_hook will run.

    One example usage is to opt a local region of code out of activations offloading,
    which is usually applied globally to best track state.
    """

    def __init__(self) -> None:
        def noop(tensor):
            return tensor

        super().__init__(noop, noop)


def get_act_offloading_ctx_manager(
    model: nn.Module, enable_activation_offloading: bool
) -> Union[OffloadActivations, contextlib.nullcontext]:
    """Returns the activation offloading context manager for the model, which will be
    a null context if enable_activation_offloading is False.

    If activation offloading is enabled, we return the OffloadActivations context manager.
    If activation offloading is disabled, we return a NoOpManager context manager.

    Args:
        model (nn.Module): the model to wrap with the activation offloading context manager.
        enable_activation_offloading (bool): whether or not to enable activation offloading
            for the model.

    Returns:
        contextlib.ContextDecorator: the activation offloading context manager for the model.

    Raises:
        NotImplementedError: If the model is a multimodal model and activation offloading is enabled.
    """
    if enable_activation_offloading:
        activations_handling_ctx = OffloadActivations()

        # Below is our hack to disable offloading the last output Linear in every
        # step, as the cost for offloading the activation and then soon after bringing
        # it back is expensive. Moreover, due to heuristics in our streaming API,
        # we actually use more memory if we offload it as it interferes with chunkedCE.
        output_head_detected = False
        noop_ctx = NoOpManager()

        if hasattr(model, "output"):
            if isinstance(model.output, nn.Module):
                model.output.register_forward_pre_hook(
                    lambda *args: noop_ctx.__enter__()
                )
                model.output.register_forward_hook(
                    lambda *args: noop_ctx.__exit__(), always_call=True
                )
                print("registering hooks for model.output ============ ")
                output_head_detected = True
            # ================================
            # ! TODO[flame] check if we need to deal with TiedLinear
            # The following code appears in `torchtune`
            # elif isinstance(model.output, TiedLinear):
            #     model.output.linear.register_forward_pre_hook(
            #         lambda *args: noop_ctx.__enter__()
            #     )
            #     model.output.linear.register_forward_hook(
            #         lambda *args: noop_ctx.__exit__(), always_call=True
            #     )
            #     output_head_detected = True

        if not output_head_detected:
            logger.warning(
                "During activation offloading, no output head was detected. "
                "If your model has an output head, it will be offloaded. "
                "This usually greatly slows training, given the large vocabulary size. "
                "To change this behavior, set your output head as model.output and make it "
                "an nn.Module."
            )

    else:
        activations_handling_ctx = contextlib.nullcontext()

    return activations_handling_ctx
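
As a usage illustration only (not from this commit), a small sketch of wrapping a toy forward/backward in the context manager above; the module and sizes are invented and a CUDA device is required:

import torch
import torch.nn as nn

# toy CUDA model; activations larger than min_offload_size are parked on CPU
model = nn.Sequential(nn.Linear(4096, 4096), nn.GELU(), nn.Linear(4096, 8)).cuda()
x = torch.randn(64, 4096, device="cuda")
with OffloadActivations(use_streams=True, min_offload_size=1024):
    loss = model(x).sum()
    loss.backward()  # offloaded activations are streamed back during backward
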
flame/models/fla.toml ADDED
@@ -0,0 +1,67 @@
[model]
config = "fla-hub/transformer-1.3B-100B"
tokenizer_path = "fla-hub/transformer-1.3B-100B"

[job]
dump_folder = "exp"
print_args = true

[training]
batch_size = 32
seq_len = 2048
context_len = 2048
gradient_accumulation_steps = 1
steps = 20480
max_norm = 1.0
skip_nan_inf = true
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
tensor_parallel_degree = 1
compile = false
dataset = "HuggingFaceFW/fineweb-edu"
dataset_name = "default"
num_workers = 32
pin_memory = false
persistent_workers = false
prefetch_factor = 2
seed = 42
varlen = false

[optimizer]
name = "AdamW"
eps = 1e-15
lr = 3e-4

[lr_scheduler]
warmup_steps = 1024
decay_type = "cosine"
lr_min = 0.1

[checkpoint]
enable_checkpoint = true
folder = "checkpoint"
interval_type = "steps"
interval = 2048
model_weights_only = false
export_dtype = "float32"
async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

[profiling]
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 512

[metrics]
log_freq = 32
enable_wandb = true

[experimental]
context_parallel_degree = 1
pipeline_parallel_degree = 1

[float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false

[activation_checkpoint]
mode = "none"
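
A small sketch (not part of the commit) of reading this config with the standard-library tomllib, assuming the file path above; handy for double-checking derived quantities such as tokens per optimizer step:

import tomllib  # Python 3.11+

with open("flame/models/fla.toml", "rb") as f:
    cfg = tomllib.load(f)
tokens_per_step = cfg["training"]["batch_size"] * cfg["training"]["seq_len"]
print(cfg["model"]["config"], tokens_per_step)  # fla-hub/transformer-1.3B-100B 65536
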
flame/models/parallelize_fla.py ADDED
@@ -0,0 +1,550 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This file applies the PT-D parallelisms (except pipeline parallelism) and various
# training techniques (e.g. activation checkpointing and compile) to the Llama model.

from collections import defaultdict

import torch
import torch.nn as nn
from torch.distributed import DeviceMesh
from torch.distributed._composable.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy, fully_shard
from torch.distributed._composable.replicate import replicate
from torch.distributed._tensor import Replicate, Shard
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper as ptd_checkpoint_wrapper
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    PrepareModuleInput,
    PrepareModuleOutput,
    RowwiseParallel,
    SequenceParallel,
    parallelize_module
)

from fla.modules.fused_linear_cross_entropy import LinearLossParallel
from fla.modules.mlp import SwiGLULinearParallel
from fla.modules.parallel import PrepareModuleWeight
from torchtitan.config_manager import TORCH_DTYPE_MAP, JobConfig
from torchtitan.distributed.parallel_dims import ParallelDims
from torchtitan.tools.logging import logger


def parallelize_fla(
    model: nn.Module,
    world_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
):
    """
    Apply tensor parallelism, activation checkpointing, torch.compile, and data
    parallelism to the model.

    NOTE: The passed-in model preferably should be on meta device. Otherwise,
    the model must fit on GPU or CPU memory.
    """

    if parallel_dims.tp_enabled:
        if (
            job_config.experimental.enable_async_tensor_parallel
            and not job_config.training.compile
        ):
            raise RuntimeError("Async TP requires --training.compile")
        enable_float8_linear = "float8" in job_config.model.converters
        apply_tp(
            model,
            world_mesh["tp"],
            loss_parallel=parallel_dims.loss_parallel_enabled,
            enable_float8=enable_float8_linear,
            enable_async_tp=job_config.experimental.enable_async_tensor_parallel,
        )

    if job_config.activation_checkpoint.mode != "none":
        apply_ac(model, job_config.activation_checkpoint)

    # turn on per-block compile after AC wrapping and before FSDP
    if job_config.training.compile:
        apply_compile(model)

    if (
        parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
    ):  # apply FSDP or HSDP, potentially with Context Parallel
        if parallel_dims.dp_replicate_enabled:
            dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
        else:
            dp_mesh_dim_names = ("dp_shard_cp",)

        apply_fsdp(
            model,
            world_mesh[tuple(dp_mesh_dim_names)],
            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
            pp_enabled=parallel_dims.pp_enabled,
            cpu_offload=job_config.training.enable_cpu_offload,
            reshard_after_forward_policy=job_config.training.fsdp_reshard_after_forward,
        )

        if parallel_dims.dp_replicate_enabled:
            logger.info("Applied HSDP to the model")
        else:
            logger.info("Applied FSDP to the model")

        if parallel_dims.cp_enabled:
            logger.info("Applied Context Parallel to the model")

        if job_config.training.enable_cpu_offload:
            logger.info("Applied CPU Offloading to the model")
    elif parallel_dims.dp_replicate_enabled:
        if world_mesh.ndim > 1:
            raise RuntimeError("DDP has not supported > 1D parallelism")
        apply_ddp(
            model,
            world_mesh,
            enable_compile=job_config.training.compile,
            enable_compiled_autograd=job_config.experimental.enable_compiled_autograd,
        )


class TPPlan:
    def __init__(
        self,
        model=None,
        loss_parallel=False,
        enable_float8=False,
    ):
        self.model = model
        self.loss_parallel = loss_parallel
        self.enable_float8 = enable_float8
        self.base_model_prefix = getattr(model, "base_model_prefix", "model")

        # TODO(vkuzo): once float8 configuration supports delayed scaling,
        # add a check here to enforce supported float8 all-gather configurations
        # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there
        try:
            from torchao.float8.float8_tensor_parallel import (
                Float8ColwiseParallel,
                Float8RowwiseParallel,
                PrepareFloat8ModuleInput
            )
        except ImportError:
            Float8ColwiseParallel = None
            Float8RowwiseParallel = None
            PrepareFloat8ModuleInput = None
        if self.enable_float8 and Float8ColwiseParallel is not None:
            self.rowwise_parallel = Float8RowwiseParallel
            self.colwise_parallel = Float8ColwiseParallel
            self.prepare_module_input = PrepareFloat8ModuleInput
            self.prepare_module_output = PrepareModuleOutput
        else:
            self.rowwise_parallel = RowwiseParallel
            self.colwise_parallel = ColwiseParallel
            self.prepare_module_input = PrepareModuleInput
            self.prepare_module_output = PrepareModuleOutput

    @property
    def model_plan(self):
        plans = {
            f"{self.base_model_prefix}.embeddings": RowwiseParallel(
                input_layouts=Replicate(),
                output_layouts=Shard(1),
            ),
            f"{self.base_model_prefix}.norm": SequenceParallel(),
        }
        if self.loss_parallel:
            plans.update(
                {
                    "lm_head": ColwiseParallel(
                        input_layouts=Shard(1),
                        output_layouts=Shard(-1) if self.loss_parallel else Replicate(),
                        use_local_output=not self.loss_parallel,
                    ),
                }
            )
        else:
            plans.update(
                {
                    "lm_head": PrepareModuleWeight(layouts=Replicate()),
                    "criterion": LinearLossParallel(),
                }
            )
        return plans

    @property
    def layer_plan(self):
        return {
            "attn_norm": SequenceParallel(),
            **self.attn_plan,
            "mlp_norm": SequenceParallel(),
            **self.mlp_plan,
        }

    @property
    def attn_plan(self):
        raise NotImplementedError(
            f"TP plans for token mixing layers of {self.model.config.model_type} not implemented"
        )

    @property
    def mlp_plan(self):
        return {
            "mlp": self.prepare_module_input(
                input_layouts=(Shard(1),),
                desired_input_layouts=(Replicate(),),
            ),
            "mlp.gate_proj": self.colwise_parallel(),
            "mlp.up_proj": self.colwise_parallel(),
            "mlp.down_proj": self.rowwise_parallel(output_layouts=Shard(1)),
            "mlp.swiglu_linear": SwiGLULinearParallel(output_layouts=Shard(1)),
        }


class TransformerTPPlan(TPPlan):

    @property
    def attn_plan(self):
        return {
            "attn": self.prepare_module_input(
                input_kwarg_layouts={"hidden_states": Shard(1)},
                desired_input_kwarg_layouts={"hidden_states": Replicate()},
            ),
            "attn.q_proj": self.colwise_parallel(),
            "attn.k_proj": self.colwise_parallel(),
            "attn.v_proj": self.colwise_parallel(),
            "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)),
        }


class GLATPPlan(TPPlan):

    @property
    def attn_plan(self):
        return {
            "attn": self.prepare_module_input(
                input_kwarg_layouts={"hidden_states": Shard(1)},
                desired_input_kwarg_layouts={"hidden_states": Replicate()},
            ),
            "attn.q_proj": self.colwise_parallel(),
            "attn.k_proj": self.colwise_parallel(),
            "attn.v_proj": self.colwise_parallel(),
            "attn.g_proj": self.colwise_parallel(),
            "attn.gk_proj.0": PrepareModuleWeight(layouts=Replicate()),
            "attn.gk_proj.1": self.colwise_parallel(),
            "attn.g_norm": SequenceParallel(sequence_dim=-1),
            "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)),
        }


TP_PLAN_MAP = {"transformer": TransformerTPPlan, "gla": GLATPPlan}


def apply_tp(
    model: nn.Module,
    tp_mesh: DeviceMesh,
    loss_parallel: bool,
    enable_float8: bool,
    enable_async_tp: bool,
):
    """Apply tensor parallelism."""
    # 1. Parallelize the embedding and shard its outputs (which are the first
    # transformer block's inputs)
    # 2. Parallelize the root norm layer over the sequence dim
    # 3. Parallelize the final linear output layer
    tp_plan = TP_PLAN_MAP[model.config.model_type](
        model, loss_parallel=loss_parallel, enable_float8=enable_float8
    )
    parallelize_module(model, tp_mesh, tp_plan.model_plan)

    blocks = get_blocks(model)
    if blocks is None:
        logger.warning("No block found for tensor parallelism")
    else:
        for _, block in enumerate(blocks):
            parallelize_module(
                module=block,
                device_mesh=tp_mesh,
                parallelize_plan=tp_plan.layer_plan,
            )

    if enable_async_tp:
        from torch.distributed._symmetric_memory import enable_symm_mem_for_group

        torch._inductor.config._micro_pipeline_tp = True
        enable_symm_mem_for_group(tp_mesh.get_group().group_name)

    logger.info(
        f"Applied {'Float8 ' if enable_float8 else ''}{'Async ' if enable_async_tp else ''}"
        "Tensor Parallelism to the model"
    )


# for selective op activation checkpointing
_save_list = {
    torch.ops.aten.mm.default,
    torch.ops.aten._scaled_dot_product_efficient_attention.default,
    torch.ops.aten._scaled_dot_product_flash_attention.default,
    torch.ops._c10d_functional.reduce_scatter_tensor.default,
    # for low precision training, it's useful to always save
    # the result of max, since the absolute maximum is
    # used to compute the scaling factor for quantization.
    torch.ops.aten.max.default,
}


def _apply_ac_to_block(module: nn.Module, ac_config):
    valid_ac_modes = ("full", "selective")
    if ac_config.mode not in valid_ac_modes:
        raise ValueError(
            f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}"
        )

    if ac_config.mode == "full":
        return ptd_checkpoint_wrapper(module, preserve_rng_state=False)

    assert ac_config.mode == "selective", f"{ac_config.mode}"
    use_op_sac = ac_config.selective_ac_option == "op"
    use_layer_sac = ac_config.selective_ac_option.isdigit()
    if not use_op_sac and not use_layer_sac:
        raise ValueError(
            f"Invalid selective AC option: {ac_config.selective_ac_option}. "
            f"Valid options: 'op' or a positive int representing layer frequency"
        )
    if use_op_sac:
        from torch.utils.checkpoint import CheckpointPolicy, create_selective_checkpoint_contexts

        def _get_custom_policy(meta):
            def _custom_policy(ctx, func, *args, **kwargs):
                mode = "recompute" if ctx.is_recompute else "forward"
                mm_count_key = f"{mode}_mm_count"
                if func == torch.ops.aten.mm.default:
                    meta[mm_count_key] += 1
                # Saves output of all compute ops, except every second mm
                to_save = func in _save_list and not (
                    func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
                )
                return (
                    CheckpointPolicy.MUST_SAVE
                    if to_save
                    else CheckpointPolicy.PREFER_RECOMPUTE
                )

            return _custom_policy

        def selective_checkpointing_context_fn():
            meta = defaultdict(int)
            return create_selective_checkpoint_contexts(_get_custom_policy(meta))

        return ptd_checkpoint_wrapper(
            module,
            context_fn=selective_checkpointing_context_fn,
            preserve_rng_state=False,
        )
    elif use_layer_sac:
        # Checkpoint every `ac_freq` of the modules passed to this function
        ac_freq = int(ac_config.selective_ac_option)
        ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0)
        ptd_checkpoint_wrapper._count += 1
        if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0:
            return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
        else:
            return module


def apply_ac(model: nn.Module, ac_config):
    """Apply activation checkpointing to the model."""
    blocks = get_blocks(model)
    if blocks is None:
        logger.warning("No block found for activation checkpointing")
        return

    for layer_id, block in blocks.named_children():
        block = _apply_ac_to_block(block, ac_config)
        blocks.register_module(layer_id, block)

    logger.info(f"Applied {ac_config.mode} activation checkpointing to the model")


def apply_compile(model: nn.Module):
    """
    Apply torch.compile to each block, which makes compilation efficient due to
    repeated structure. Alternatively one can compile the whole model (after applying DP).
    """

    blocks = get_blocks(model)
    if blocks is None:
        logger.warning("No block found for torch.compile")
    else:
        for layer_id, block in blocks.named_children():
            block = torch.compile(block)
            blocks.register_module(layer_id, block)
        logger.info("Compiling each block with torch.compile")

    real_model = get_model(model)

    logger.info("Compiling the embedding, norm, and lm_head layers with torch.compile")
    embeddings_key = get_components_name(real_model, "tok_embeddings")
    if embeddings_key is not None:
        embeddings = torch.compile(getattr(real_model, embeddings_key), fullgraph=True)
        real_model.register_module(embeddings_key, embeddings)

    norm_key = get_components_name(real_model, "norm")
    if norm_key is not None:
        norm = torch.compile(getattr(real_model, norm_key), fullgraph=True)
        real_model.register_module(norm_key, norm)

    lm_head_key = get_components_name(model, "lm_head")
    if lm_head_key is not None:
        lm_head = torch.compile(getattr(model, lm_head_key), fullgraph=True)
        model.register_module(lm_head_key, lm_head)

    logger.info("Compiling the entire model with torch.compile")
    model = torch.compile(model)


def apply_fsdp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    param_dtype: torch.dtype,
    reduce_dtype: torch.dtype,
    pp_enabled: bool,
    cpu_offload: bool = False,
    reshard_after_forward_policy: str = "default",
):
    """
    Apply data parallelism (via FSDP2) to the model.

    Args:
        model (nn.Module): The model to apply data parallelism to.
        dp_mesh (DeviceMesh): The device mesh to use for data parallelism.
        param_dtype (torch.dtype): The data type to use for model parameters.
        reduce_dtype (torch.dtype): The data type to use for reduction operations.
        pp_enabled (bool): Whether pipeline parallelism is enabled.
        cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False.
        reshard_after_forward_policy (str, optional):
            The policy to use for resharding after forward pass. Defaults to "default".
            Other options: "never", "always".
            - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios.
            - "always" will enable `reshard_after_forward` for all forward passes.
            - "never" will disable `reshard_after_forward` for all forward passes.

    """
    mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype)
    fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
    if cpu_offload:
        fsdp_config["offload_policy"] = CPUOffloadPolicy()

    blocks = get_blocks(model)
    if blocks is None:
        logger.warning("No block found for FSDP")
    else:
        total_blocks = len(blocks)
        for layer_id, block in enumerate(blocks):
            if reshard_after_forward_policy == "always":
                reshard_after_forward = True
            elif reshard_after_forward_policy == "never":
                reshard_after_forward = False
            elif reshard_after_forward_policy == "default":
                if pp_enabled:
                    # For PP, do not reshard after forward to avoid per-microbatch
                    # all-gathers, which can be expensive and non-overlapped
                    reshard_after_forward = False
                else:
                    # As an optimization, do not reshard after forward for the last
                    # transformer block since FSDP would prefetch it immediately
                    reshard_after_forward = int(layer_id) < total_blocks - 1
            else:
                raise ValueError(
                    f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}."
                )
            fully_shard(
                block,
                **fsdp_config,
                reshard_after_forward=reshard_after_forward,
            )

    fully_shard(model, **fsdp_config, reshard_after_forward=not pp_enabled)


def apply_ddp(
    model: nn.Module,
    dp_mesh: DeviceMesh,
    enable_compile: bool,
    enable_compiled_autograd: bool,
):
    if enable_compile:
        if enable_compiled_autograd:
            torch._dynamo.config.optimize_ddp = (
                "python_reducer_without_compiled_forward"
            )
        else:
            torch._dynamo.config.optimize_ddp = "ddp_optimizer"

    replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100)

    logger.info("Applied DDP to the model")


def get_model(model):
    base_model_prefix = getattr(model, "base_model_prefix", "model")
    if not hasattr(model, base_model_prefix):
        return None
    model = getattr(model, base_model_prefix)
    return model


def get_blocks(model):
    # TODO[flame]: adapt for network not using 'layers' attribute
    model = get_model(model)
    if not hasattr(model, "layers"):
        logger.warning('no "layers" in model can be found')
        return None
    return model.layers


def get_components_name(model, component_name):
    """
    We try to catch tok_embeddings, norm layers and lm_head layers
    We do not catch the layer names in the blocks, for blocks see `get_blocks`
    We assume the model has the following structure:
        LlamaForCausalLM:
            Model:
                embed_tokens,
                layers,
                norm,
            lm_head
    ***
    so, to search 'tok_embeddings' and 'norm' we need to pass `get_model(model)`
    and for 'lm_head' we need to pass `model`
    ***
    """

    if component_name == "tok_embeddings":
        if hasattr(model, "tok_embeddings"):
            return "tok_embeddings"
        elif hasattr(model, "embed_tokens"):
            return "embed_tokens"
        elif hasattr(model, "embeddings"):
            return "embeddings"
        else:
            logger.warning("No tok_embeddings found in model")
            return None

    elif component_name == "norm":
        if hasattr(model, "norm"):
            return "norm"
        elif hasattr(model, "norms"):
            return "norms"
        elif hasattr(model, "layernorm"):
            return "layernorm"
        else:
            logger.warning("No norm found in model")
            return None

    elif component_name == "lm_head":
        if hasattr(model, "lm_head"):
            return "lm_head"
        else:
            logger.warning("No lm_head found in model")
            return None
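
To make the layer-frequency branch of _apply_ac_to_block above concrete, a dependency-free sketch (not from this commit) of which blocks end up wrapped when selective_ac_option = "2" over eight blocks; the block count is made up:

# mirrors the `_count % ac_freq == 0` rule used by the layer-frequency SAC branch
ac_freq = 2
count = 0
for layer_id in range(8):
    count += 1
    wrapped = (not ac_freq) or (count % ac_freq == 0)
    print(f"block {layer_id}: {'checkpointed' if wrapped else 'left as-is'}")
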
flame/models/pipeline_fla.py ADDED
@@ -0,0 +1,162 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This file applies the PT-D pipeline parallelism to the Llama model.

import copy
from typing import Callable, Optional, Union

import torch
import torch.nn as nn
from torch.distributed import DeviceMesh
from torch.distributed.pipelining import PipelineStage
from torch.distributed.pipelining.schedules import ScheduleZBVZeroBubble, _PipelineSchedule, get_schedule_class
from transformers import PretrainedConfig

from flame.models.parallelize_fla import get_blocks, get_components_name, get_model
from torchtitan.config_manager import JobConfig
from torchtitan.distributed.parallel_dims import ParallelDims
from torchtitan.distributed.pipeline import build_pipeline_schedule, generate_split_points, stage_ids_this_rank
from torchtitan.tools.logging import logger

DeviceType = Union[int, str, torch.device]


def pipeline_fla(
    model: nn.Module,
    pp_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
    device: DeviceType,
    model_config: PretrainedConfig,
    loss_fn: Callable[..., torch.Tensor],
) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]:
    stages, models = pipeline_fla_manual_split(
        model, pp_mesh, parallel_dims, job_config, device, model_config
    )

    pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn)

    # This is used in the train loop to determine whether to pass in the input_ids and labels
    has_first_stage = False
    has_last_stage = False
    for stage in stages:
        if stage.is_first:
            has_first_stage = True
        if stage.is_last:
            has_last_stage = True

    return pp_schedule, models, has_first_stage, has_last_stage


def pipeline_fla_manual_split(
    whole_model: nn.Module,
    pp_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
    device: DeviceType,
    model_config: PretrainedConfig,
) -> tuple[list[PipelineStage], list[nn.Module]]:
    """
    This API extracts one torch.nn.Module object for the part of the model configured to run inside this stage.

    It wraps the model chunk in a ManualPipelineStage object and returns both the stage and model objects.

    The stage object is used to create a pipeline schedule, and the model object can be used for applying SPMD
    parallelism.
    """
    pp_rank = pp_mesh.get_local_rank()
    pp_size = pp_mesh.size()

    splits = (
        job_config.experimental.pipeline_parallel_split_points
        or generate_split_points(
            job_config, parallel_dims.pp, model_config.num_hidden_layers
        )
    )

    def _build_stage(
        stage_idx: int,
        start_layer: Optional[str],
        stop_layer: Optional[str],
        is_first: bool = False,
        is_last: bool = False,
    ) -> tuple[PipelineStage, nn.Module]:
        model = copy.deepcopy(whole_model)
        if not is_first:
            # we do `model.tok_embeddings = None` here
            real_model = get_model(model)
            tok_embeddings_name = get_components_name(real_model, "tok_embeddings")
            setattr(real_model, tok_embeddings_name, None)

        drop_layers = start_layer is not None
        # Get module dictionary from get_blocks(model)
        # and create a list of keys before modifying the dictionary
        module_dict = get_blocks(model)._modules  # Store reference
        layer_names = list(module_dict.keys())

        # Iterate over the list of keys instead of `_modules.items()`
        for name in layer_names:
            # Dynamically determine prefix (blocks.* or layers.*)
            prefix = start_layer.split(".")[0] if start_layer else "layers"
            layer_name = f"{prefix}.{name}"  # Construct the correct name format

            # Ensure `drop_layers` activation is based on actual naming
            if layer_name == start_layer:
                drop_layers = False
            if layer_name == stop_layer:
                drop_layers = True

            # Delete layer if drop_layers is active
            if drop_layers:
                del module_dict[name]  # Safe deletion from stored dictionary

        if not is_last:
            # we do `model.norm = None` and `model.output = None`
            real_model = get_model(model)
            norm_name = get_components_name(real_model, "norm")
            setattr(real_model, norm_name, None)

            head_name = get_components_name(model, "lm_head")
            setattr(model, head_name, None)

        stage = PipelineStage(
            model,
            stage_idx,
            num_stages,
            device,
            group=pp_mesh.get_group("pp"),
        )
        return stage, model

    num_stages = len(splits) + 1
    stage_idx = pp_rank

    stages = []
    models = []

    schedule_class = get_schedule_class(
        job_config.experimental.pipeline_parallel_schedule
    )
    style = "v" if schedule_class == ScheduleZBVZeroBubble else "loop"

    for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style):
        start_layer = splits[stage_idx - 1] if stage_idx > 0 else None
        stop_layer = splits[stage_idx] if stage_idx < num_stages - 1 else None
        stage, model_chunk = _build_stage(
            stage_idx,
            start_layer,
            stop_layer,
            is_first=stage_idx == 0,
            is_last=stage_idx == num_stages - 1,
        )
        logger.info(
            f"PP rank {pp_rank} is building stage_idx {stage_idx}"
            f" with start_layer {start_layer}, stop_layer {stop_layer}"
        )
        stages.append(stage)
        models.append(model_chunk)
    return stages, models
flame/tools/__init__.py ADDED
File without changes
flame/tools/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (136 Bytes).
 
flame/tools/__pycache__/utils.cpython-312.pyc ADDED
Binary file (2.14 kB).
 
flame/tools/utils.py ADDED
@@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from torch import nn
from torchtitan.tools.logging import logger


def get_nparams_and_flops(model: nn.Module, model_config, seq_len: int) -> tuple[int, int]:
    nparams = sum(p.numel() for p in model.parameters())
    nparams_embedding = sum(
        sum(p.numel() for p in m.parameters())
        for m in model.children()
        if isinstance(m, nn.Embedding)
    )

    if hasattr(model_config, "num_heads"):
        num_heads = model_config.num_heads
    elif hasattr(model_config, "num_attention_heads"):
        num_heads = model_config.num_attention_heads
    else:
        num_heads = 1
        logger.warning("num_heads not found in model_config, defaulting to 1. ")

    l, h, q, t = (
        model_config.num_hidden_layers,
        num_heads,
        model_config.hidden_size // num_heads,
        seq_len,
    )
    # Reasoning behind the factor of 12 for the self-attention part of the formula:
    # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
    # 2. the flash attention does 1 more matmul recomputation in the backward
    #    but recomputation should not be counted in calculating MFU (+0)
    # 3. each matmul performs 1 multiplication and 1 addition (*2)
    # 4. we follow the convention and do not account for sparsity in causal attention
    num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t

    return nparams, num_flops_per_token
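
A quick back-of-the-envelope check of the formula above with invented 1.3B-class hyperparameters (none of these numbers are read from this repo's configs):

l, h, q, t = 24, 32, 64, 2048                    # layers, heads, head dim, seq len
nparams, nparams_embedding = 1_340_000_000, 65_000_000
num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t
print(f"{num_flops_per_token / 1e9:.2f} GFLOPs per token")
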
flame/utils/__init__.py ADDED
File without changes
flame/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (136 Bytes).
 
flame/utils/__pycache__/checkpoint.cpython-312.pyc ADDED
Binary file (4.07 kB).
 
flame/utils/__pycache__/convert_dcp_to_hf.cpython-312.pyc ADDED
Binary file (3.73 kB).
 
flame/utils/__pycache__/convert_hf_to_dcp.cpython-312.pyc ADDED
Binary file (1.92 kB).
 
flame/utils/__pycache__/hf_utils.cpython-312.pyc ADDED
Binary file (4.46 kB).
 
flame/utils/checkpoint.py ADDED
@@ -0,0 +1,50 @@
+import os
+import glob
+import re
+import shutil
+from torchtitan.tools.logging import logger
+
+
+def cleanup_local_checkpoints(checkpoint_dir: str, keep_latest_k: int):
+    """Removes older checkpoint directories locally, keeping only the latest k for both DCP and HF formats."""
+    if keep_latest_k <= 0:
+        return  # Keep all checkpoints
+
+    logger.info(f"Cleaning up local checkpoints in {checkpoint_dir}, keeping latest {keep_latest_k}")
+
+    # Cleanup DCP checkpoints (step-*)
+    dcp_checkpoints = sorted(
+        glob.glob(os.path.join(checkpoint_dir, "step-*")),
+        key=lambda x: int(re.search(r"step-(\d+)", os.path.basename(x)).group(1)) if re.search(r"step-(\d+)", os.path.basename(x)) and not x.endswith("-hf") else -1,
+        reverse=True
+    )
+    # Filter out HF format directories
+    dcp_checkpoints = [d for d in dcp_checkpoints if not d.endswith("-hf")]
+
+    if len(dcp_checkpoints) > keep_latest_k:
+        checkpoints_to_delete = dcp_checkpoints[keep_latest_k:]
+        logger.info(f"Deleting {len(checkpoints_to_delete)} old DCP checkpoints: {[os.path.basename(c) for c in checkpoints_to_delete]}")
+        for ckpt_path in checkpoints_to_delete:
+            if os.path.isdir(ckpt_path):  # Ensure it's a directory
+                try:
+                    shutil.rmtree(ckpt_path)
+                except OSError as e:
+                    logger.error(f"Error removing directory {ckpt_path}: {e}")
+
+
+    # Cleanup HF checkpoints (step-*-hf)
+    hf_checkpoints = sorted(
+        glob.glob(os.path.join(checkpoint_dir, "step-*-hf")),
+        key=lambda x: int(re.search(r"step-(\d+)-hf", os.path.basename(x)).group(1)) if re.search(r"step-(\d+)-hf", os.path.basename(x)) else -1,
+        reverse=True
+    )
+
+    if len(hf_checkpoints) > keep_latest_k:
+        checkpoints_to_delete = hf_checkpoints[keep_latest_k:]
+        logger.info(f"Deleting {len(checkpoints_to_delete)} old HF checkpoints: {[os.path.basename(c) for c in checkpoints_to_delete]}")
+        for ckpt_path in checkpoints_to_delete:
+            if os.path.isdir(ckpt_path):  # Ensure it's a directory
+                try:
+                    shutil.rmtree(ckpt_path)
+                except OSError as e:
+                    logger.error(f"Error removing directory {ckpt_path}: {e}")
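
The retention logic can be sanity-checked against a throwaway directory layout; a minimal sketch with fabricated step folders:

```python
# Illustrative only: fake checkpoint folders pruned by cleanup_local_checkpoints.
import os
import tempfile

from flame.utils.checkpoint import cleanup_local_checkpoints

with tempfile.TemporaryDirectory() as root:
    for step in (1000, 2000, 3000):
        os.makedirs(os.path.join(root, f"step-{step}"))      # DCP format
        os.makedirs(os.path.join(root, f"step-{step}-hf"))   # HF format

    # Keep only the newest checkpoint of each format; older ones are rmtree'd.
    cleanup_local_checkpoints(root, keep_latest_k=1)
    print(sorted(os.listdir(root)))  # expected: ['step-3000', 'step-3000-hf']
```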
flame/utils/convert_dcp_to_hf.py ADDED
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import argparse
+import io
+import os
+import tempfile
+from datetime import timedelta
+
+import torch
+import torch.serialization
+from torch.distributed.checkpoint.format_utils import dcp_to_torch_save
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+import fla  # noqa
+from torchtitan.tools.logging import init_logger, logger
+
+
+@torch.inference_mode()
+def save_pretrained(
+    path: str,
+    step: int,
+    config: str,
+    tokenizer: str
+):
+    logger.info(f"Loading the config from {config}")
+    config = AutoConfig.from_pretrained(config, trust_remote_code=True)
+
+    logger.info(f"Saving the config to {path}")
+    config.save_pretrained(path)
+    logger.info(f"Loading the tokenizer from {tokenizer}")
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer, trust_remote_code=True)
+    logger.info(f"Saving the tokenizer to {path}")
+    tokenizer.save_pretrained(path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # base_checkpoint_dir = os.path.dirname(path)
+        base_checkpoint_dir = path
+        checkpoint = os.path.join(base_checkpoint_dir, f'checkpoint/step-{step}')
+        checkpoint_path = os.path.join(tmpdir, 'checkpoint.pt')
+        logger.info(f"Saving the distributed checkpoint to {checkpoint_path}")
+        dcp_to_torch_save(checkpoint, checkpoint_path)
+
+        logger.info(f"Initializing the model from config\n{config}")
+        model = AutoModelForCausalLM.from_config(config)
+        logger.info(model)
+        logger.info("Loading state dict from the checkpoint")
+
+        # Add datetime.timedelta and io.BytesIO to safe globals
+        torch.serialization.add_safe_globals([timedelta, io.BytesIO])
+        # torch.load now with default weights_only=True will work
+        model.load_state_dict(torch.load(checkpoint_path, map_location='cpu')['model'])
+
+        logger.info(f"Saving the model to {path}")
+        model.save_pretrained(path)
+
+
+if __name__ == "__main__":
+    init_logger()
+    parser = argparse.ArgumentParser("Convert DCP format model weights to huggingface-style.")
+    parser.add_argument("--path", type=str, required=True)
+    parser.add_argument("--step", type=int, required=True)
+    parser.add_argument("--config", type=str, required=True)
+    parser.add_argument("--tokenizer", type=str, required=True)
+    args = parser.parse_args()
+    save_pretrained(args.path, args.step, args.config, args.tokenizer)
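
A hypothetical invocation, assuming a DCP checkpoint already exists under `<path>/checkpoint/step-<step>`; the paths below are placeholders, though the config and tokenizer ids mirror the training args used elsewhere in this run:

```python
# Placeholder paths: adjust to the actual dump folder and step.
from flame.utils.convert_dcp_to_hf import save_pretrained

save_pretrained(
    path="exp/my-run",                         # expects exp/my-run/checkpoint/step-40000 to exist
    step=40000,
    config="configs/top_transformer_1B.json",
    tokenizer="fla-hub/transformer-1.3B-100B",
)
```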
flame/utils/convert_hf_to_dcp.py ADDED
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import argparse
+from pathlib import Path
+
+import torch
+import torch.distributed.checkpoint as DCP
+from transformers import AutoModelForCausalLM
+
+import fla  # noqa
+from torchtitan.tools.logging import init_logger, logger
+
+
+@torch.inference_mode()
+def convert_hf_weights(model: str, checkpoint: Path):
+    logger.info(f"Loading model from {model}")
+    model = AutoModelForCausalLM.from_pretrained(model)
+    state_dict = model.state_dict()
+
+    logger.info(f"Writing to DCP at '{checkpoint}'")
+    checkpoint.mkdir(parents=True, exist_ok=True)
+    storage_writer = DCP.filesystem.FileSystemWriter(checkpoint, thread_count=8)
+    DCP.save({"model": state_dict}, storage_writer=storage_writer)
+
+
+if __name__ == "__main__":
+    init_logger()
+    parser = argparse.ArgumentParser(description="Convert huggingface-style model weights to DCP format.")
+    parser.add_argument("--model", type=str, required=True)
+    parser.add_argument("--checkpoint", type=Path, required=True)
+    args = parser.parse_args()
+
+    convert_hf_weights(args.model, args.checkpoint)
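
The reverse direction follows the same pattern; a hedged sketch with placeholder paths:

```python
# Placeholder model id and output path.
from pathlib import Path

from flame.utils.convert_hf_to_dcp import convert_hf_weights

convert_hf_weights(
    model="fla-hub/transformer-1.3B-100B",            # any HF-style causal LM id or local directory
    checkpoint=Path("exp/my-run/checkpoint/step-0"),  # DCP shards are written here
)
```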
flame/utils/hf_utils.py ADDED
@@ -0,0 +1,77 @@
+import os
+import re
+from huggingface_hub import HfApi, HfFolder, logging as hf_logging, create_repo
+from torchtitan.tools.logging import logger
+
+def upload_checkpoint_to_hf(
+    local_path: str,
+    step: int,
+    hf_repo_id_for_run: str,
+    hf_keep_latest_k: int,
+    upload_format: str
+):
+    """Uploads a checkpoint directory to HF Hub and manages retention."""
+    if not os.path.isdir(local_path):
+        logger.error(f"Local path for upload does not exist or is not a directory: {local_path}")
+        return
+
+    api = HfApi()
+    token = HfFolder.get_token()
+    if not token:
+        logger.warning("Hugging Face Hub token not found. Skipping upload. Login via `huggingface-cli login` or set HF_TOKEN.")
+        return
+
+    # --- Ensure the specific repository for this run exists ---
+    try:
+        logger.info(f"Ensuring repository {hf_repo_id_for_run} exists...")
+        # Use create_repo which handles creation only if it doesn't exist
+        create_repo(repo_id=hf_repo_id_for_run, token=token, repo_type="model", exist_ok=True)
+        logger.info(f"Repository {hf_repo_id_for_run} ensured.")
+    except Exception as e:
+        logger.error(f"Failed to create or ensure repository {hf_repo_id_for_run}: {e}", exc_info=True)
+        return  # Stop if repo interaction fails
+
+    commit_message = f"Upload {upload_format.upper()} checkpoint step {step}"
+    path_in_repo = f"step-{step}"
+
+    logger.info(f"Uploading {local_path} to {hf_repo_id_for_run}/{path_in_repo} on Hugging Face Hub...")
+    try:
+        api.upload_folder(
+            folder_path=local_path,
+            path_in_repo=path_in_repo,
+            repo_id=hf_repo_id_for_run,
+            repo_type="model",
+            commit_message=commit_message,
+            token=token,
+        )
+        logger.info(f"Successfully uploaded step {step} to {hf_repo_id_for_run}.")
+    except Exception as e:
+        logger.error(f"Failed to upload checkpoint step {step} to {hf_repo_id_for_run}: {e}", exc_info=True)
+    if hf_keep_latest_k > 0:
+        logger.info(f"Cleaning up old checkpoints on {hf_repo_id_for_run}, keeping latest {hf_keep_latest_k}")
+        try:
+            repo_files = api.list_repo_tree(hf_repo_id_for_run, repo_type="model", token=token, recursive=False)
+            step_folders = [
+                item.path for item in repo_files
+                if item.path.startswith("step-") and item.path[5:].isdigit()
+            ]
+
+            step_folders.sort(key=lambda x: int(x.split('-')[1]), reverse=True)
+
+            if len(step_folders) > hf_keep_latest_k:
+                folders_to_delete = step_folders[hf_keep_latest_k:]
+                logger.info(f"Found {len(step_folders)} checkpoints on Hub. Deleting {len(folders_to_delete)} older ones: {folders_to_delete}")
+                for folder in folders_to_delete:
+                    # Deleting requires repo_id, path_in_repo, and token
+                    api.delete_folder(
+                        repo_id=hf_repo_id_for_run,
+                        path_in_repo=folder,
+                        repo_type="model",
+                        commit_message=f"Delete old checkpoint {folder}",
+                        token=token
+                    )
+                logger.info("Hub cleanup complete.")
+            else:
+                logger.info("No old checkpoints found on Hub to delete.")
+        except Exception as e:
+            logger.error(f"Error during Hub checkpoint cleanup for {hf_repo_id_for_run}: {e}", exc_info=True)
logs/none_ewbp5xc1/attempt_0/1/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_1024/rank0_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_1024/rank1_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_1024/rank5_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_1024/rank6_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_1024/rank7_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_1536/rank4_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_20992/rank5_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_23552/rank6_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_2560/rank5_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_2560/rank7_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_29696/rank2_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_29696/rank6_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_30720/rank6_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_3584/rank0_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_3584/rank4_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_3584/rank5_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
profile_trace/iteration_3584/rank7_trace.json ADDED
The diff for this file is too large to render. See raw diff
 
tb/20250901-0749/wandb/run-20250901_074914-top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/files/wandb-metadata.json ADDED
@@ -0,0 +1,146 @@
+{
+    "os": "Linux-6.8.0-62-generic-x86_64-with-glibc2.39",
+    "python": "CPython 3.12.11",
+    "startedAt": "2025-09-01T07:49:14.031224Z",
+    "args": [
+        "--job.config_file",
+        "flame/models/fla.toml",
+        "--job.dump_folder",
+        "exp/top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine",
+        "--model.config",
+        "configs/top_transformer_1B.json",
+        "--model.tokenizer_path",
+        "fla-hub/transformer-1.3B-100B",
+        "--optimizer.name",
+        "AdamW",
+        "--optimizer.eps",
+        "1e-15",
+        "--optimizer.lr",
+        "5e-5",
+        "--lr_scheduler.warmup_steps",
+        "400",
+        "--lr_scheduler.lr_min",
+        "0.1",
+        "--lr_scheduler.decay_type",
+        "cosine",
+        "--training.batch_size",
+        "16",
+        "--training.seq_len",
+        "4096",
+        "--training.context_len",
+        "4096",
+        "--training.gradient_accumulation_steps",
+        "1",
+        "--training.steps",
+        "40000",
+        "--training.max_norm",
+        "1.0",
+        "--training.skip_nan_inf",
+        "--training.dataset",
+        "/home/cvm/.cache/zaydzuhri___stack-edu-python/default",
+        "--training.dataset_split",
+        "train",
+        "--training.num_workers",
+        "32",
+        "--training.prefetch_factor",
+        "2",
+        "--training.seed",
+        "79",
+        "--training.compile",
+        "--checkpoint.interval",
+        "5000",
+        "--checkpoint.load_step",
+        "-1",
+        "--metrics.log_freq",
+        "5",
+        "--checkpoint.hf_upload_enabled",
+        "--checkpoint.hf_repo_base_name",
+        "zaydzuhri/top-code-1B-4096-batch16x1-steps40000",
+        "--comm.init_timeout_seconds",
+        "1600",
+        "--comm.train_timeout_seconds",
+        "1600"
+    ],
+    "program": "-m flame.train",
+    "git": {
+        "remote": "https://github.com/zaydzuhri/flame.git",
+        "commit": "aa4d5932e54fad8a568e10aa6895e69e0664fcf1"
+    },
+    "email": "zaydzuhri@gmail.com",
+    "root": "exp/top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine/tb/20250901-0749",
+    "host": "cvm-gncv9hlh",
+    "executable": "/home/cvm/miniconda3/envs/flame-env/bin/python3.12",
+    "cpu_count": 64,
+    "cpu_count_logical": 128,
+    "gpu": "NVIDIA H200",
+    "gpu_count": 8,
+    "disk": {
+        "/": {
+            "total": "3242363822080",
+            "used": "1307996758016"
+        }
+    },
+    "memory": {
+        "total": "1913833021440"
+    },
+    "gpu_nvidia": [
+        {
+            "name": "NVIDIA H200",
+            "memoryTotal": "150754820096",
+            "cudaCores": 16896,
+            "architecture": "Hopper",
+            "uuid": "GPU-eddf9f4c-ffde-5f10-3c76-12ebce1f042b"
+        },
+        {
+            "name": "NVIDIA H200",
+            "memoryTotal": "150754820096",
+            "cudaCores": 16896,
+            "architecture": "Hopper",
+            "uuid": "GPU-b532c850-7343-8f67-7eb1-a69024695a99"
+        },
+        {
+            "name": "NVIDIA H200",
+            "memoryTotal": "150754820096",
+            "cudaCores": 16896,
+            "architecture": "Hopper",
+            "uuid": "GPU-751a6bdf-72f3-4f5a-fefd-d2b98c338579"
+        },
+        {
+            "name": "NVIDIA H200",
+            "memoryTotal": "150754820096",
+            "cudaCores": 16896,
+            "architecture": "Hopper",
+            "uuid": "GPU-0cd9d3c7-1d2e-1925-91eb-8ec99a4ed277"
+        },
+        {
+            "name": "NVIDIA H200",
+            "memoryTotal": "150754820096",
+            "cudaCores": 16896,
+            "architecture": "Hopper",
+            "uuid": "GPU-fba7e7ab-8340-13b0-b893-c3686cfec728"
+        },
+        {
+            "name": "NVIDIA H200",
+            "memoryTotal": "150754820096",
+            "cudaCores": 16896,
+            "architecture": "Hopper",
+            "uuid": "GPU-12ca11c0-9080-3877-2bd5-3775573a4134"
+        },
+        {
+            "name": "NVIDIA H200",
+            "memoryTotal": "150754820096",
+            "cudaCores": 16896,
+            "architecture": "Hopper",
+            "uuid": "GPU-32b3ec8b-9dc8-c6f6-5c19-74fa2ce10ffd"
+        },
+        {
+            "name": "NVIDIA H200",
+            "memoryTotal": "150754820096",
+            "cudaCores": 16896,
+            "architecture": "Hopper",
+            "uuid": "GPU-d0021141-e4f4-14ab-c2ab-0ef3e30d6dd5"
+        }
+    ],
+    "cudaVersion": "12.8",
+    "writerId": "da7dvih583ith342zcw0cwucsgured2u"
+}
tb/20250901-0749/wandb/run-20250901_074914-top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/logs/debug-internal.log ADDED
@@ -0,0 +1,17 @@
+{"time":"2025-09-01T07:49:14.247972294Z","level":"INFO","msg":"stream: starting","core version":"0.21.1"}
+{"time":"2025-09-01T07:49:14.545288881Z","level":"INFO","msg":"stream: created new stream","id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}
+{"time":"2025-09-01T07:49:14.545362953Z","level":"INFO","msg":"stream: started","id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}
+{"time":"2025-09-01T07:49:14.54541562Z","level":"INFO","msg":"writer: started","stream_id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}
+{"time":"2025-09-01T07:49:14.545435817Z","level":"INFO","msg":"sender: started","stream_id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}
+{"time":"2025-09-01T07:49:14.545490133Z","level":"INFO","msg":"handler: started","stream_id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}
+{"time":"2025-09-01T12:39:44.49607374Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
+{"time":"2025-09-01T12:57:09.402167829Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
+{"time":"2025-09-01T20:38:44.471380019Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
+{"time":"2025-09-01T22:25:18.669785309Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
+{"time":"2025-09-01T22:55:35.532603708Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
+{"time":"2025-09-02T07:07:34.089412209Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-09-02T07:07:34.291787824Z","level":"INFO","msg":"handler: operation stats","stats":{}}
+{"time":"2025-09-02T07:07:34.295689194Z","level":"INFO","msg":"stream: closing","id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}
+{"time":"2025-09-02T07:07:34.295726455Z","level":"INFO","msg":"handler: closed","stream_id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}
+{"time":"2025-09-02T07:07:34.295770415Z","level":"INFO","msg":"sender: closed","stream_id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}
+{"time":"2025-09-02T07:07:34.29578361Z","level":"INFO","msg":"stream: closed","id":"top_transformer-top.code.1B.batch16.seqlen4096.context4096.warmup400.update1.steps40000.lr5e-5.cosine-202509010747"}