zaydzuhri commited on
Commit
86c6113
·
verified ·
1 Parent(s): 0fa019d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. flame/__pycache__/__init__.cpython-312.pyc +0 -0
  2. flame/__pycache__/config_manager.cpython-312.pyc +0 -0
  3. flame/components/__pycache__/checkpoint.cpython-312.pyc +0 -0
  4. flame/components/checkpoint.py +59 -0
  5. flame/models/__init__.py +0 -0
  6. flame/models/__pycache__/__init__.cpython-312.pyc +0 -0
  7. flame/models/fla.toml +67 -0
  8. flame/models/parallelize_fla.py +550 -0
  9. flame/models/pipeline_fla.py +162 -0
  10. flame/tools/__pycache__/utils.cpython-312.pyc +0 -0
  11. flame/tools/utils.py +41 -0
  12. flame/utils/__init__.py +0 -0
  13. flame/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  14. flame/utils/__pycache__/convert_dcp_to_hf.cpython-312.pyc +0 -0
  15. flame/utils/convert_dcp_to_hf.py +66 -0
  16. flame/utils/convert_hf_to_dcp.py +34 -0
  17. flame/utils/hf_utils.py +77 -0
  18. logs/none_g37i6vbo/attempt_0/6/stderr.log +0 -0
  19. logs/none_lyv0rec_/attempt_0/0/stdout.log +33 -0
  20. logs/none_lyv0rec_/attempt_0/7/stderr.log +0 -0
  21. logs/none_lyv0rec_/attempt_0/7/stdout.log +0 -0
  22. tb/20250909-0619/wandb/debug.log +21 -0
  23. tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/output.log +0 -0
  24. tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/requirements.txt +207 -0
  25. tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log +10 -0
  26. tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log +21 -0
  27. torchtitan/components/__pycache__/dataloader.cpython-312.pyc +0 -0
  28. torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc +0 -0
  29. torchtitan/components/__pycache__/metrics.cpython-312.pyc +0 -0
  30. torchtitan/components/__pycache__/tokenizer.cpython-312.pyc +0 -0
  31. torchtitan/components/metrics.py +435 -0
  32. torchtitan/experiments/deepseek_v3/LICENSE-CODE +21 -0
  33. torchtitan/experiments/deepseek_v3/README.md +40 -0
  34. torchtitan/experiments/deepseek_v3/checkpoint.py +154 -0
  35. torchtitan/experiments/deepseek_v3/download.py +70 -0
  36. torchtitan/experiments/deepseek_v3/model.py +1325 -0
  37. torchtitan/experiments/deepseek_v3/requirements.txt +5 -0
  38. torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py +63 -0
  39. torchtitan/experiments/flux/README.md +23 -0
  40. torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc +0 -0
  41. torchtitan/experiments/flux/dataset/flux_dataset.py +267 -0
  42. torchtitan/experiments/flux/dataset/tokenizer.py +64 -0
  43. torchtitan/experiments/flux/model/__pycache__/layers.cpython-312.pyc +0 -0
  44. torchtitan/experiments/flux/model/hf_embedder.py +40 -0
  45. torchtitan/experiments/flux/model/model.py +177 -0
  46. torchtitan/experiments/flux/tests/test_flux_dataloader.py +103 -0
  47. torchtitan/experiments/flux/tests/test_generate_image.py +252 -0
  48. torchtitan/experiments/flux/train_configs/debug_model.toml +68 -0
  49. torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py +630 -0
  50. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py +82 -0
flame/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (156 Bytes). View file
 
flame/__pycache__/config_manager.cpython-312.pyc ADDED
Binary file (36.9 kB). View file
 
flame/components/__pycache__/checkpoint.cpython-312.pyc ADDED
Binary file (3.21 kB). View file
 
flame/components/checkpoint.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from dataclasses import dataclass, field
8
+ from datetime import timedelta
9
+ from io import BytesIO
10
+ from typing import Any, Dict, List
11
+
12
+ import torch
13
+ from torch.distributed.checkpoint.stateful import Stateful
14
+
15
+
16
+ @dataclass
17
+ class TrainState(Stateful):
18
+ step: int = 0
19
+ skipped_step: int = 0
20
+ token: int = 0
21
+ elapsed: timedelta = timedelta(0)
22
+ global_avg_losses: List[float] = field(default_factory=list)
23
+ global_max_losses: List[float] = field(default_factory=list)
24
+ log_steps: List[int] = field(default_factory=list)
25
+
26
+ def state_dict(self) -> Dict[str, Any]:
27
+ # Only checkpoint global_avg_losses and global_max_losses per log frequency
28
+ # to avoid sync overhead in every iteration.
29
+ global_avg_losses_bytes = BytesIO()
30
+ torch.save(self.global_avg_losses, global_avg_losses_bytes)
31
+ global_max_losses_bytes = BytesIO()
32
+ torch.save(self.global_max_losses, global_max_losses_bytes)
33
+ log_steps_bytes = BytesIO()
34
+ torch.save(self.log_steps, log_steps_bytes)
35
+ return {
36
+ "step": torch.tensor(self.step, dtype=torch.int32),
37
+ "skipped_step": torch.tensor(self.skipped_step, dtype=torch.int32),
38
+ "token": torch.tensor(self.token, dtype=torch.int64),
39
+ "elapsed": self.elapsed,
40
+ "global_avg_losses": global_avg_losses_bytes,
41
+ "global_max_losses": global_max_losses_bytes,
42
+ "log_steps": log_steps_bytes,
43
+ }
44
+
45
+ def load_state_dict(self, state_dict) -> None:
46
+ self.step = state_dict["step"].item()
47
+ self.skipped_step = state_dict.get("skipped_step", 0).item()
48
+ self.token = state_dict["token"].item()
49
+ self.elapsed = state_dict["elapsed"]
50
+ state_dict["global_avg_losses"].seek(0)
51
+ self.global_avg_losses = torch.load(
52
+ state_dict["global_avg_losses"], weights_only=False
53
+ )
54
+ state_dict["global_max_losses"].seek(0)
55
+ self.global_max_losses = torch.load(
56
+ state_dict["global_max_losses"], weights_only=False
57
+ )
58
+ state_dict["log_steps"].seek(0)
59
+ self.log_steps = torch.load(state_dict["log_steps"], weights_only=False)
flame/models/__init__.py ADDED
File without changes
flame/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (137 Bytes). View file
 
flame/models/fla.toml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [model]
2
+ config = "fla-hub/transformer-1.3B-100B"
3
+ tokenizer_path = "fla-hub/transformer-1.3B-100B"
4
+
5
+ [job]
6
+ dump_folder = "exp"
7
+ print_args = true
8
+
9
+ [training]
10
+ batch_size = 32
11
+ seq_len = 2048
12
+ context_len = 2048
13
+ gradient_accumulation_steps = 1
14
+ steps = 20480
15
+ max_norm = 1.0
16
+ skip_nan_inf = true
17
+ data_parallel_replicate_degree = 1
18
+ data_parallel_shard_degree = -1
19
+ tensor_parallel_degree = 1
20
+ compile = false
21
+ dataset = "HuggingFaceFW/fineweb-edu"
22
+ dataset_name = "default"
23
+ num_workers = 32
24
+ pin_memory = false
25
+ persistent_workers = false
26
+ prefetch_factor = 2
27
+ seed = 42
28
+ varlen = false
29
+
30
+ [optimizer]
31
+ name = "AdamW"
32
+ eps = 1e-15
33
+ lr = 3e-4
34
+
35
+ [lr_scheduler]
36
+ warmup_steps = 1024
37
+ decay_type = "cosine"
38
+ lr_min = 0.1
39
+
40
+ [checkpoint]
41
+ enable_checkpoint = true
42
+ folder = "checkpoint"
43
+ interval_type = "steps"
44
+ interval = 2048
45
+ model_weights_only = false
46
+ export_dtype = "float32"
47
+ async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
48
+
49
+ [profiling]
50
+ enable_profiling = true
51
+ save_traces_folder = "profile_trace"
52
+ profile_freq = 512
53
+
54
+ [metrics]
55
+ log_freq = 32
56
+ enable_wandb = true
57
+
58
+ [experimental]
59
+ context_parallel_degree = 1
60
+ pipeline_parallel_degree = 1
61
+
62
+ [float8]
63
+ enable_fsdp_float8_all_gather = false
64
+ precompute_float8_dynamic_scale_for_fsdp = false
65
+
66
+ [activation_checkpoint]
67
+ mode = "none"
flame/models/parallelize_fla.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This file applies the PT-D parallelisms (except pipeline parallelism) and various
8
+ # training techniques (e.g. activation checkpointing and compile) to the Llama model.
9
+
10
+ from collections import defaultdict
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ from torch.distributed import DeviceMesh
15
+ from torch.distributed._composable.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy, fully_shard
16
+ from torch.distributed._composable.replicate import replicate
17
+ from torch.distributed._tensor import Replicate, Shard
18
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper as ptd_checkpoint_wrapper
19
+ from torch.distributed.tensor.parallel import (
20
+ ColwiseParallel,
21
+ PrepareModuleInput,
22
+ PrepareModuleOutput,
23
+ RowwiseParallel,
24
+ SequenceParallel,
25
+ parallelize_module
26
+ )
27
+
28
+ from fla.modules.fused_linear_cross_entropy import LinearLossParallel
29
+ from fla.modules.mlp import SwiGLULinearParallel
30
+ from fla.modules.parallel import PrepareModuleWeight
31
+ from torchtitan.config_manager import TORCH_DTYPE_MAP, JobConfig
32
+ from torchtitan.distributed.parallel_dims import ParallelDims
33
+ from torchtitan.tools.logging import logger
34
+
35
+
36
+ def parallelize_fla(
37
+ model: nn.Module,
38
+ world_mesh: DeviceMesh,
39
+ parallel_dims: ParallelDims,
40
+ job_config: JobConfig,
41
+ ):
42
+ """
43
+ Apply tensor parallelism, activation checkpointing, torch.compile, and data
44
+ parallelism to the model.
45
+
46
+ NOTE: The passed-in model preferably should be on meta device. Otherwise,
47
+ the model must fit on GPU or CPU memory.
48
+ """
49
+
50
+ if parallel_dims.tp_enabled:
51
+ if (
52
+ job_config.experimental.enable_async_tensor_parallel
53
+ and not job_config.training.compile
54
+ ):
55
+ raise RuntimeError("Async TP requires --training.compile")
56
+ enable_float8_linear = "float8" in job_config.model.converters
57
+ apply_tp(
58
+ model,
59
+ world_mesh["tp"],
60
+ loss_parallel=parallel_dims.loss_parallel_enabled,
61
+ enable_float8=enable_float8_linear,
62
+ enable_async_tp=job_config.experimental.enable_async_tensor_parallel,
63
+ )
64
+
65
+ if job_config.activation_checkpoint.mode != "none":
66
+ apply_ac(model, job_config.activation_checkpoint)
67
+
68
+ # turn on per-block compile after AC wrapping and before FSDP
69
+ if job_config.training.compile:
70
+ apply_compile(model)
71
+
72
+ if (
73
+ parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
74
+ ): # apply FSDP or HSDP, potentially with Context Parallel
75
+ if parallel_dims.dp_replicate_enabled:
76
+ dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
77
+ else:
78
+ dp_mesh_dim_names = ("dp_shard_cp",)
79
+
80
+ apply_fsdp(
81
+ model,
82
+ world_mesh[tuple(dp_mesh_dim_names)],
83
+ param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
84
+ reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
85
+ pp_enabled=parallel_dims.pp_enabled,
86
+ cpu_offload=job_config.training.enable_cpu_offload,
87
+ reshard_after_forward_policy=job_config.training.fsdp_reshard_after_forward,
88
+ )
89
+
90
+ if parallel_dims.dp_replicate_enabled:
91
+ logger.info("Applied HSDP to the model")
92
+ else:
93
+ logger.info("Applied FSDP to the model")
94
+
95
+ if parallel_dims.cp_enabled:
96
+ logger.info("Applied Context Parallel to the model")
97
+
98
+ if job_config.training.enable_cpu_offload:
99
+ logger.info("Applied CPU Offloading to the model")
100
+ elif parallel_dims.dp_replicate_enabled:
101
+ if world_mesh.ndim > 1:
102
+ raise RuntimeError("DDP has not supported > 1D parallelism")
103
+ apply_ddp(
104
+ model,
105
+ world_mesh,
106
+ enable_compile=job_config.training.compile,
107
+ enable_compiled_autograd=job_config.experimental.enable_compiled_autograd,
108
+ )
109
+
110
+
111
+ class TPPlan:
112
+ def __init__(
113
+ self,
114
+ model=None,
115
+ loss_parallel=False,
116
+ enable_float8=False,
117
+ ):
118
+ self.model = model
119
+ self.loss_parallel = loss_parallel
120
+ self.enable_float8 = enable_float8
121
+ self.base_model_prefix = getattr(model, "base_model_prefix", "model")
122
+
123
+ # TODO(vkuzo): once float8 configuration supports delayed scaling,
124
+ # add a check here to enforce supported float8 all-gather configurations
125
+ # TODO(vkuzo): add the items below to __init__.py of torchao.float8 and import from there
126
+ try:
127
+ from torchao.float8.float8_tensor_parallel import (
128
+ Float8ColwiseParallel,
129
+ Float8RowwiseParallel,
130
+ PrepareFloat8ModuleInput
131
+ )
132
+ except ImportError:
133
+ Float8ColwiseParallel = None
134
+ Float8RowwiseParallel = None
135
+ PrepareFloat8ModuleInput = None
136
+ if self.enable_float8 and Float8ColwiseParallel is not None:
137
+ self.rowwise_parallel = Float8RowwiseParallel
138
+ self.colwise_parallel = Float8ColwiseParallel
139
+ self.prepare_module_input = PrepareFloat8ModuleInput
140
+ self.prepare_module_output = PrepareModuleOutput
141
+ else:
142
+ self.rowwise_parallel = RowwiseParallel
143
+ self.colwise_parallel = ColwiseParallel
144
+ self.prepare_module_input = PrepareModuleInput
145
+ self.prepare_module_output = PrepareModuleOutput
146
+
147
+ @property
148
+ def model_plan(self):
149
+ plans = {
150
+ f"{self.base_model_prefix}.embeddings": RowwiseParallel(
151
+ input_layouts=Replicate(),
152
+ output_layouts=Shard(1),
153
+ ),
154
+ f"{self.base_model_prefix}.norm": SequenceParallel(),
155
+ }
156
+ if self.loss_parallel:
157
+ plans.update(
158
+ {
159
+ "lm_head": ColwiseParallel(
160
+ input_layouts=Shard(1),
161
+ output_layouts=Shard(-1) if self.loss_parallel else Replicate(),
162
+ use_local_output=not self.loss_parallel,
163
+ ),
164
+ }
165
+ )
166
+ else:
167
+ plans.update(
168
+ {
169
+ "lm_head": PrepareModuleWeight(layouts=Replicate()),
170
+ "criterion": LinearLossParallel(),
171
+ }
172
+ )
173
+ return plans
174
+
175
+ @property
176
+ def layer_plan(self):
177
+ return {
178
+ "attn_norm": SequenceParallel(),
179
+ **self.attn_plan,
180
+ "mlp_norm": SequenceParallel(),
181
+ **self.mlp_plan,
182
+ }
183
+
184
+ @property
185
+ def attn_plan(self):
186
+ raise NotImplementedError(
187
+ f"TP plans for token mixing layers of {self.model.config.model_type} not implemented"
188
+ )
189
+
190
+ @property
191
+ def mlp_plan(self):
192
+ return {
193
+ "mlp": self.prepare_module_input(
194
+ input_layouts=(Shard(1),),
195
+ desired_input_layouts=(Replicate(),),
196
+ ),
197
+ "mlp.gate_proj": self.colwise_parallel(),
198
+ "mlp.up_proj": self.colwise_parallel(),
199
+ "mlp.down_proj": self.rowwise_parallel(output_layouts=Shard(1)),
200
+ "mlp.swiglu_linear": SwiGLULinearParallel(output_layouts=Shard(1)),
201
+ }
202
+
203
+
204
+ class TransformerTPPlan(TPPlan):
205
+
206
+ @property
207
+ def attn_plan(self):
208
+ return {
209
+ "attn": self.prepare_module_input(
210
+ input_kwarg_layouts={"hidden_states": Shard(1)},
211
+ desired_input_kwarg_layouts={"hidden_states": Replicate()},
212
+ ),
213
+ "attn.q_proj": self.colwise_parallel(),
214
+ "attn.k_proj": self.colwise_parallel(),
215
+ "attn.v_proj": self.colwise_parallel(),
216
+ "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)),
217
+ }
218
+
219
+
220
+ class GLATPPlan(TPPlan):
221
+
222
+ @property
223
+ def attn_plan(self):
224
+ return {
225
+ "attn": self.prepare_module_input(
226
+ input_kwarg_layouts={"hidden_states": Shard(1)},
227
+ desired_input_kwarg_layouts={"hidden_states": Replicate()},
228
+ ),
229
+ "attn.q_proj": self.colwise_parallel(),
230
+ "attn.k_proj": self.colwise_parallel(),
231
+ "attn.v_proj": self.colwise_parallel(),
232
+ "attn.g_proj": self.colwise_parallel(),
233
+ "attn.gk_proj.0": PrepareModuleWeight(layouts=Replicate()),
234
+ "attn.gk_proj.1": self.colwise_parallel(),
235
+ "attn.g_norm": SequenceParallel(sequence_dim=-1),
236
+ "attn.o_proj": self.rowwise_parallel(output_layouts=Shard(1)),
237
+ }
238
+
239
+
240
+ TP_PLAN_MAP = {"transformer": TransformerTPPlan, "gla": GLATPPlan}
241
+
242
+
243
+ def apply_tp(
244
+ model: nn.Module,
245
+ tp_mesh: DeviceMesh,
246
+ loss_parallel: bool,
247
+ enable_float8: bool,
248
+ enable_async_tp: bool,
249
+ ):
250
+ """Apply tensor parallelism."""
251
+ # 1. Parallelize the embedding and shard its outputs (which are the first
252
+ # transformer block's inputs)
253
+ # 2. Parallelize the root norm layer over the sequence dim
254
+ # 3. Parallelize the final linear output layer
255
+ tp_plan = TP_PLAN_MAP[model.config.model_type](
256
+ model, loss_parallel=loss_parallel, enable_float8=enable_float8
257
+ )
258
+ parallelize_module(model, tp_mesh, tp_plan.model_plan)
259
+
260
+ blocks = get_blocks(model)
261
+ if blocks is None:
262
+ logger.warning("No block found for tensor parallelism")
263
+ else:
264
+ for _, block in enumerate(blocks):
265
+ parallelize_module(
266
+ module=block,
267
+ device_mesh=tp_mesh,
268
+ parallelize_plan=tp_plan.layer_plan,
269
+ )
270
+
271
+ if enable_async_tp:
272
+ from torch.distributed._symmetric_memory import enable_symm_mem_for_group
273
+
274
+ torch._inductor.config._micro_pipeline_tp = True
275
+ enable_symm_mem_for_group(tp_mesh.get_group().group_name)
276
+
277
+ logger.info(
278
+ f"Applied {'Float8 ' if enable_float8 else ''}{'Async ' if enable_async_tp else ''}"
279
+ "Tensor Parallelism to the model"
280
+ )
281
+
282
+
283
+ # for selective op activation checkpointing
284
+ _save_list = {
285
+ torch.ops.aten.mm.default,
286
+ torch.ops.aten._scaled_dot_product_efficient_attention.default,
287
+ torch.ops.aten._scaled_dot_product_flash_attention.default,
288
+ torch.ops._c10d_functional.reduce_scatter_tensor.default,
289
+ # for low precision training, it's useful to always save
290
+ # the result of max, since the absolute maximum is
291
+ # used to compute the scaling factor for quantization.
292
+ torch.ops.aten.max.default,
293
+ }
294
+
295
+
296
+ def _apply_ac_to_block(module: nn.Module, ac_config):
297
+ valid_ac_modes = ("full", "selective")
298
+ if ac_config.mode not in valid_ac_modes:
299
+ raise ValueError(
300
+ f"Invalid AC mode: {ac_config.mode}. Valid modes: {valid_ac_modes}"
301
+ )
302
+
303
+ if ac_config.mode == "full":
304
+ return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
305
+
306
+ assert ac_config.mode == "selective", f"{ac_config.mode}"
307
+ use_op_sac = ac_config.selective_ac_option == "op"
308
+ use_layer_sac = ac_config.selective_ac_option.isdigit()
309
+ if not use_op_sac and not use_layer_sac:
310
+ raise ValueError(
311
+ f"Invalid selective AC option: {ac_config.selective_ac_option}. "
312
+ f"Valid options: 'op' or a positive int representing layer frequency"
313
+ )
314
+ if use_op_sac:
315
+ from torch.utils.checkpoint import CheckpointPolicy, create_selective_checkpoint_contexts
316
+
317
+ def _get_custom_policy(meta):
318
+ def _custom_policy(ctx, func, *args, **kwargs):
319
+ mode = "recompute" if ctx.is_recompute else "forward"
320
+ mm_count_key = f"{mode}_mm_count"
321
+ if func == torch.ops.aten.mm.default:
322
+ meta[mm_count_key] += 1
323
+ # Saves output of all compute ops, except every second mm
324
+ to_save = func in _save_list and not (
325
+ func == torch.ops.aten.mm.default and meta[mm_count_key] % 2 == 0
326
+ )
327
+ return (
328
+ CheckpointPolicy.MUST_SAVE
329
+ if to_save
330
+ else CheckpointPolicy.PREFER_RECOMPUTE
331
+ )
332
+
333
+ return _custom_policy
334
+
335
+ def selective_checkpointing_context_fn():
336
+ meta = defaultdict(int)
337
+ return create_selective_checkpoint_contexts(_get_custom_policy(meta))
338
+
339
+ return ptd_checkpoint_wrapper(
340
+ module,
341
+ context_fn=selective_checkpointing_context_fn,
342
+ preserve_rng_state=False,
343
+ )
344
+ elif use_layer_sac:
345
+ # Checkpoint every `ac_freq` of the modules passed to this function
346
+ ac_freq = int(ac_config.selective_ac_option)
347
+ ptd_checkpoint_wrapper.__dict__.setdefault("_count", 0)
348
+ ptd_checkpoint_wrapper._count += 1
349
+ if not ac_freq or ptd_checkpoint_wrapper._count % ac_freq == 0:
350
+ return ptd_checkpoint_wrapper(module, preserve_rng_state=False)
351
+ else:
352
+ return module
353
+
354
+
355
+ def apply_ac(model: nn.Module, ac_config):
356
+ """Apply activation checkpointing to the model."""
357
+ blocks = get_blocks(model)
358
+ if blocks is None:
359
+ logger.warning("No block found for activation checkpointing")
360
+ return
361
+
362
+ for layer_id, block in blocks.named_children():
363
+ block = _apply_ac_to_block(block, ac_config)
364
+ blocks.register_module(layer_id, block)
365
+
366
+ logger.info(f"Applied {ac_config.mode} activation checkpointing to the model")
367
+
368
+
369
+ def apply_compile(model: nn.Module):
370
+ """
371
+ Apply torch.compile to each block, which makes compilation efficient due to
372
+ repeated structure. Alternatively one can compile the whole model (after applying DP).
373
+ """
374
+
375
+ blocks = get_blocks(model)
376
+ if blocks is None:
377
+ logger.warning("No block found for torch.compile")
378
+ else:
379
+ for layer_id, block in blocks.named_children():
380
+ block = torch.compile(block)
381
+ blocks.register_module(layer_id, block)
382
+ logger.info("Compiling each block with torch.compile")
383
+
384
+ real_model = get_model(model)
385
+
386
+ logger.info("Compiling the embedding, norm, and lm_head layers with torch.compile")
387
+ embeddings_key = get_components_name(real_model, "tok_embeddings")
388
+ if embeddings_key is not None:
389
+ embeddings = torch.compile(getattr(real_model, embeddings_key), fullgraph=True)
390
+ real_model.register_module(embeddings_key, embeddings)
391
+
392
+ norm_key = get_components_name(real_model, "norm")
393
+ if norm_key is not None:
394
+ norm = torch.compile(getattr(real_model, norm_key), fullgraph=True)
395
+ real_model.register_module(norm_key, norm)
396
+
397
+ lm_head_key = get_components_name(model, "lm_head")
398
+ if lm_head_key is not None:
399
+ lm_head = torch.compile(getattr(model, lm_head_key), fullgraph=True)
400
+ model.register_module(lm_head_key, lm_head)
401
+
402
+ logger.info("Compiling the entire model with torch.compile")
403
+ model = torch.compile(model)
404
+
405
+
406
+ def apply_fsdp(
407
+ model: nn.Module,
408
+ dp_mesh: DeviceMesh,
409
+ param_dtype: torch.dtype,
410
+ reduce_dtype: torch.dtype,
411
+ pp_enabled: bool,
412
+ cpu_offload: bool = False,
413
+ reshard_after_forward_policy: str = "default",
414
+ ):
415
+ """
416
+ Apply data parallelism (via FSDP2) to the model.
417
+
418
+ Args:
419
+ model (nn.Module): The model to apply data parallelism to.
420
+ dp_mesh (DeviceMesh): The device mesh to use for data parallelism.
421
+ param_dtype (torch.dtype): The data type to use for model parameters.
422
+ reduce_dtype (torch.dtype): The data type to use for reduction operations.
423
+ pp_enabled (bool): Whether pipeline parallelism is enabled.
424
+ cpu_offload (bool, optional): Whether to offload model parameters to CPU. Defaults to False.
425
+ reshard_after_forward_policy (str, optional):
426
+ The policy to use for resharding after forward pass. Defaults to "default".
427
+ Other options: "never", "always".
428
+ - "default" applies default resharding behavior, implementing "smart defaults" for known optimal scenarios.
429
+ - "always" will enable `reshard_after_forward` for all forward passes.
430
+ - "never" will disable `reshard_after_forward` for all forward passes.
431
+
432
+ """
433
+ mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype)
434
+ fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
435
+ if cpu_offload:
436
+ fsdp_config["offload_policy"] = CPUOffloadPolicy()
437
+
438
+ blocks = get_blocks(model)
439
+ if blocks is None:
440
+ logger.warning("No block found for FSDP")
441
+ else:
442
+ total_blocks = len(blocks)
443
+ for layer_id, block in enumerate(blocks):
444
+ if reshard_after_forward_policy == "always":
445
+ reshard_after_forward = True
446
+ elif reshard_after_forward_policy == "never":
447
+ reshard_after_forward = False
448
+ elif reshard_after_forward_policy == "default":
449
+ if pp_enabled:
450
+ # For PP, do not reshard after forward to avoid per-microbatch
451
+ # all-gathers, which can be expensive and non-overlapped
452
+ reshard_after_forward = False
453
+ else:
454
+ # As an optimization, do not reshard after forward for the last
455
+ # transformer block since FSDP would prefetch it immediately
456
+ reshard_after_forward = int(layer_id) < total_blocks - 1
457
+ else:
458
+ raise ValueError(
459
+ f"Invalid reshard_after_forward_policy: {reshard_after_forward_policy}."
460
+ )
461
+ fully_shard(
462
+ block,
463
+ **fsdp_config,
464
+ reshard_after_forward=reshard_after_forward,
465
+ )
466
+
467
+ fully_shard(model, **fsdp_config, reshard_after_forward=not pp_enabled)
468
+
469
+
470
+ def apply_ddp(
471
+ model: nn.Module,
472
+ dp_mesh: DeviceMesh,
473
+ enable_compile: bool,
474
+ enable_compiled_autograd: bool,
475
+ ):
476
+ if enable_compile:
477
+ if enable_compiled_autograd:
478
+ torch._dynamo.config.optimize_ddp = (
479
+ "python_reducer_without_compiled_forward"
480
+ )
481
+ else:
482
+ torch._dynamo.config.optimize_ddp = "ddp_optimizer"
483
+
484
+ replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100)
485
+
486
+ logger.info("Applied DDP to the model")
487
+
488
+
489
+ def get_model(model):
490
+ base_model_prefix = getattr(model, "base_model_prefix", "model")
491
+ if not hasattr(model, base_model_prefix):
492
+ return None
493
+ model = getattr(model, base_model_prefix)
494
+ return model
495
+
496
+
497
+ def get_blocks(model):
498
+ # TODO[flame]: adapt for network not using 'layers' attribute
499
+ model = get_model(model)
500
+ if not hasattr(model, "layers"):
501
+ logger.warning('no "layers" in model can be found')
502
+ return None
503
+ return model.layers
504
+
505
+
506
+ def get_components_name(model, component_name):
507
+ """
508
+ We try to catch tok_embeddings, norm layers and lm_head layers
509
+ We do not catch the layer names in the blocks, for blocks see `get_blocks`
510
+ We assume the model has the following structure:
511
+ LlamaForCausalLM:
512
+ Model:
513
+ embed_tokens,
514
+ layers,
515
+ norm,
516
+ lm_head
517
+ ***
518
+ so, to search 'tok_embeddings' and 'norm' we need to pass `get_model(model)`
519
+ and for 'lm_head' we need to pass `model`
520
+ ***
521
+ """
522
+
523
+ if component_name == "tok_embeddings":
524
+ if hasattr(model, "tok_embeddings"):
525
+ return "tok_embeddings"
526
+ elif hasattr(model, "embed_tokens"):
527
+ return "embed_tokens"
528
+ elif hasattr(model, "embeddings"):
529
+ return "embeddings"
530
+ else:
531
+ logger.warning("No tok_embeddings found in model")
532
+ return None
533
+
534
+ elif component_name == "norm":
535
+ if hasattr(model, "norm"):
536
+ return "norm"
537
+ elif hasattr(model, "norms"):
538
+ return "norms"
539
+ elif hasattr(model, "layernorm"):
540
+ return "layernorm"
541
+ else:
542
+ logger.warning("No norm found in model")
543
+ return None
544
+
545
+ elif component_name == "lm_head":
546
+ if hasattr(model, "lm_head"):
547
+ return "lm_head"
548
+ else:
549
+ logger.warning("No lm_head found in model")
550
+ return None
flame/models/pipeline_fla.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This file applies the PT-D pipeline parallelism to the Llama model.
8
+
9
+ import copy
10
+ from typing import Callable, Optional, Union
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ from torch.distributed import DeviceMesh
15
+ from torch.distributed.pipelining import PipelineStage
16
+ from torch.distributed.pipelining.schedules import ScheduleZBVZeroBubble, _PipelineSchedule, get_schedule_class
17
+ from transformers import PretrainedConfig
18
+
19
+ from flame.models.parallelize_fla import get_blocks, get_components_name, get_model
20
+ from torchtitan.config_manager import JobConfig
21
+ from torchtitan.distributed.parallel_dims import ParallelDims
22
+ from torchtitan.distributed.pipeline import build_pipeline_schedule, generate_split_points, stage_ids_this_rank
23
+ from torchtitan.tools.logging import logger
24
+
25
+ DeviceType = Union[int, str, torch.device]
26
+
27
+
28
+ def pipeline_fla(
29
+ model: nn.Module,
30
+ pp_mesh: DeviceMesh,
31
+ parallel_dims: ParallelDims,
32
+ job_config: JobConfig,
33
+ device: DeviceType,
34
+ model_config: PretrainedConfig,
35
+ loss_fn: Callable[..., torch.Tensor],
36
+ ) -> tuple[_PipelineSchedule, list[nn.Module], bool, bool]:
37
+ stages, models = pipeline_fla_manual_split(
38
+ model, pp_mesh, parallel_dims, job_config, device, model_config
39
+ )
40
+
41
+ pp_schedule = build_pipeline_schedule(job_config, stages, loss_fn)
42
+
43
+ # This is used in the train loop to determine whether to pass in the input_ids and labels
44
+ has_first_stage = False
45
+ has_last_stage = False
46
+ for stage in stages:
47
+ if stage.is_first:
48
+ has_first_stage = True
49
+ if stage.is_last:
50
+ has_last_stage = True
51
+
52
+ return pp_schedule, models, has_first_stage, has_last_stage
53
+
54
+
55
+ def pipeline_fla_manual_split(
56
+ whole_model: nn.Module,
57
+ pp_mesh: DeviceMesh,
58
+ parallel_dims: ParallelDims,
59
+ job_config: JobConfig,
60
+ device: DeviceType,
61
+ model_config: PretrainedConfig,
62
+ ) -> tuple[list[PipelineStage], list[nn.Module]]:
63
+ """
64
+ This API extracts one torch.nn.Module objects for the part of the model configured to run inside this stage.
65
+
66
+ It wraps the model chunk in a ManualPipelineStage object and returns both the stage and model objects.
67
+
68
+ The stage object is used to create a pipeline schedule, and the model object can be used for applying SPMD
69
+ parallelism.
70
+ """
71
+ pp_rank = pp_mesh.get_local_rank()
72
+ pp_size = pp_mesh.size()
73
+
74
+ splits = (
75
+ job_config.experimental.pipeline_parallel_split_points
76
+ or generate_split_points(
77
+ job_config, parallel_dims.pp, model_config.num_hidden_layers
78
+ )
79
+ )
80
+
81
+ def _build_stage(
82
+ stage_idx: int,
83
+ start_layer: Optional[str],
84
+ stop_layer: Optional[str],
85
+ is_first: bool = False,
86
+ is_last: bool = False,
87
+ ) -> tuple[PipelineStage, nn.Module]:
88
+ model = copy.deepcopy(whole_model)
89
+ if not is_first:
90
+ # we do `model.tok_embeddings = None` here
91
+ real_model = get_model(model)
92
+ tok_embeddings_name = get_components_name(real_model, "tok_embeddings")
93
+ setattr(real_model, tok_embeddings_name, None)
94
+
95
+ drop_layers = start_layer is not None
96
+ # Get module dictionary from get_blocks(model)
97
+ # and Create a list of keys before modifying dictionary
98
+ module_dict = get_blocks(model)._modules # Store reference
99
+ layer_names = list(module_dict.keys())
100
+
101
+ # Iterate over the list of keys instead of `_modules.items()`
102
+ for name in layer_names:
103
+ # Dynamically determine prefix (blocks.* or layers.*)
104
+ prefix = start_layer.split(".")[0] if start_layer else "layers"
105
+ layer_name = f"{prefix}.{name}" # Construct the correct name format
106
+
107
+ # Ensure `drop_layers` activation is based on actual naming
108
+ if layer_name == start_layer:
109
+ drop_layers = False
110
+ if layer_name == stop_layer:
111
+ drop_layers = True
112
+
113
+ # Delete layer if drop_layers is active
114
+ if drop_layers:
115
+ del module_dict[name] # Safe deletion from stored dictionary
116
+
117
+ if not is_last:
118
+ # we do `model.norm = None` and `model.output = None`
119
+ real_model = get_model(model)
120
+ norm_name = get_components_name(real_model, "norm")
121
+ setattr(real_model, norm_name, None)
122
+
123
+ head_name = get_components_name(model, "lm_head")
124
+ setattr(model, head_name, None)
125
+
126
+ stage = PipelineStage(
127
+ model,
128
+ stage_idx,
129
+ num_stages,
130
+ device,
131
+ group=pp_mesh.get_group("pp"),
132
+ )
133
+ return stage, model
134
+
135
+ num_stages = len(splits) + 1
136
+ stage_idx = pp_rank
137
+
138
+ stages = []
139
+ models = []
140
+
141
+ schedule_class = get_schedule_class(
142
+ job_config.experimental.pipeline_parallel_schedule
143
+ )
144
+ style = "v" if schedule_class == ScheduleZBVZeroBubble else "loop"
145
+
146
+ for stage_idx in stage_ids_this_rank(pp_rank, pp_size, num_stages, style=style):
147
+ start_layer = splits[stage_idx - 1] if stage_idx > 0 else None
148
+ stop_layer = splits[stage_idx] if stage_idx < num_stages - 1 else None
149
+ stage, model_chunk = _build_stage(
150
+ stage_idx,
151
+ start_layer,
152
+ stop_layer,
153
+ is_first=stage_idx == 0,
154
+ is_last=stage_idx == num_stages - 1,
155
+ )
156
+ logger.info(
157
+ f"PP rank {pp_rank} is building stage_idx {stage_idx}"
158
+ f" with start_layer {start_layer}, stop_layer {stop_layer}"
159
+ )
160
+ stages.append(stage)
161
+ models.append(model_chunk)
162
+ return stages, models
flame/tools/__pycache__/utils.cpython-312.pyc ADDED
Binary file (2.14 kB). View file
 
flame/tools/utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from torch import nn
8
+ from torchtitan.tools.logging import logger
9
+
10
+
11
def get_nparams_and_flops(model: nn.Module, model_config, seq_len: int) -> tuple[int, int]:
    """Estimate model size and per-token training FLOPs.

    Args:
        model: the model whose parameters are counted.
        model_config: HF-style config; must provide ``num_hidden_layers`` and
            ``hidden_size``, and ideally ``num_heads`` or ``num_attention_heads``.
        seq_len: training sequence length, used for the attention FLOPs term.

    Returns:
        A tuple ``(nparams, num_flops_per_token)``.
    """
    nparams = sum(p.numel() for p in model.parameters())
    # Scan ALL submodules, not just direct children: HF-style causal-LM models
    # nest the token embedding (e.g. under `model.embed_tokens`), which
    # `model.children()` would miss, silently overcounting the 6N term below.
    nparams_embedding = sum(
        sum(p.numel() for p in m.parameters())
        for m in model.modules()
        if isinstance(m, nn.Embedding)
    )

    if hasattr(model_config, "num_heads"):
        num_heads = model_config.num_heads
    elif hasattr(model_config, "num_attention_heads"):
        num_heads = model_config.num_attention_heads
    else:
        num_heads = 1
        logger.warning("num_heads not found in model_config, defaulting to 1. ")

    l, h, q, t = (
        model_config.num_hidden_layers,
        num_heads,
        model_config.hidden_size // num_heads,
        seq_len,
    )
    # Reasoning behind the factor of 12 for the self-attention part of the formula:
    # 1. each self-attention has 2 matmul in the forward and 4 in the backward (6)
    # 2. the flash attention does 1 more matmul recomputation in the backward
    #    but recomputation should not be counted in calculating MFU (+0)
    # 3. each matmul performs 1 multiplication and 1 addition (*2)
    # 4. we follow the convention and do not account for sparsity in causal attention
    num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t

    return nparams, num_flops_per_token
flame/utils/__init__.py ADDED
File without changes
flame/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (136 Bytes). View file
 
flame/utils/__pycache__/convert_dcp_to_hf.cpython-312.pyc ADDED
Binary file (3.73 kB). View file
 
flame/utils/convert_dcp_to_hf.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ import argparse
5
+ import io
6
+ import os
7
+ import tempfile
8
+ from datetime import timedelta
9
+
10
+ import torch
11
+ import torch.serialization
12
+ from torch.distributed.checkpoint.format_utils import dcp_to_torch_save
13
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
14
+
15
+ import fla # noqa
16
+ from torchtitan.tools.logging import init_logger, logger
17
+
18
+
19
@torch.inference_mode()
def save_pretrained(
    path: str,
    step: int,
    config: str,
    tokenizer: str
):
    """Export a distributed (DCP) training checkpoint as a HF model directory.

    The config and tokenizer are copied into `path`, the DCP shards under
    `path`/checkpoint/step-{step} are consolidated into a single torch
    checkpoint inside a temporary directory, and the resulting weights are
    written back to `path` via `save_pretrained`.
    """
    logger.info(f"Loading the config from {config}")
    hf_config = AutoConfig.from_pretrained(config, trust_remote_code=True)

    logger.info(f"Saving the config to {path}")
    hf_config.save_pretrained(path)
    logger.info(f"Loading the tokenizer from {tokenizer}")
    hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer, trust_remote_code=True)
    logger.info(f"Saving the tokenizer to {path}")
    hf_tokenizer.save_pretrained(path)

    with tempfile.TemporaryDirectory() as staging_dir:
        dcp_dir = os.path.join(path, f'checkpoint/step-{step}')
        torch_ckpt = os.path.join(staging_dir, 'checkpoint.pt')
        logger.info(f"Saving the distributed checkpoint to {torch_ckpt}")
        dcp_to_torch_save(dcp_dir, torch_ckpt)

        logger.info(f"Initializing the model from config\n{hf_config}")
        model = AutoModelForCausalLM.from_config(hf_config)
        logger.info(model)
        logger.info("Loading state dict from the checkpoint")

        # Allowlist the non-tensor types stored in the checkpoint so that
        # torch.load with its default weights_only=True can deserialize it.
        torch.serialization.add_safe_globals([timedelta, io.BytesIO])
        model.load_state_dict(torch.load(torch_ckpt, map_location='cpu')['model'])

        logger.info(f"Saving the model to {path}")
        model.save_pretrained(path)
56
+
57
+
58
if __name__ == "__main__":
    init_logger()
    parser = argparse.ArgumentParser("Convert DCP format model weights to huggingface-style.")
    # All four arguments are mandatory; see `save_pretrained` for their meaning.
    for flag, flag_type in (
        ("--path", str),
        ("--step", int),
        ("--config", str),
        ("--tokenizer", str),
    ):
        parser.add_argument(flag, type=flag_type, required=True)
    args = parser.parse_args()
    save_pretrained(args.path, args.step, args.config, args.tokenizer)
flame/utils/convert_hf_to_dcp.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ import argparse
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ import torch.distributed.checkpoint as DCP
9
+ from transformers import AutoModelForCausalLM
10
+
11
+ import fla # noqa
12
+ from torchtitan.tools.logging import init_logger, logger
13
+
14
+
15
@torch.inference_mode()
def convert_hf_weights(model: str, checkpoint: Path) -> None:
    """Convert a huggingface-style model into a DCP checkpoint directory.

    Args:
        model: model name or local path accepted by ``from_pretrained``.
        checkpoint: output directory for the DCP shards (created if missing).
            Note: this must be a ``pathlib.Path`` — ``.mkdir`` is called on it;
            the CLI below passes ``type=Path``.
    """
    logger.info(f"Loading model from {model}")
    # `model` is rebound from the name/path string to the loaded module.
    model = AutoModelForCausalLM.from_pretrained(model)
    state_dict = model.state_dict()

    logger.info(f"Writing to DCP at '{checkpoint}'")
    checkpoint.mkdir(parents=True, exist_ok=True)
    # thread_count=8: parallelize shard writing for large state dicts.
    storage_writer = DCP.filesystem.FileSystemWriter(checkpoint, thread_count=8)
    DCP.save({"model": state_dict}, storage_writer=storage_writer)
25
+
26
+
27
if __name__ == "__main__":
    init_logger()
    parser = argparse.ArgumentParser(
        description="Convert huggingface-style model weights to DCP format."
    )
    parser.add_argument("--model", type=str, required=True)
    parser.add_argument("--checkpoint", type=Path, required=True)
    cli_args = parser.parse_args()

    convert_hf_weights(cli_args.model, cli_args.checkpoint)
flame/utils/hf_utils.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from huggingface_hub import HfApi, HfFolder, logging as hf_logging, create_repo
4
+ from torchtitan.tools.logging import logger
5
+
6
def upload_checkpoint_to_hf(
    local_path: str,
    step: int,
    hf_repo_id_for_run: str,
    hf_keep_latest_k: int,
    upload_format: str
) -> None:
    """Uploads a checkpoint directory to HF Hub and manages retention.

    Args:
        local_path: Directory containing the checkpoint files to upload.
        step: Training step number; used as the remote folder name ``step-{step}``.
        hf_repo_id_for_run: Target Hub repository id (created if missing).
        hf_keep_latest_k: If > 0, delete all but the newest k ``step-*`` folders
            on the Hub after the upload attempt.
        upload_format: Label used only in the commit message (e.g. "dcp" or "hf").
    """
    if not os.path.isdir(local_path):
        logger.error(f"Local path for upload does not exist or is not a directory: {local_path}")
        return

    api = HfApi()
    token = HfFolder.get_token()
    if not token:
        logger.warning("Hugging Face Hub token not found. Skipping upload. Login via `huggingface-cli login` or set HF_TOKEN.")
        return

    # --- Ensure the specific repository for this run exists ---
    try:
        logger.info(f"Ensuring repository {hf_repo_id_for_run} exists...")
        # Use create_repo which handles creation only if it doesn't exist
        create_repo(repo_id=hf_repo_id_for_run, token=token, repo_type="model", exist_ok=True)
        logger.info(f"Repository {hf_repo_id_for_run} ensured.")
    except Exception as e:
        logger.error(f"Failed to create or ensure repository {hf_repo_id_for_run}: {e}", exc_info=True)
        return # Stop if repo interaction fails

    commit_message = f"Upload {upload_format.upper()} checkpoint step {step}"
    path_in_repo = f"step-{step}"

    logger.info(f"Uploading {local_path} to {hf_repo_id_for_run}/{path_in_repo} on Hugging Face Hub...")
    try:
        api.upload_folder(
            folder_path=local_path,
            path_in_repo=path_in_repo,
            repo_id=hf_repo_id_for_run,
            repo_type="model",
            commit_message=commit_message,
            token=token,
        )
        logger.info(f"Successfully uploaded step {step} to {hf_repo_id_for_run}.")
    except Exception as e:
        logger.error(f"Failed to upload checkpoint step {step} to {hf_repo_id_for_run}: {e}", exc_info=True)
    # NOTE(review): the retention cleanup below runs even when the upload above
    # failed — confirm this is intended (a failed upload can still delete older
    # step-* folders on the Hub).
    if hf_keep_latest_k > 0:
        logger.info(f"Cleaning up old checkpoints on {hf_repo_id_for_run}, keeping latest {hf_keep_latest_k}")
        try:
            repo_files = api.list_repo_tree(hf_repo_id_for_run, repo_type="model", token=token, recursive=False)
            # Top-level entries named "step-<int>" are checkpoint snapshots.
            step_folders = [
                item.path for item in repo_files
                if item.path.startswith("step-") and item.path[5:].isdigit()
            ]

            # Newest first, so the slice beyond k is what gets deleted.
            step_folders.sort(key=lambda x: int(x.split('-')[1]), reverse=True)

            if len(step_folders) > hf_keep_latest_k:
                folders_to_delete = step_folders[hf_keep_latest_k:]
                logger.info(f"Found {len(step_folders)} checkpoints on Hub. Deleting {len(folders_to_delete)} older ones: {folders_to_delete}")
                for folder in folders_to_delete:
                    # Deleting requires repo_id, path_in_repo, and token
                    api.delete_folder(
                        repo_id=hf_repo_id_for_run,
                        path_in_repo=folder,
                        repo_type="model",
                        commit_message=f"Delete old checkpoint {folder}",
                        token=token
                    )
                logger.info("Hub cleanup complete.")
            else:
                logger.info("No old checkpoints found on Hub to delete.")
        except Exception as e:
            logger.error(f"Error during Hub checkpoint cleanup for {hf_repo_id_for_run}: {e}", exc_info=True)
logs/none_g37i6vbo/attempt_0/6/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/none_lyv0rec_/attempt_0/0/stdout.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-09-10T00:25:50.402942Z  WARN Status Code: 502. Retrying..., request_id: ""
2
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
3
+
4
+ 2025-09-10T00:25:50.448322Z  WARN Status Code: 502. Retrying..., request_id: ""
5
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
6
+
7
+ 2025-09-10T00:26:01.892901Z  WARN Status Code: 504. Retrying..., request_id: ""
8
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
9
+
10
+ 2025-09-10T00:26:01.894451Z  WARN Status Code: 504. Retrying..., request_id: ""
11
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
12
+
13
+ 2025-09-10T00:26:46.358405Z  WARN Status Code: 504. Retrying..., request_id: ""
14
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
15
+
16
+ 2025-09-10T00:26:50.304225Z  WARN Status Code: 502. Retrying..., request_id: ""
17
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
18
+
19
+ 2025-09-10T00:27:00.830860Z  WARN Status Code: 504. Retrying..., request_id: ""
20
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
21
+
22
+ 2025-09-10T00:28:33.662622Z  WARN Status Code: 502. Retrying..., request_id: ""
23
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
24
+
25
+ 2025-09-10T00:37:21.678500Z  WARN Status Code: 502. Retrying..., request_id: ""
26
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
27
+
28
+ 2025-09-10T00:37:33.396089Z  WARN Status Code: 504. Retrying..., request_id: ""
29
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
30
+
31
+ 2025-09-10T00:38:21.672469Z  WARN Status Code: 502. Retrying..., request_id: ""
32
+ at /home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs:220
33
+
logs/none_lyv0rec_/attempt_0/7/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/none_lyv0rec_/attempt_0/7/stdout.log ADDED
File without changes
tb/20250909-0619/wandb/debug.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
2
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Configure stats pid to 795439
3
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/.config/wandb/settings
4
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/flame/wandb/settings
5
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():703] Logging user logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log
7
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log
8
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():830] calling init triggers
9
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():871] starting backend
12
+ 2025-09-09 06:19:20,025 INFO MainThread:795439 [wandb_init.py:init():874] sending inform_init request
13
+ 2025-09-09 06:19:20,027 INFO MainThread:795439 [wandb_init.py:init():882] backend started and connected
14
+ 2025-09-09 06:19:20,033 INFO MainThread:795439 [wandb_init.py:init():953] updated telemetry
15
+ 2025-09-09 06:19:20,039 INFO MainThread:795439 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
16
+ 2025-09-09 06:19:20,682 INFO MainThread:795439 [wandb_init.py:init():1029] starting run threads in backend
17
+ 2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_console_start():2458] atexit reg
18
+ 2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2306] redirect: wrap_raw
19
+ 2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2375] Wrapping output streams.
20
+ 2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2398] Redirects installed.
21
+ 2025-09-09 06:19:20,817 INFO MainThread:795439 [wandb_init.py:init():1075] run started, returning control to user process
tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/files/requirements.txt ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flame==0.1.0
2
+ pluggy==1.6.0
3
+ triton==3.2.0
4
+ sympy==1.13.1
5
+ wcwidth==0.2.13
6
+ nvidia-cusolver-cu12==11.6.1.9
7
+ peft==0.17.0
8
+ smart_open==7.3.0.post1
9
+ cymem==2.0.11
10
+ spacy-legacy==3.0.12
11
+ h11==0.16.0
12
+ pytablewriter==1.2.1
13
+ idna==3.10
14
+ regex==2025.7.34
15
+ antlr4-python3-runtime==4.13.2
16
+ wandb==0.21.0
17
+ nvidia-cuda-cupti-cu12==12.4.127
18
+ sentencepiece==0.2.1
19
+ zstandard==0.23.0
20
+ pybind11==3.0.0
21
+ inquirerpy==0.3.4
22
+ contourpy==1.3.3
23
+ Pygments==2.19.2
24
+ sniffio==1.3.1
25
+ Jinja2==3.1.6
26
+ packaging==25.0
27
+ Markdown==3.8.2
28
+ astunparse==1.6.3
29
+ spacy==3.8.7
30
+ pyparsing==3.2.3
31
+ networkx==3.5
32
+ ninja==1.11.1.4
33
+ tf-slim==1.1.0
34
+ PyYAML==6.0.2
35
+ smmap==5.0.2
36
+ tiktoken==0.9.0
37
+ flatbuffers==25.2.10
38
+ tensorflow==2.20.0
39
+ langcodes==3.5.0
40
+ nvidia-cuda-nvrtc-cu12==12.4.127
41
+ numexpr==2.11.0
42
+ charset-normalizer==3.4.3
43
+ frozenlist==1.7.0
44
+ setuptools==80.9.0
45
+ cycler==0.12.1
46
+ weasel==0.4.1
47
+ tzdata==2025.2
48
+ sacrebleu==2.5.1
49
+ rouge_score==0.1.2
50
+ requests==2.32.5
51
+ nvidia-nvjitlink-cu12==12.4.127
52
+ grpcio==1.74.0
53
+ nvidia-cusparse-cu12==12.3.1.170
54
+ mdurl==0.1.2
55
+ pandas==2.3.1
56
+ preshed==3.0.10
57
+ attrs==25.3.0
58
+ tensorboard-data-server==0.7.2
59
+ aiohappyeyeballs==2.6.1
60
+ keras==3.11.2
61
+ wrapt==1.17.3
62
+ aiosignal==1.4.0
63
+ tcolorpy==0.1.7
64
+ platformdirs==4.3.8
65
+ tqdm-multiprocess==0.0.11
66
+ python-dotenv==1.1.1
67
+ wasabi==1.1.3
68
+ google-pasta==0.2.0
69
+ optree==0.17.0
70
+ MarkupSafe==3.0.2
71
+ colorlog==6.9.0
72
+ nvidia-cufft-cu12==11.2.1.3
73
+ lm_eval==0.4.9.1
74
+ lxml==6.0.0
75
+ protobuf==6.32.0
76
+ radgraph==0.1.18
77
+ scipy==1.16.1
78
+ click==8.2.1
79
+ wheel==0.45.1
80
+ marisa-trie==1.3.0
81
+ pathvalidate==3.3.1
82
+ nvidia-nccl-cu12==2.21.5
83
+ evaluate==0.4.5
84
+ nvidia-cuda-runtime-cu12==12.4.127
85
+ transformers==4.51.3
86
+ aenum==3.1.15
87
+ typing-inspection==0.4.1
88
+ gitdb==4.0.12
89
+ iniconfig==2.1.0
90
+ multidict==6.6.3
91
+ huggingface-hub==0.34.4
92
+ tokenizers==0.21.4
93
+ tabledata==1.3.4
94
+ mbstrdecoder==1.1.4
95
+ Werkzeug==3.1.3
96
+ accelerate==1.10.0
97
+ hf-xet==1.1.8
98
+ tensorboard==2.20.0
99
+ ml_dtypes==0.5.3
100
+ pytest==8.4.1
101
+ namex==0.1.0
102
+ pillow==11.3.0
103
+ datasets==3.6.0
104
+ tqdm==4.67.1
105
+ murmurhash==1.0.13
106
+ fonttools==4.59.1
107
+ absl-py==2.3.1
108
+ multiprocess==0.70.16
109
+ fsspec==2025.3.0
110
+ transformers==4.51.3
111
+ dill==0.3.8
112
+ propcache==0.3.2
113
+ jsonpickle==4.1.1
114
+ BLEURT==0.0.2
115
+ yarl==1.20.1
116
+ portalocker==3.2.0
117
+ httpx==0.27.2
118
+ numpy==2.3.2
119
+ mpmath==1.3.0
120
+ pyarrow==21.0.0
121
+ matplotlib==3.10.5
122
+ typepy==1.3.4
123
+ pycountry==24.6.1
124
+ word2number==1.1
125
+ psutil==7.0.0
126
+ catalogue==2.0.10
127
+ latex2sympy2_extended==1.0.6
128
+ pydantic_core==2.33.2
129
+ threadpoolctl==3.6.0
130
+ spacy-loggers==1.0.5
131
+ certifi==2025.8.3
132
+ confection==0.1.5
133
+ flame==0.1.0
134
+ pfzy==0.3.4
135
+ safetensors==0.6.2
136
+ pip==25.1
137
+ DataProperty==1.1.0
138
+ lighteval==0.10.1.dev0
139
+ jsonlines==4.0.0
140
+ scikit-learn==1.7.1
141
+ torch==2.6.0
142
+ pytz==2025.2
143
+ python-dateutil==2.9.0.post0
144
+ nltk==3.9.1
145
+ sqlitedict==2.1.0
146
+ gast==0.6.0
147
+ nvidia-curand-cu12==10.3.5.147
148
+ rich==14.1.0
149
+ sentry-sdk==2.33.2
150
+ nvidia-cusparselt-cu12==0.6.2
151
+ kiwisolver==1.4.9
152
+ appdirs==1.4.4
153
+ bert-score==0.3.13
154
+ blis==1.3.0
155
+ GitPython==3.1.45
156
+ chardet==5.2.0
157
+ more-itertools==10.7.0
158
+ filelock==3.19.1
159
+ transformers==4.51.3
160
+ httpcore==1.0.9
161
+ termcolor==3.1.0
162
+ typer==0.16.1
163
+ einops==0.8.1
164
+ torchdata==0.11.0
165
+ six==1.17.0
166
+ colorama==0.4.6
167
+ aiohttp==3.12.14
168
+ srsly==2.5.1
169
+ urllib3==2.5.0
170
+ nvidia-cublas-cu12==12.4.5.8
171
+ cloudpathlib==0.21.1
172
+ h5py==3.14.0
173
+ thinc==8.3.6
174
+ markdown-it-py==4.0.0
175
+ flash-attn==2.7.3
176
+ prompt_toolkit==3.0.52
177
+ nvidia-nvtx-cu12==12.4.127
178
+ en_core_web_sm==3.8.0
179
+ xxhash==3.5.0
180
+ anyio==4.10.0
181
+ joblib==1.5.1
182
+ pydantic==2.11.7
183
+ opt_einsum==3.4.0
184
+ dotmap==1.3.30
185
+ language_data==1.3.0
186
+ shellingham==1.5.4
187
+ nvidia-cudnn-cu12==9.1.0.70
188
+ typing_extensions==4.14.1
189
+ libclang==18.1.1
190
+ tabulate==0.9.0
191
+ annotated-types==0.7.0
192
+ jaraco.context==5.3.0
193
+ autocommand==2.2.2
194
+ more-itertools==10.3.0
195
+ tomli==2.0.1
196
+ jaraco.functools==4.0.1
197
+ zipp==3.19.2
198
+ backports.tarfile==1.2.0
199
+ wheel==0.45.1
200
+ platformdirs==4.2.2
201
+ inflect==7.3.1
202
+ typing_extensions==4.12.2
203
+ jaraco.text==3.12.1
204
+ typeguard==4.3.0
205
+ importlib_metadata==8.0.0
206
+ packaging==24.2
207
+ jaraco.collections==5.1.0
tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-09T06:19:20.029854482Z","level":"INFO","msg":"stream: starting","core version":"0.21.0"}
2
+ {"time":"2025-09-09T06:19:20.338868384Z","level":"INFO","msg":"stream: created new stream","id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
3
+ {"time":"2025-09-09T06:19:20.338942945Z","level":"INFO","msg":"stream: started","id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
4
+ {"time":"2025-09-09T06:19:20.338955936Z","level":"INFO","msg":"handler: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
5
+ {"time":"2025-09-09T06:19:20.33900181Z","level":"INFO","msg":"writer: Do: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
6
+ {"time":"2025-09-09T06:19:20.339014387Z","level":"INFO","msg":"sender: started","stream_id":"top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614"}
7
+ {"time":"2025-09-09T16:55:51.461783187Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
8
+ {"time":"2025-09-09T17:52:23.968650788Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
9
+ {"time":"2025-09-09T22:51:18.011409168Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
10
+ {"time":"2025-09-09T22:58:20.165767227Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/zaydzuhri/fla/top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/file_stream\": dial tcp 35.186.228.49:443: connect: connection refused"}
tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Current SDK version is 0.21.0
2
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Configure stats pid to 795439
3
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/.config/wandb/settings
4
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from /home/cvm/flame/wandb/settings
5
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():703] Logging user logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug.log
7
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to exp/top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20250909-0619/wandb/run-20250909_061919-top_transformer-top.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202509090614/logs/debug-internal.log
8
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():830] calling init triggers
9
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():835] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2025-09-09 06:19:19,818 INFO MainThread:795439 [wandb_init.py:init():871] starting backend
12
+ 2025-09-09 06:19:20,025 INFO MainThread:795439 [wandb_init.py:init():874] sending inform_init request
13
+ 2025-09-09 06:19:20,027 INFO MainThread:795439 [wandb_init.py:init():882] backend started and connected
14
+ 2025-09-09 06:19:20,033 INFO MainThread:795439 [wandb_init.py:init():953] updated telemetry
15
+ 2025-09-09 06:19:20,039 INFO MainThread:795439 [wandb_init.py:init():977] communicating run to backend with 90.0 second timeout
16
+ 2025-09-09 06:19:20,682 INFO MainThread:795439 [wandb_init.py:init():1029] starting run threads in backend
17
+ 2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_console_start():2458] atexit reg
18
+ 2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2306] redirect: wrap_raw
19
+ 2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2375] Wrapping output streams.
20
+ 2025-09-09 06:19:20,815 INFO MainThread:795439 [wandb_run.py:_redirect():2398] Redirects installed.
21
+ 2025-09-09 06:19:20,817 INFO MainThread:795439 [wandb_init.py:init():1075] run started, returning control to user process
torchtitan/components/__pycache__/dataloader.cpython-312.pyc ADDED
Binary file (3.79 kB). View file
 
torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc ADDED
Binary file (7.71 kB). View file
 
torchtitan/components/__pycache__/metrics.cpython-312.pyc ADDED
Binary file (19.6 kB). View file
 
torchtitan/components/__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (1.09 kB). View file
 
torchtitan/components/metrics.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import time
9
+ from collections import namedtuple
10
+ from datetime import datetime
11
+ from typing import Any
12
+
13
+ import torch
14
+ from torch.utils.tensorboard import SummaryWriter
15
+ from torchtitan.components.lr_scheduler import LRSchedulersContainer
16
+ from torchtitan.components.optimizer import OptimizersContainer
17
+ from torchtitan.config_manager import JobConfig
18
+ from torchtitan.distributed import ParallelDims
19
+ from torchtitan.tools import utils
20
+ from torchtitan.tools.logging import logger
21
+ from torchtitan.tools.utils import Color, device_module, device_type
22
+
23
# named tuple for passing device memory stats for logging
# (produced by DeviceMemoryMonitor.get_peak_stats below)
DeviceMemStats = namedtuple(
    "DeviceMemStats",
    [
        "max_active_gib",  # peak active memory, in GiB
        "max_active_pct",  # peak active memory as % of device capacity
        "max_reserved_gib",  # peak reserved memory, in GiB
        "max_reserved_pct",  # peak reserved memory as % of device capacity
        "num_alloc_retries",  # allocator retry count from memory_stats
        "num_ooms",  # out-of-memory error count from memory_stats
    ],
)
35
+
36
+
37
class DeviceMemoryMonitor:
    """Track peak memory usage of a single accelerator device.

    Peak statistics accumulate until `reset_peak_stats` is called.
    """

    def __init__(self, device: str = f"{device_type}:0"):
        self.device = torch.device(device)  # device object
        self.device_name = device_module.get_device_name(self.device)
        self.device_index = device_module.current_device()
        self.device_capacity = device_module.get_device_properties(
            self.device
        ).total_memory
        self.device_capacity_gib = self._to_gib(self.device_capacity)

        # Start from a clean slate so the first report reflects this run only.
        device_module.reset_peak_memory_stats()
        device_module.empty_cache()

    def _to_gib(self, memory_in_bytes):
        # NOTE: GiB (gibibyte) is 1024**3 bytes, vs GB which is 1000**3
        return memory_in_bytes / (1024 * 1024 * 1024)

    def _to_pct(self, memory):
        # Fraction of total device capacity, expressed as a percentage.
        return 100 * memory / self.device_capacity

    def get_peak_stats(self):
        """Snapshot peak allocator stats, warning on retries and OOMs."""
        stats = device_module.memory_stats(self.device)

        max_active = stats.get("active_bytes.all.peak", -1)
        max_reserved = stats.get("reserved_bytes.all.peak", -1)
        num_retries = stats.get("num_alloc_retries", -1)
        num_ooms = stats.get("num_ooms", -1)

        if num_retries > 0:
            logger.warning(
                f"{num_retries} {device_type.upper()} memory allocation retries."
            )
        if num_ooms > 0:
            logger.warning(f"{num_ooms} {device_type.upper()} OOM errors thrown.")

        return DeviceMemStats(
            self._to_gib(max_active),
            self._to_pct(max_active),
            self._to_gib(max_reserved),
            self._to_pct(max_reserved),
            num_retries,
            num_ooms,
        )

    def reset_peak_stats(self):
        """Clear the accumulated peak statistics."""
        device_module.reset_peak_memory_stats()
91
+
92
+
93
def build_device_memory_monitor():
    """Create a DeviceMemoryMonitor for the current device and log its capacity."""
    monitor = DeviceMemoryMonitor(device_type)
    logger.info(
        f"{device_type.upper()} capacity: {monitor.device_name} "
        f"with {monitor.device_capacity_gib:.2f}GiB memory"
    )
    return monitor
100
+
101
+
102
class BaseLogger:
    """No-op metric logger used when logging is disabled."""

    def log(self, metrics: dict[str, Any], step: int) -> None:
        """Discard the given metrics."""
        return None

    def close(self) -> None:
        """Nothing to release."""
        return None
110
+
111
+
112
class TensorBoardLogger(BaseLogger):
    """Logger that writes scalar metrics to TensorBoard event files."""

    def __init__(self, log_dir: str, tag: str | None = None):
        self.tag = tag
        self.writer = SummaryWriter(log_dir, max_queue=1000)
        logger.info(f"TensorBoard logging enabled. Logs will be saved at {log_dir}")

    def log(self, metrics: dict[str, Any], step: int) -> None:
        """Write each metric as a scalar, prefixed with the tag if set."""
        prefix = self.tag
        for name, value in metrics.items():
            scalar_tag = name if prefix is None else f"{prefix}/{name}"
            self.writer.add_scalar(scalar_tag, value, step)

    def close(self) -> None:
        """Flush and close the underlying event-file writer."""
        self.writer.close()
127
+
128
+
129
class WandBLogger(BaseLogger):
    """Logger that forwards scalar metrics to Weights & Biases."""

    def __init__(self, log_dir: str, tag: str | None = None):
        # Deferred import keeps wandb off the startup path when unused.
        import wandb

        self.wandb = wandb
        self.tag = tag

        # wandb expects its run directory to exist already.
        os.makedirs(log_dir, exist_ok=True)

        self.wandb.init(
            project=os.getenv("WANDB_PROJECT", "torchtitan"),
            dir=log_dir,
        )
        logger.info("WandB logging enabled")

    def log(self, metrics: dict[str, Any], step: int) -> None:
        """Log all metrics for `step`, prefixing keys with the tag if set."""
        prefixed = {}
        for name, value in metrics.items():
            key = name if self.tag is None else f"{self.tag}/{name}"
            prefixed[key] = value
        self.wandb.log(prefixed, step=step)

    def close(self) -> None:
        # finish() is only meaningful while a run is active.
        if self.wandb.run is not None:
            self.wandb.finish()
158
+
159
+
160
def ensure_pp_loss_visible(
    parallel_dims: ParallelDims, job_config: JobConfig, color: Color
) -> None:
    """
    Ensures that the loss is visible on the console for pipeline-parallel training.

    With pipeline parallelism the loss only materializes on the last stage, so
    that stage's first rank must appear in the LOG_RANK environment variable for
    the loss to show up on the console; warn if it does not.
    """
    # V Block Schedules return loss on rank 0, which is always logged.
    if job_config.parallelism.pipeline_parallel_schedule == "ZBVZeroBubble":
        return

    # First rank of the last pipeline stage is where the loss lives.
    world_size = parallel_dims.world_size
    pp_size = parallel_dims.pp
    loss_visible_rank = (world_size // pp_size) * (pp_size - 1)

    # Ranks explicitly enabled for console logging.
    raw_ranks = os.environ.get("LOG_RANK", "")
    env_logged_ranks = raw_ranks.split(",") if raw_ranks else []

    if str(loss_visible_rank) not in env_logged_ranks:
        logger.warning(
            f"{color.red}Pipeline Parallel loss is not visible. "
            f"Please add {color.yellow}rank {loss_visible_rank}{color.red} "
            f"to LOG_RANK environment variable in run_train.sh.{color.reset}"
        )
191
+
192
+
193
def _get_metrics_rank(
    parallel_dims: ParallelDims,
    job_config: JobConfig,
) -> int:
    """
    Determine which global rank should log metrics.

    Returns:
        int: 0 when pipeline parallelism is disabled; 0 for the
        'ZBVZeroBubble' schedule (loss surfaces on rank 0); otherwise the
        first rank of the last pipeline stage.
    """
    if not parallel_dims.pp_enabled:
        return 0

    # V Block Schedules return loss on rank 0
    if job_config.parallelism.pipeline_parallel_schedule == "ZBVZeroBubble":
        return 0

    # Ranks are laid out stage-major: first rank of the last stage.
    pp_size = parallel_dims.pp
    return (parallel_dims.world_size // pp_size) * (pp_size - 1)
218
+
219
+
220
def _build_metric_logger(
    job_config: JobConfig, parallel_dims: ParallelDims, tag: str | None = None
) -> BaseLogger:
    """
    Build an appropriate metric logger based on configuration.

    Preference order: WandB, then TensorBoard, then a no-op BaseLogger.
    Unless ``metrics.save_for_all_ranks`` is set, only the metrics rank
    (see _get_metrics_rank) gets a real logger.
    """
    metrics_config = job_config.metrics

    # Log initial config state
    logger.debug(
        f"Building logger with config: wandb={metrics_config.enable_wandb}, "
        f"tensorboard={metrics_config.enable_tensorboard}"
    )

    has_logging_enabled = (
        metrics_config.enable_tensorboard or metrics_config.enable_wandb
    )

    # Restrict logging to the metrics rank unless per-rank saving is enabled.
    should_log = has_logging_enabled
    if should_log and not metrics_config.save_for_all_ranks:
        metrics_rank = _get_metrics_rank(parallel_dims, job_config)
        should_log = torch.distributed.get_rank() == metrics_rank

    logger.debug(
        f"Logging decision: has_logging_enabled={has_logging_enabled}, should_log={should_log}"
    )

    if not should_log:
        logger.debug("Returning BaseLogger due to should_log=False")
        return BaseLogger()

    # Timestamped log directory underneath the job dump folder.
    dump_dir = job_config.job.dump_folder
    base_log_dir = os.path.join(
        dump_dir, metrics_config.save_tb_folder, datetime.now().strftime("%Y%m%d-%H%M")
    )

    if metrics_config.save_for_all_ranks:
        base_log_dir = os.path.join(
            base_log_dir, f"rank_{torch.distributed.get_rank()}"
        )

    # Create loggers in priority order; fall through on WandB failure.
    if metrics_config.enable_wandb:
        logger.debug("Attempting to create WandB logger")
        try:
            return WandBLogger(base_log_dir, tag)
        except Exception as e:
            if "No module named 'wandb'" in str(e):
                logger.error(
                    "Failed to create WandB logger: No module named 'wandb'. Please install it using 'pip install wandb'."
                )
            else:
                logger.error(f"Failed to create WandB logger: {e}")

    if metrics_config.enable_tensorboard:
        logger.debug("Creating TensorBoard logger")
        return TensorBoardLogger(base_log_dir, tag)

    logger.debug("No loggers enabled, returning BaseLogger")
    return BaseLogger()
283
+
284
+
285
class MetricsProcessor:
    """Metrics processor that processes the metrics and logs them.

    The current MetricsProcessor logs a colorized summary line to STDOUT and
    the full metric set to TensorBoard or WandB (via the backend chosen in
    ``_build_metric_logger``).

    Args:
        job_config (JobConfig): Job configuration.
        parallel_dims (ParallelDims): Parallel dimensions.
        tag (Optional[str]): Tag to use for TensorBoard or WandB. Defaults to None.
    """

    logger: BaseLogger
    parallel_dims: ParallelDims
    job_config: JobConfig
    device_memory_monitor: DeviceMemoryMonitor
    color: utils.NoColor | utils.Color

    gpu_peak_flops: int
    ntokens_since_last_log: int
    data_loading_times: list[float]
    time_last_log: float

    num_flops_per_token: int
    optimizers: OptimizersContainer | None
    lr_schedulers: LRSchedulersContainer | None

    def __init__(
        self,
        job_config: JobConfig,
        parallel_dims: ParallelDims,
        tag: str | None = None,
    ):
        self.logger = _build_metric_logger(job_config, parallel_dims, tag)
        self.parallel_dims = parallel_dims
        self.job_config = job_config
        self.device_memory_monitor = build_device_memory_monitor()
        # used for colorful printing
        self.color = (
            utils.NoColor()
            if job_config.metrics.disable_color_printing
            else utils.Color()
        )

        self.gpu_peak_flops = utils.get_peak_flops(
            self.device_memory_monitor.device_name
        )
        self.ntokens_since_last_log = 0
        self.data_loading_times = []
        self.time_last_log = time.perf_counter()
        self.device_memory_monitor.reset_peak_stats()

        # These variables have to be set later as they depend on other components or model.
        self.num_flops_per_token = -1
        self.optimizers = None
        self.lr_schedulers = None

    def should_log(self, step: int) -> bool:
        """Whether metrics should be emitted at *step* (always on step 1)."""
        return step == 1 or step % self.job_config.metrics.log_freq == 0

    def log(
        self,
        step: int,
        global_avg_loss: float,
        global_max_loss: float,
        extra_metrics: dict[str, Any] | None = None,
    ):
        """Compute throughput/memory stats since the last call, ship them to
        the backend logger, print a one-line colored summary to STDOUT, and
        reset the rolling counters.

        Raises:
            AssertionError: if ``num_flops_per_token`` was never set.
        """
        assert self.num_flops_per_token > 0, "num_flops_per_token must be set"

        time_delta = time.perf_counter() - self.time_last_log

        # tokens per second per device, abbreviated as tps
        tps = self.ntokens_since_last_log / (
            time_delta * self.parallel_dims.non_data_parallel_size
        )
        # model FLOPS utilization
        # For its definition and calculation, please refer to the PaLM paper:
        # https://arxiv.org/abs/2204.02311
        mfu = 100 * self.num_flops_per_token * tps / self.gpu_peak_flops
        tflops = self.num_flops_per_token * tps / 1e12

        time_end_to_end = time_delta / self.job_config.metrics.log_freq
        time_data_loading = sum(self.data_loading_times) / len(self.data_loading_times)
        time_data_loading_pct = 100 * sum(self.data_loading_times) / time_delta

        device_mem_stats = self.device_memory_monitor.get_peak_stats()

        metrics = {
            "loss_metrics/global_avg_loss": global_avg_loss,
            "loss_metrics/global_max_loss": global_max_loss,
            "throughput(tps)": tps,
            "tflops": tflops,
            "mfu(%)": mfu,
            "time_metrics/end_to_end(s)": time_end_to_end,
            "time_metrics/data_loading(s)": time_data_loading,
            "time_metrics/data_loading(%)": time_data_loading_pct,
            "memory/max_active(GiB)": device_mem_stats.max_active_gib,
            "memory/max_active(%)": device_mem_stats.max_active_pct,
            "memory/max_reserved(GiB)": device_mem_stats.max_reserved_gib,
            "memory/max_reserved(%)": device_mem_stats.max_reserved_pct,
            "memory/num_alloc_retries": device_mem_stats.num_alloc_retries,
            "memory/num_ooms": device_mem_stats.num_ooms,
        }

        if extra_metrics:
            metrics.update(extra_metrics)

        self.logger.log(metrics, step)

        color = self.color
        construct_string = (
            f"{color.red}step: {step:2} "
            f"{color.green}loss: {global_avg_loss:7.4f} "
            f"{color.yellow}memory: {device_mem_stats.max_reserved_gib:5.2f}GiB"
            f"({device_mem_stats.max_reserved_pct:.2f}%) "
            f"{color.blue}tps: {round(tps):,} "
            f"{color.cyan}tflops: {tflops:,.2f} "
            f"{color.magenta}mfu: {mfu:.2f}%{color.reset}"
        )

        if extra_metrics:
            for k, v in extra_metrics.items():
                if "loss" in k:
                    # Bug fix: use removeprefix, not lstrip. lstrip strips a
                    # *character set*, which can also eat leading letters of
                    # the metric name itself (e.g. "loss_metrics/mtp_loss"
                    # would become "p_loss").
                    construct_string += (
                        f" {color.white}{k.removeprefix('loss_metrics/')}: {v:7.4f}"
                    )
        logger.info(construct_string)

        self.ntokens_since_last_log = 0
        self.data_loading_times.clear()
        self.time_last_log = time.perf_counter()
        self.device_memory_monitor.reset_peak_stats()

    def close(self):
        """Release the underlying metric logger backend."""
        self.logger.close()
420
+
421
+
422
def build_metrics_processor(
    job_config: JobConfig, parallel_dims: ParallelDims, tag: str | None = None
) -> MetricsProcessor:
    """Factory for :class:`MetricsProcessor`.

    Args:
        job_config (JobConfig): Job configuration.
        parallel_dims (ParallelDims): Parallel dimensions.
        tag (Optional[str]): Tag to use for TensorBoard or WandB. Defaults to None.

    Returns:
        MetricsProcessor: A freshly constructed metrics processor.
    """
    return MetricsProcessor(job_config, parallel_dims, tag)
torchtitan/experiments/deepseek_v3/LICENSE-CODE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 DeepSeek
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
torchtitan/experiments/deepseek_v3/README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Running DeepSeek in Titan (experimental)
2
+
3
+ This folder contains a DeepSeek model supporting v2 and v3 as well as kernels
4
+ and scripts needed to run it.
5
+
6
+ ## Inference
7
+
8
+ ### Prerequisites:
9
+
10
+ You will need to download a DeepSeek model's weights if you want to run a
11
+ pre-trained checkpoint. We provided a script to download the weights from
12
+ HuggingFace Model Hub:
13
+ ```bash
14
+ python download.py [vX]
15
+ ```
16
+ where `vX` can be v2 or v3, both are supported. You may be required to create a
17
+ HuggingFace account and log in first.
18
+
19
+ ### Running inference:
20
+
21
+ The inference script is in `generate.py`. You can run it with the following
22
+ command:
23
+ ```bash
24
+ torchrun --standalone --nproc-per-node 4 generate.py
25
+ ```
26
+ This will run inference on the `DeepSeek-V2-Lite-Chat` model using 4 GPUs by
27
+ default.
28
+
29
+ Alternatively, you can run inference by using `bash inference.sh`, optionally
30
+ followed by your prompt.
31
+
32
+ ## Training
33
+
34
+ The training script is in `train.py`. You can run it with the following command:
35
+ ```bash
36
+ torchrun --standalone --nproc-per-node 8 train.py
37
+ ```
38
+
39
+ This will run training on the `DeepSeek-V2-Lite-Chat` model using 8 GPUs by
40
+ default, with pipeline parallel, expert parallel, and data parallel enabled.
torchtitan/experiments/deepseek_v3/checkpoint.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import json
8
+ import logging
9
+ import os
10
+ from typing import Dict, Optional, Set, Tuple
11
+
12
+ import torch
13
+ from safetensors import safe_open
14
+
15
+ from transformers.utils import cached_file
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ _DEFAULT_SAFETENSOR_FILE_NAME = "model.safetensors.index.json"
21
+
22
+
23
def read_weights_from_json(file_path: str) -> Optional[Dict[str, str]]:
    """Return the 'weight_map' dict from a safetensors index JSON, or None.

    Any read/parse failure is logged and reported as None (best-effort); the
    redundant ``(json.JSONDecodeError, Exception)`` tuple was collapsed to
    ``Exception`` (JSONDecodeError is already a subclass).
    """
    try:
        with open(file_path, "r") as file:
            data = json.load(file)

        if "weight_map" in data and isinstance(data["weight_map"], dict):
            return data["weight_map"]
        logger.info("No 'weight_map' dictionary found in the JSON file.")
        return None
    except Exception as e:
        # Broad on purpose: a missing/corrupt index should not crash callers.
        logger.info(f"An error occurred while reading the JSON file: {str(e)}")
        return None
36
+
37
+
38
def get_hf_weight_map_and_path(
    model_id: str,
) -> Tuple[Dict[str, str], str]:
    """Get the weight map for a given HF model id and also the cache path for loading the weights."""
    try:
        index_file = cached_file(model_id, _DEFAULT_SAFETENSOR_FILE_NAME)
    except Exception:
        # Fixed: the suggested command was missing its closing backtick.
        logger.error(
            f"Model `{model_id}` not found in HF cache. "
            f"You can download the model using `python download.py {model_id}`"
        )
        # Bare raise preserves the original traceback.
        raise

    weight_map = read_weights_from_json(index_file)
    weight_path = os.path.dirname(index_file)
    logger.info(f"Loading weights from: {weight_path}")
    return weight_map, weight_path
55
+
56
+
57
def get_needed_files(
    state_dict: Dict[str, torch.Tensor], weight_map: Dict[str, str]
) -> Set[str]:
    """Return the set of checkpoint shard files holding the params in *state_dict*.

    Raises:
        ValueError: if a weight parameter has no entry in *weight_map*.
    """
    needed_files: Set[str] = set()
    for param_name in state_dict:
        shard_file = weight_map.get(param_name)
        if shard_file:
            needed_files.add(shard_file)
        elif param_name.endswith("weight"):
            # Non-weight entries may legitimately be absent from the map.
            raise ValueError(
                f"Parameter {param_name} not found in weight map, please check..."
            )
    logger.info(f"Needed files: {needed_files}")
    return needed_files
71
+
72
+
73
def load_safetensor_file(
    full_path: str, device: torch.device
) -> Dict[str, torch.Tensor]:
    """Read every tensor from one safetensors file onto *device*."""
    with safe_open(full_path, framework="pt", device=device) as f:
        tensors = {name: f.get_tensor(name) for name in f.keys()}
    logger.info(f"Loaded {len(tensors)} tensors from {full_path}")
    return tensors
82
+
83
+
84
def load_safetensor_weights(
    model: torch.nn.Module,
    weight_map: Dict[str, str],
    file_location: str,
    device: torch.device,
):
    """
    Load safetensor weights into a `nn.Module`.

    Args:
        model (Module): The PyTorch module to load weights into. It may be a
            model chunk or a full model.
        weight_map (Dict[str, str]): Mapping of model parameters to file names.
        file_location (str): Directory containing the weight files.
        device (torch.device): The device to load tensors onto.

    Raises:
        ValueError: on a shape mismatch between model and checkpoint.
        RuntimeError: if some parameters were never found in any shard.
    """
    model_state_dict = model.state_dict()
    needed_files = get_needed_files(model_state_dict, weight_map)
    updated_states: Set[str] = set()

    for file in needed_files:
        full_path = os.path.join(file_location, file)
        try:
            checkpoint = load_safetensor_file(full_path, "cpu")
        except FileNotFoundError:
            logger.error(f"File not found: {full_path}")
            # Bug fix: skip this file — otherwise `checkpoint` below is
            # unbound on the first iteration (NameError) or stale from the
            # previous one (silently re-applies the wrong shard).
            continue
        except Exception as e:
            logger.error(f"Error during checkpoint processing of {full_path}: {str(e)}")
            continue

        matched_keys = set(checkpoint.keys()) & set(model_state_dict.keys())
        for key in matched_keys:
            # Check shape
            if model_state_dict[key].shape != checkpoint[key].shape:
                raise ValueError(
                    f"Shape mismatch for {key}: "
                    f"model needs {model_state_dict[key].shape}, but "
                    f"checkpoint has {checkpoint[key].shape}"
                )
            model_state_dict[key] = checkpoint[key].to(device)

        updated_states.update(matched_keys)

    # A skipped/failed shard surfaces here as missing parameters.
    missing_keys = set(model_state_dict.keys()) - updated_states
    if missing_keys:
        raise RuntimeError(
            f"Partially updated state dict. Missing parameters: {missing_keys}"
        )

    model.load_state_dict(model_state_dict, strict=False, assign=True)
    logger.info(f"Successfully loaded {len(updated_states)} weights into model")
134
+
135
+
136
def load_weights_from_hf(
    model: torch.nn.Module,
    distribution: str,
    device: torch.device,
):
    """
    Load the weights from Hugging Face format (index file + multiple safetensor
    files), and fill into `model`. Model config is needed b/c we permute
    wq and wk weights based on attn heads.
    """
    weight_map, weight_path = get_hf_weight_map_and_path(distribution)
    load_safetensor_weights(model, weight_map, weight_path, device)
torchtitan/experiments/deepseek_v3/download.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Usage:
8
+ # Downloads a given model to the HF Cache. Pass in a listed option ala "v3" or your own custom model path.
9
+ # python download.py {model_id} [custom_model_path]
10
+ # Examples:
11
+ # python download.py v2 # Use predefined model: deepseek-ai/DeepSeek-V2
12
+ # python download.py custom "deepseek-ai/new-model" # Download a custom model path
13
+
14
+ # Available models:
15
+ # "v2-lite-chat": "deepseek-ai/DeepSeek-V2-Lite-Chat",
16
+ # "v2-lite": "deepseek-ai/DeepSeek-V2-Lite",
17
+ # "v2": "deepseek-ai/DeepSeek-V2",
18
+ # "v3": "deepseek-ai/deepseek-v3",
19
+ # "v3-0324": "deepseek-ai/DeepSeek-V3-0324",
20
+ # "custom": None, # Placeholder for custom models
21
+
22
+
23
+ import sys
24
+
25
+ from transformers import AutoModelForCausalLM
26
+
27
+
28
# Predefined model repos keyed by short version alias; "custom" is a
# placeholder resolved from a user-supplied path on the command line.
MODELS = {
    "v2-lite-chat": "deepseek-ai/DeepSeek-V2-Lite-Chat",
    "v2-lite": "deepseek-ai/DeepSeek-V2-Lite",
    "v2": "deepseek-ai/DeepSeek-V2",
    "v3": "deepseek-ai/deepseek-v3",
    "v3-0324": "deepseek-ai/DeepSeek-V3-0324",
    "custom": None,  # For custom (any) models
}
36
+
37
+
38
def print_usage():
    """Print CLI help for download.py, then exit with status 1."""
    print("Usage:")
    print(" python download.py [model_version]")
    print(" python download.py custom [custom_model_path]")
    print("\nAvailable predefined models:")
    for alias, repo in MODELS.items():
        if alias != "custom":  # Skip the custom placeholder
            print(f" {alias}: {repo}")
    print("\nFor custom models:")
    print(" custom: Specify your own model path")
    print(' Example: python download.py custom "organization/model-name"')
    sys.exit(1)
50
+
51
+
52
# Process command line arguments
# First positional argument must be one of the MODELS keys.
if len(sys.argv) < 2 or sys.argv[1] not in MODELS:
    print_usage()

# "custom" requires an explicit model path as the second argument;
# otherwise resolve the predefined repo id from the alias.
if sys.argv[1] == "custom":
    if len(sys.argv) != 3:
        print("Error: Custom model requires a model path")
        print_usage()
    model_id = sys.argv[2]
    print(f"Using custom model: {model_id}")
else:
    model_id = MODELS[sys.argv[1]]
    print(f"Downloading model: {model_id}")

# Downloading via from_pretrained populates the HF cache as a side effect.
# NOTE(review): trust_remote_code executes repo-provided code — only safe
# for trusted model repos.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
)
torchtitan/experiments/deepseek_v3/model.py ADDED
@@ -0,0 +1,1325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This code is based on model definition of `deepseek-ai/DeepSeek-V3-Base` on
8
+ # Hugging Face Model Hub. Url:
9
+ # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/modeling_deepseek.py
10
+ # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/resolve/main/configuration_deepseek.py
11
+ #
12
+ # It has been modified from its original forms to accommodate naming convention
13
+ # and usage patterns of the TorchTitan project.
14
+
15
+ # Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
16
+ #
17
+ # Licensed under the Apache License, Version 2.0 (the "License");
18
+ # you may not use this file except in compliance with the License.
19
+ # You may obtain a copy of the License at
20
+ #
21
+ # http://www.apache.org/licenses/LICENSE-2.0
22
+ #
23
+ # Unless required by applicable law or agreed to in writing, software
24
+ # distributed under the License is distributed on an "AS IS" BASIS,
25
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26
+ # See the License for the specific language governing permissions and
27
+ # limitations under the License.
28
+ """ PyTorch DeepSeek model."""
29
+ import math
30
+ from typing import Optional, Tuple
31
+
32
+ import torch
33
+ import torch.distributed as dist
34
+
35
+ import torch.distributed._symmetric_memory as symm_mem
36
+ import torch.nn.functional as F
37
+ import torch.utils.checkpoint
38
+
39
+ from attn_mask_utils import _prepare_4d_causal_attention_mask
40
+ from indices import generate_permute_indices
41
+ from model_config import ModelArgs
42
+ from symm_mem_recipes import OnDeviceAllToAllV
43
+ from torch import nn
44
+ from torch.distributed._functional_collectives import all_to_all_single_autograd
45
+
46
+ from torchtitan.experiments.kernels.triton_mg_group_gemm.torchao_pr import (
47
+ ALIGN_SIZE_M,
48
+ grouped_gemm_forward,
49
+ )
50
+
51
# Get model parallel subgroup by name:
# e.g. "pp", "ep", None
def get_group(dim_name: Optional[str] = None) -> dist.ProcessGroup:
    """Return the process group for *dim_name* on the current device mesh."""
    current_mesh = torch.distributed.device_mesh._mesh_resources.get_current_mesh()
    return current_mesh.get_group(dim_name)
56
+
57
+
58
class RMSNorm(nn.Module):
    """Root-mean-square layer norm (no mean subtraction), computed in fp32."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Normalize in float32 for numerical stability, then cast back.
        orig_dtype = hidden_states.dtype
        x = hidden_states.to(torch.float32)
        mean_square = x.pow(2).mean(-1, keepdim=True)
        normed = x * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normed.to(orig_dtype)
70
+
71
+
72
class RotaryEmbedding(nn.Module):
    """Rotary position embedding with a lazily-grown cos/sin cache."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        exponents = torch.arange(0, self.dim, 2).float().to(device) / self.dim
        inv_freq = 1.0 / (self.base**exponents)
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        positions = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(positions, self.inv_freq.to(positions.device))
        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        # Grow the cache if the request exceeds what was precomputed.
        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
112
+
113
+
114
class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        # Must be set before super().__init__, which builds the first cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        positions = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        # Linear scaling: compress positions by the scaling factor.
        positions = positions / self.scaling_factor

        freqs = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
140
+
141
+
142
# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        # Must be set before super().__init__, which builds the first cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # Beyond the trained context: enlarge the base (NTK-aware) so
            # low frequencies stretch to cover the longer sequence.
            scaled_base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                scaled_base
                ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(positions, self.inv_freq)
        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
179
+
180
+
181
# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    """Dimension index at which a RoPE frequency completes *num_rotations*
    rotations over the maximum position range (inverse of the freq formula)."""
    numerator = dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
    return numerator / (2 * math.log(base))
188
+
189
+
190
# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    """Return (low, high) dimension indices bracketing the rotation counts."""
    low_dim = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    high_dim = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    # Clamp values just in case
    return max(math.floor(low_dim), 0), min(math.ceil(high_dim), dim - 1)
201
+
202
+
203
def yarn_get_mscale(scale=1, mscale=1):
    """Attention-magnitude correction factor for YaRN; identity when the
    context is not being extended (scale <= 1)."""
    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
208
+
209
def yarn_linear_ramp_mask(min, max, dim):
    """Length-`dim` float32 ramp: 0 at index `min`, rising linearly to 1 at
    index `max`, clamped to [0, 1] outside that range.

    NOTE: `min`/`max` shadow the builtins; kept to preserve the public
    signature for keyword callers.
    """
    if min == max:
        max += 0.001  # Prevent singularity

    ramp = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
    return torch.clamp(ramp, 0, 1)
217
+
218
class YarnRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding with YaRN scaling.

    Blends unscaled ("extrapolated") frequencies with interpolated ones over
    a per-dimension ramp, and rescales cos/sin by an attention mscale factor.
    """

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        head_dim = self.dim

        exponents = (
            torch.arange(0, head_dim, 2, dtype=torch.float32, device=device) / head_dim
        )
        # Unscaled (extrapolation) vs. position-interpolated frequencies.
        freq_extra = 1.0 / self.base**exponents
        freq_inter = 1.0 / (self.scaling_factor * self.base**exponents)

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            head_dim,
            self.base,
            self.original_max_position_embeddings,
        )
        # Mask is 1 where extrapolation should dominate, 0 where interpolation
        # should, with a linear blend in between.
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, head_dim // 2).to(
            device=device, dtype=torch.float32
        )
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        positions = torch.arange(seq_len, device=device, dtype=torch.float32)
        angles = torch.outer(positions, inv_freq)

        # Attention-scale correction ratio between the two mscale settings.
        _mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((angles, angles), dim=-1)
        self.register_buffer(
            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
        )
284
+
285
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
292
+
293
# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            Position indices of the tokens, indexed into the cos/sin caches
            (can be offset when working with a KV-cache).
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which cos[position_ids] / sin[position_ids] are
            unsqueezed so they broadcast against q and k. Use 1 for
            [batch, heads, seq, head_dim] tensors, 2 for
            [batch, seq, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated
        using the Rotary Position Embedding.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)

    def _deinterleave(t):
        # Reorder interleaved even/odd pairs into a first-half/second-half
        # layout so the rotate_half convention applies.
        b, h, s, d = t.shape
        return t.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    q = _deinterleave(q)
    k = _deinterleave(k)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
328
+
329
class MLP(nn.Module):
    """Gated (SwiGLU-style) feed-forward block: down(act(gate(x)) * up(x))."""

    # Shared activation instance; also referenced elsewhere as `MLP.act_fn`.
    act_fn = nn.SiLU()

    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        # Explicit sizes override the config defaults (used for MoE experts).
        self.hidden_size = hidden_size if hidden_size is not None else config.hidden_size
        self.intermediate_size = (
            intermediate_size
            if intermediate_size is not None
            else config.intermediate_size
        )

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)

    def forward(self, x):
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
348
+
349
class MoEGate(nn.Module):
    """Router that scores every expert per token and selects the top-k.

    Supports sigmoid/softmax scoring and two selection methods:
    plain "greedy" top-k, or DeepSeek-V3-style "noaux_tc" group-limited
    selection with a learned per-expert correction bias.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        self.seq_aux = config.seq_aux
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        if self.topk_method == "noaux_tc":
            # torch.rand (not torch.empty) so runs without real weights still
            # get a non-degenerate routing distribution.
            self.e_score_correction_bias = nn.Parameter(
                torch.rand((self.n_routed_experts))
            )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        bsz, seq_len, hidden_dim = hidden_states.shape
        n_tokens = bsz * seq_len

        # Gating logits for every (token, expert) pair, computed in fp32 for
        # numerical stability.
        flat = hidden_states.view(-1, hidden_dim)
        logits = F.linear(
            flat.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "sigmoid":
            scores = logits.sigmoid()
        elif self.scoring_func == "softmax":
            scores = logits.softmax(dim=-1, dtype=torch.float32)
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        # Select top-k experts.
        if self.topk_method == "noaux_tc":
            # Bias only influences *selection*; final weights use raw scores.
            scores_for_choice = scores.view(
                n_tokens, -1
            ) + self.e_score_correction_bias.unsqueeze(0)
            # Each group is scored by the sum of its two best experts.
            group_scores = (
                scores_for_choice.view(n_tokens, self.n_group, -1)
                .topk(2, dim=-1)[0]
                .sum(dim=-1)
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[1]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            # Expand the group mask down to individual experts.
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(n_tokens, self.n_group, self.n_routed_experts // self.n_group)
                .reshape(n_tokens, -1)
            )  # [n, e]
            masked_scores = scores_for_choice.masked_fill(
                ~score_mask.bool(), 0.0
            )  # [n, e]
            _, topk_idx = torch.topk(masked_scores, k=self.top_k, dim=-1, sorted=False)
            topk_weight = scores.gather(1, topk_idx)
        elif self.topk_method == "greedy":
            topk_weight, topk_idx = torch.topk(
                scores, k=self.top_k, dim=-1, sorted=False
            )
        else:
            raise NotImplementedError(
                f"insupportable TopK function for MoE gating: {self.topk_method}"
            )

        # Normalize the selected gate weights to sum to 1, then rescale.
        if self.top_k > 1 and self.norm_topk_prob:
            topk_weight = topk_weight / (
                topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            )
        topk_weight = (
            topk_weight * self.routed_scaling_factor
        )  # must multiply the scaling factor

        return topk_idx, topk_weight
445
+
446
class MoE(nn.Module):
    """
    A mixed expert module containing shared experts.

    Routed experts are sharded across an expert-parallel (EP) process group;
    each rank owns `n_routed_experts // ep_size` experts and tokens are
    shuffled between ranks with all-to-all collectives before/after expert
    computation.
    """

    # Class attributes:
    # Two shuffle method supported:
    # 1. "torch_all_to_all"
    # 2. "symm_mem" (see `setup_symm_mem` below)
    shuffle_method = "torch_all_to_all"

    # Symmetric memory buffers shared by all MoE instances across layers
    token_send_buf: Optional[torch.Tensor] = None
    token_gather_buf: Optional[torch.Tensor] = None

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        # ep_size is the number of ranks in expert dimension
        if config.ep_size <= 1:
            raise ValueError(
                "For code simplicity, this model only supports distributed experts, "
                "thus EP size must be > 1, please modify your model config"
            )
        self.ep_group = get_group("ep")
        assert config.ep_size == self.ep_group.size()
        self.ep_size = config.ep_size
        self.ep_rank = self.ep_group.rank()
        self.experts_per_rank = config.n_routed_experts // config.ep_size
        # Use ModuleDict instead of ModuleList to preserve absolute expert
        # IDs while avoiding `None` experts. The absolute expert IDs match
        # with checkpoint FQNs.
        self.experts = nn.ModuleDict()
        for i in range(self.experts_per_rank):
            abs_expert_id = self.ep_rank * self.experts_per_rank + i
            self.experts[str(abs_expert_id)] = MLP(
                config, intermediate_size=config.moe_intermediate_size
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = MLP(
                config=config, intermediate_size=intermediate_size
            )

    def combine_experts(self, submod_name):
        """Concatenate the `submod_name` weights of all local experts into a
        single parameter (for grouped GEMM), detaching them from the MLPs."""
        all_weights = []
        for expert in self.experts.values():
            lin = expert.get_submodule(submod_name)
            all_weights.append(lin.weight)
            lin.weight = None

        concat_weight = torch.cat(all_weights)
        self.register_parameter(f"{submod_name}_weight", nn.Parameter(concat_weight))

    # This function is used to create a symm mem buffer for MoE's. It is for
    # shuffling tokens fully "on-device", as compared to traditional torch
    # all_to_all APIs which require a GPU-to-CPU sync of the splits. If a user
    # calls this function, the `shuffle_method` would switch from
    # `torch_all_to_all` to `symm_mem`.
    def setup_symm_mem(self, dtype: torch.dtype, device: torch.device):
        """Enable the symmetric-memory shuffle path and allocate the shared
        send/gather buffers (once per process, shared across layers)."""
        # Switch shuffle method
        self.shuffle_method = "symm_mem"

        # Combine expert weights
        print("Combining expert weights for Group GEMM")
        self.combine_experts("gate_proj")
        self.combine_experts("up_proj")
        self.combine_experts("down_proj")

        # Assuming worst case, 2x tokens are routed to one EP rank
        overflow = 2
        OnDeviceAllToAllV.max_output_len = (
            self.config.max_seq_len * self.num_experts_per_tok * overflow
        )

        # Symmetric memory buffers are shared by all MoE instances across
        # layers, we only need to initialize them once
        if MoE.token_send_buf is not None:
            return

        # Input buffer for DP-to-EP shuffle
        MoE.token_send_buf = symm_mem.empty(
            self.config.max_seq_len
            * self.num_experts_per_tok,  # seq len * top k (flattened)
            self.config.hidden_size,  # hidden dim
            dtype=dtype,
            device=device,
        )
        # Input buffer for EP-to-DP shuffle
        MoE.token_gather_buf = symm_mem.empty(
            self.config.max_seq_len
            * self.num_experts_per_tok  # seq len * top k (flattened)
            * overflow,
            self.config.hidden_size,  # hidden dim
            dtype=dtype,
            device=device,
        )
        print(f"EP rank [{self.ep_rank}]: Created Symmetric Memory for MoE")

    def get_send_buf(self):
        """Return the shared send buffer, detached from any previous graph."""
        # [Why detach?] During a first forward-backward step, the buffer would
        # be included in a computational graph. In a second step, autograd will
        # return an error saying "Trying to backward through the graph a second
        # time (or directly access saved tensors more than once)". This is
        # because the buffer is still in the graph, and autograd is trying to
        # backward through the graph a second time. To avoid this, we detach the
        # buffer from the graph. `detach()` returns a new tensor, which shares
        # the same storage with the original one.
        self.token_send_buf.grad = None
        return self.token_send_buf.detach()

    def get_gather_buf(self):
        """Return the shared gather buffer, detached from any previous graph."""
        # See [Why detach?] in `get_send_buf`
        self.token_gather_buf.grad = None
        return self.token_gather_buf.detach()

    def forward(self, hidden_states):
        """Route tokens to experts, run them, and add the shared-expert path.

        `hidden_states` is expected to be (batch, seq, hidden); shape is
        restored before returning.
        """
        identity = hidden_states
        orig_shape = hidden_states.shape
        # for each token, select top-k experts, and compute the weight for each expert
        topk_idx, topk_weight = self.gate(hidden_states)
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        if self.shuffle_method == "symm_mem":
            y = self.moe_on_device(hidden_states, topk_idx, topk_weight)
        else:  # "torch_all_to_all"
            y = self.moe_forward(hidden_states, topk_idx, topk_weight)

        y = y.view(*orig_shape)
        if self.config.n_shared_experts is not None:
            y = y + self.shared_experts(identity)
        return y

    def moe_forward(self, x, topk_ids, topk_weight):
        """Per-expert dispatch via all-to-all, loop over local expert MLPs,
        then reverse shuffle and weighted combine."""
        # This part sorts the token indices so that tokens routed to the same expert reside consecutively.
        # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive.
        # Since this is an "artificial" index creation (final outcome being
        # `idxs`), we don't need gradients here.
        with torch.no_grad():
            # [seq_len, n_routed_experts]
            cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts))
            # Fill 1 to the selected experts
            cnts.scatter_(1, topk_ids, 1)
            tokens_per_expert = cnts.sum(dim=0)
            # Token indices for each expert
            idxs = topk_ids.view(-1).argsort()
            sorted_tokens_shape = idxs.shape + x.shape[1:]

        sorted_tokens = x[idxs // topk_ids.shape[1]]
        assert sorted_tokens.shape == sorted_tokens_shape

        # This part exchanges the information about the number of tokens sent
        # and received by each expert. We can understand this information as
        # "side band", which is not part of the actual data. Thus no gradient
        # is needed.
        with torch.no_grad():
            # Sum the tokens over local experts, then we get tokens per EP rank,
            # which is the input splits
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(
                tokens_per_expert_group, tokens_per_expert, group=self.ep_group
            )
            input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)

        # DP to EP token shuffle. This part needs gradient.
        if self.shuffle_method == "symm_mem":
            # Move input to the `token_send_buf` symm mem
            token_send_buf = self.get_send_buf()
            token_send_buf[: idxs.shape[0]].copy_(sorted_tokens)
            # Note: `out=` avoids copy, but it is not differentiable
            # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]])
            token_gather_buf, output_splits = OnDeviceAllToAllV.apply(
                token_send_buf,
                input_splits,
                self.ep_group,
            )
            with torch.no_grad():
                # Received tokens from all other ranks. TODO: use mask instead
                received = output_splits.sum()
            # TODO: don't use `received`
            gathered_tokens = token_gather_buf[:received]
        else:  # "torch_all_to_all"
            # Prepare input and output splits
            with torch.no_grad():
                output_splits = tokens_per_expert_group.view(self.ep_size, -1).sum(
                    dim=1
                )
            gathered_tokens = all_to_all_single_autograd(
                sorted_tokens,
                output_splits.tolist(),
                input_splits.tolist(),
                self.ep_group,
            )

        # This part prepares a 1D tensor with the same length as
        # `gathered_tokens`. The 1D tensor is filled with local expert IDs which
        # the tokens in `gathered_tokens` are headed for. This part doesn't need
        # gradient.
        with torch.no_grad():
            gatherd_idxs = (
                torch.arange(
                    tokens_per_expert_group.numel(),
                    device=tokens_per_expert_group.device,
                )
                % self.experts_per_rank
            )
            gatherd_idxs = gatherd_idxs.repeat_interleave(tokens_per_expert_group)

        # Prepare buffer for tokens processed by experts
        if self.shuffle_method == "symm_mem":
            # Take necessary space from `token_gather_buf` symm mem because we are
            # going to send them out after expert processing
            processed_tokens = self.get_gather_buf()[: gathered_tokens.shape[0]]
        else:  # "torch_all_to_all"
            processed_tokens = torch.empty_like(gathered_tokens)

        # This part processes the tokens routed to the local experts.
        # TODO: can we use group GEMM here?
        for i, expert in enumerate(self.experts.values()):
            processed_tokens[gatherd_idxs == i] = expert(
                gathered_tokens[gatherd_idxs == i]
            )

        # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle.
        # The input/output splits are just a reverse of the previous shuffle.
        if self.shuffle_method == "symm_mem":
            token_return_buf, _ = OnDeviceAllToAllV.apply(
                processed_tokens,
                output_splits,
                self.ep_group,
            )
            returned_tokens = token_return_buf[: sorted_tokens_shape[0]]
        else:  # "torch_all_to_all"
            returned_tokens = all_to_all_single_autograd(
                processed_tokens,
                input_splits.tolist(),
                output_splits.tolist(),
                self.ep_group,
            )

        # Undo the argsort, then apply the gate weights and sum over top-k.
        output_tokens = torch.empty_like(returned_tokens)
        output_tokens[idxs] = returned_tokens
        final_out = (
            output_tokens.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(returned_tokens.dtype)
        )
        return final_out

    def moe_on_device(self, x, topk_ids, topk_weight):
        """Symmetric-memory variant of `moe_forward`: on-device all-to-all-v
        shuffles plus grouped GEMMs over the combined expert weights."""
        # This part sorts the token indices so that tokens routed to the same expert reside consecutively.
        # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive.
        # Since this is an "artificial" index creation (final outcome being
        # `idxs`), we don't need gradients here.
        with torch.no_grad():
            # [seq_len, n_routed_experts]
            cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts))
            # Fill 1 to the selected experts
            cnts.scatter_(1, topk_ids, 1)
            tokens_per_expert = cnts.sum(dim=0)
            # Token indices for each expert
            idxs = topk_ids.view(-1).argsort()
            sorted_tokens_shape = idxs.shape + x.shape[1:]

        sorted_tokens = x[idxs // topk_ids.shape[1]]
        assert sorted_tokens.shape == sorted_tokens_shape

        # This part exchanges the information about the number of tokens sent
        # and received by each expert. We can understand this information as
        # "side band", which is not part of the actual data. Thus no gradient
        # is needed.
        with torch.no_grad():
            # Sum the tokens over local experts, then we get tokens per EP rank,
            # which is the input splits
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(
                tokens_per_expert_group, tokens_per_expert, group=self.ep_group
            )
            input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)

        # Move input to the `token_send_buf` symm mem
        token_send_buf = self.get_send_buf()
        token_send_buf[: idxs.shape[0]].copy_(sorted_tokens)
        # Note: `out=` avoids copy, but it is not differentiable
        # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]])
        token_gather_buf, output_splits = OnDeviceAllToAllV.apply(
            token_send_buf,
            input_splits,
            self.ep_group,
        )

        # We need to permute the received tokens so that tokens for the same expert are contiguous.
        # This part prepares a 1D tensor `permuted_indices` for such permutation.
        # This part doesn't need gradient.
        with torch.no_grad():
            permuted_indices, m_sizes = generate_permute_indices(
                tokens_per_expert_group,
                self.experts_per_rank,
                self.ep_size,
                token_gather_buf.shape[0],
                ALIGN_SIZE_M,
            )

        # Permute the received tokens so that tokens for the same expert are contiguous.
        contig_tokens = token_gather_buf[permuted_indices]

        # Run the first grouped GEMM
        w1 = self.get_parameter("gate_proj_weight")
        gate_proj = grouped_gemm_forward(contig_tokens, w1, m_sizes)

        # Run the second grouped GEMM
        w3 = self.get_parameter("up_proj_weight")
        up_proj = grouped_gemm_forward(contig_tokens, w3, m_sizes)

        # Apply activation
        hidden_outputs = MLP.act_fn(gate_proj) * up_proj

        # Run the third grouped GEMM
        w2 = self.get_parameter("down_proj_weight")
        hidden_outputs = grouped_gemm_forward(hidden_outputs, w2, m_sizes)

        # Prepare buffer for tokens processed by experts
        # Take necessary space from `token_gather_buf` symm mem because we are
        # going to send them out after expert processing
        processed_tokens = self.get_gather_buf()

        # Move into Symmetric Memory for the return shuffle
        processed_tokens[permuted_indices] = hidden_outputs

        # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle.
        # The input/output splits are just a reverse of the previous shuffle.
        token_return_buf, _ = OnDeviceAllToAllV.apply(
            processed_tokens,
            output_splits,
            self.ep_group,
        )
        returned_tokens = token_return_buf[: sorted_tokens_shape[0]]

        # Undo the argsort, then apply the gate weights and sum over top-k.
        output_tokens = torch.empty_like(returned_tokens)
        output_tokens[idxs] = returned_tokens
        final_out = (
            output_tokens.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(returned_tokens.dtype)
        )
        return final_out
803
+
804
class Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    DeepSeek-style multi-head latent attention: queries and keys/values are
    factored through low-rank ("lora") bottlenecks, and only
    `qk_rope_head_dim` of each head's dimensions carry rotary embeddings.
    """

    def __init__(self, config: ModelArgs, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        # Per-head query/key dim = non-rotary part + rotary part.
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

        self.is_causal = True

        if self.q_lora_rank is None:
            # Full-rank query projection.
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            # Low-rank query path: down-project, RMSNorm, up-project.
            self.q_a_proj = nn.Linear(
                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = RMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(
                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        # Joint projection producing the compressed KV plus the shared
        # (single-head, "mqa") rotary key dims.
        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = RMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )
        self._init_rope()

        self.softmax_scale = self.q_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            # YaRN attention-scale correction folded into the softmax scale.
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.softmax_scale = self.softmax_scale * mscale * mscale

    def _init_rope(self):
        """Build the rotary embedding module per the configured scaling type.

        Raises:
            ValueError: if `rope_scaling["type"]` is not one of
                "linear", "dynamic", or "yarn".
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = RotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                # Forward only the YaRN knobs actually present in the config.
                kwargs = {
                    key: self.config.rope_scaling[key]
                    for key in [
                        "original_max_position_embeddings",
                        "beta_fast",
                        "beta_slow",
                        "mscale",
                        "mscale_all_dim",
                    ]
                    if key in self.config.rope_scaling
                }
                self.rotary_emb = YarnRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute MLA attention over `hidden_states` (batch, seq, hidden).

        Returns the attention output of shape (batch, seq, hidden).
        """
        bsz, q_len, _ = hidden_states.size()

        # Query path: full-rank or low-rank depending on config.
        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        # KV path: split off the shared rotary key dims, expand the rest.
        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        # Only the rotary sub-dimensions receive position information.
        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        # k_pe has a single head and broadcasts across all heads here.
        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if attention_mask is not None:
            # Attention mask was made 4D because the `attn_weights` above is 4D.
            # We probably can make this mask smarter if we want to pack sequences
            # together, instead of using padding. This optimization can be used in
            # inference. For training, if we want to pack sequences, data loader
            # will pass in a mask containing such info.
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,  # None, or user provided mask in 2D
                (bsz, q_len),
                hidden_states,
                0,  # past_key_values_length, 0 when training
            )
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query=query_states,
            key=key_states,
            value=value_states,
            attn_mask=attention_mask,
            # Fix: SDPA is functional and applies dropout unconditionally —
            # only drop during training, matching nn.Dropout semantics.
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=attention_mask is None,
            scale=self.softmax_scale,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
        attn_output = self.o_proj(attn_output)

        return attn_output
991
+
992
class DecoderLayer(nn.Module):
    """One transformer block: pre-norm attention and a (dense or MoE)
    feed-forward, each wrapped in a residual connection."""

    def __init__(self, config: ModelArgs, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Attention(config=config, layer_idx=layer_idx)

        # MoE layers start at `first_k_dense_replace` and recur every
        # `moe_layer_freq` layers; all other layers use a dense MLP.
        use_moe = (
            config.n_routed_experts is not None
            and layer_idx >= config.first_k_dense_replace
            and layer_idx % config.moe_layer_freq == 0
        )
        self.mlp = MoE(config) if use_moe else MLP(config)
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
        """
        # Attention sub-block with pre-norm residual.
        residual = hidden_states
        normed = self.input_layernorm(hidden_states)
        attn_out = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        hidden_states = residual + attn_out

        # Feed-forward sub-block with pre-norm residual.
        residual = hidden_states
        ffn_out = self.mlp(self.post_attention_layernorm(hidden_states))
        return residual + ffn_out
1046
+
1047
+ Deepseek_INPUTS_DOCSTRING = r"""
1048
+ Args:
1049
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1050
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1051
+ it.
1052
+
1053
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1054
+ [`PreTrainedTokenizer.__call__`] for details.
1055
+
1056
+ [What are input IDs?](../glossary#input-ids)
1057
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1058
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1059
+
1060
+ - 1 for tokens that are **not masked**,
1061
+ - 0 for tokens that are **masked**.
1062
+
1063
+ [What are attention masks?](../glossary#attention-mask)
1064
+
1065
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1066
+ [`PreTrainedTokenizer.__call__`] for details.
1067
+
1068
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
1069
+ `past_key_values`).
1070
+
1071
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1072
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1073
+ information on the default strategy.
1074
+
1075
+ - 1 indicates the head is **not masked**,
1076
+ - 0 indicates the head is **masked**.
1077
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1078
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1079
+ config.n_positions - 1]`.
1080
+
1081
+ [What are position IDs?](../glossary#position-ids)
1082
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1083
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1084
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1085
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1086
+
1087
+ Two formats are allowed:
1088
+ - a [`~cache_utils.Cache`] instance;
1089
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1090
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1091
+ cache format.
1092
+
1093
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1094
+ legacy cache format will be returned.
1095
+
1096
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1097
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1098
+ of shape `(batch_size, sequence_length)`.
1099
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1100
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1101
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1102
+ model's internal embedding lookup matrix.
1103
+ use_cache (`bool`, *optional*):
1104
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1105
+ `past_key_values`).
1106
+ output_attentions (`bool`, *optional*):
1107
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1108
+ tensors for more detail.
1109
+ output_hidden_states (`bool`, *optional*):
1110
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1111
+ more detail.
1112
+ return_dict (`bool`, *optional*):
1113
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1114
+ """
1115
+
1116
+
1117
+ class DeepseekModel(torch.nn.Module):
1118
+ """
1119
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DecoderLayer`]
1120
+
1121
+ Args:
1122
+ config: ModelArgs
1123
+ """
1124
+
1125
+ def __init__(self, config: ModelArgs):
1126
+ super().__init__()
1127
+ self.config = config
1128
+ self.padding_idx = config.pad_token_id
1129
+ self.vocab_size = config.vocab_size
1130
+
1131
+ # Creating model parts related to my stage
1132
+ assert (
1133
+ config.stage_idx < config.num_stages
1134
+ ), f"Stage {config.stage_idx} is not in the model"
1135
+ print(f"Creating model stage {config.stage_idx} of {config.num_stages}")
1136
+
1137
+ self.embed_tokens = (
1138
+ nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1139
+ if config.stage_idx == 0
1140
+ else None
1141
+ )
1142
+
1143
+ self.layers = torch.nn.ModuleDict()
1144
+ division = config.num_hidden_layers // config.num_stages
1145
+ residual = config.num_hidden_layers % config.num_stages
1146
+ # Some earlier stages may have 1 more layer than latter stages because
1147
+ # the division may have residual; this is more even than giving the
1148
+ # entire residual to the last stage.
1149
+ layers_per_stage = [
1150
+ division + 1 if stage < residual else division
1151
+ for stage in range(config.num_stages)
1152
+ ]
1153
+ assert sum(layers_per_stage) == config.num_hidden_layers
1154
+ layer_id_start = sum(layers_per_stage[: config.stage_idx])
1155
+ layer_id_end = layer_id_start + layers_per_stage[config.stage_idx]
1156
+ for layer_id in range(layer_id_start, layer_id_end):
1157
+ self.layers[str(layer_id)] = DecoderLayer(config, layer_id)
1158
+
1159
+ self.norm = (
1160
+ RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1161
+ if config.stage_idx == config.num_stages - 1
1162
+ else None
1163
+ )
1164
+
1165
+ # Initialize weights and apply final processing
1166
+ self.apply(self._init_weights)
1167
+
1168
+ def _init_weights(self, module):
1169
+ std = self.config.initializer_range
1170
+ if isinstance(module, nn.Linear):
1171
+ module.weight.data.normal_(mean=0.0, std=std)
1172
+ if module.bias is not None:
1173
+ module.bias.data.zero_()
1174
+ elif isinstance(module, nn.Embedding):
1175
+ module.weight.data.normal_(mean=0.0, std=std)
1176
+ if module.padding_idx is not None:
1177
+ module.weight.data[module.padding_idx].zero_()
1178
+
1179
+ def forward(
1180
+ self,
1181
+ tokens: torch.Tensor,
1182
+ attention_mask: Optional[torch.Tensor] = None,
1183
+ position_ids: Optional[torch.LongTensor] = None,
1184
+ ) -> torch.Tensor:
1185
+ # Embedding
1186
+ hidden_states = (
1187
+ self.embed_tokens(tokens) if self.embed_tokens is not None else tokens
1188
+ )
1189
+
1190
+ # decoder layers
1191
+ for decoder_layer in self.layers.values():
1192
+ hidden_states = decoder_layer(
1193
+ hidden_states,
1194
+ attention_mask=attention_mask,
1195
+ position_ids=position_ids,
1196
+ )
1197
+
1198
+ hidden_states = (
1199
+ self.norm(hidden_states) if self.norm is not None else hidden_states
1200
+ )
1201
+ return hidden_states
1202
+
1203
+
1204
+ class DeepseekForCausalLM(torch.nn.Module):
1205
+ def __init__(self, config):
1206
+ super().__init__()
1207
+ self.model = DeepseekModel(config)
1208
+ self.lm_head = (
1209
+ nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1210
+ if config.stage_idx == config.num_stages - 1
1211
+ else None
1212
+ )
1213
+
1214
+ # Initialize weights and apply final processing
1215
+ # self.post_init()
1216
+
1217
+ def forward(
1218
+ self,
1219
+ tokens: torch.Tensor,
1220
+ attention_mask: Optional[torch.Tensor] = None,
1221
+ position_ids: Optional[torch.LongTensor] = None,
1222
+ ) -> Tuple:
1223
+ r"""
1224
+ Example:
1225
+
1226
+ ```python
1227
+ >>> from transformers import AutoTokenizer, DeepseekForCausalLM
1228
+
1229
+ >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1230
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1231
+
1232
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1233
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1234
+
1235
+ >>> # Generate
1236
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1237
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1238
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1239
+ ```"""
1240
+ hidden_states = self.model(
1241
+ tokens,
1242
+ attention_mask=attention_mask,
1243
+ position_ids=position_ids,
1244
+ )
1245
+
1246
+ logits = (
1247
+ self.lm_head(hidden_states) if self.lm_head is not None else hidden_states
1248
+ )
1249
+ return logits
1250
+
1251
+ def prepare_inputs_for_generation(
1252
+ self,
1253
+ input_ids,
1254
+ past_key_values=None,
1255
+ attention_mask=None,
1256
+ **kwargs,
1257
+ ):
1258
+ if past_key_values is not None:
1259
+ # Assuming isinstance(past_key_values, Cache):
1260
+ cache_length = past_key_values.get_seq_length()
1261
+ past_length = past_key_values.seen_tokens
1262
+ max_cache_length = past_key_values.get_max_length()
1263
+
1264
+ # Keep only the unprocessed tokens:
1265
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1266
+ # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
1267
+ # input)
1268
+ if (
1269
+ attention_mask is not None
1270
+ and attention_mask.shape[1] > input_ids.shape[1]
1271
+ ):
1272
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1273
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1274
+ # input_ids based on the past_length.
1275
+ elif past_length < input_ids.shape[1]:
1276
+ input_ids = input_ids[:, past_length:]
1277
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1278
+
1279
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1280
+ if (
1281
+ max_cache_length is not None
1282
+ and attention_mask is not None
1283
+ and cache_length + input_ids.shape[1] > max_cache_length
1284
+ ):
1285
+ attention_mask = attention_mask[:, -max_cache_length:]
1286
+
1287
+ position_ids = kwargs.get("position_ids", None)
1288
+ if attention_mask is not None and position_ids is None:
1289
+ # create position_ids on the fly for batch generation
1290
+ position_ids = attention_mask.long().cumsum(-1) - 1
1291
+ position_ids.masked_fill_(attention_mask == 0, 1)
1292
+ if past_key_values:
1293
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1294
+
1295
+ model_inputs = {"input_ids": input_ids}
1296
+
1297
+ model_inputs.update(
1298
+ {
1299
+ "position_ids": position_ids,
1300
+ "past_key_values": past_key_values,
1301
+ "use_cache": kwargs.get("use_cache"),
1302
+ "attention_mask": attention_mask,
1303
+ }
1304
+ )
1305
+ return model_inputs
1306
+
1307
+ @staticmethod
1308
+ def _reorder_cache(past_key_values, beam_idx):
1309
+ reordered_past = ()
1310
+ for layer_past in past_key_values:
1311
+ reordered_past += (
1312
+ tuple(
1313
+ past_state.index_select(0, beam_idx.to(past_state.device))
1314
+ for past_state in layer_past
1315
+ ),
1316
+ )
1317
+ return reordered_past
1318
+
1319
+ # Setup Symmetric Memory for MoE token shuffle.
1320
+ # Supports inference currently.
1321
+ def setup_symm_mem(self, dtype: torch.dtype, device: torch.device):
1322
+ for layer in self.model.layers.values():
1323
+ if not isinstance(layer.mlp, MoE):
1324
+ continue
1325
+ layer.mlp.setup_symm_mem(dtype, device)
torchtitan/experiments/deepseek_v3/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers
2
+ accelerate
3
+ torchdata >= 0.8.0
4
+ datasets >= 2.21.0
5
+ tomli >= 1.1.0 ; python_version < "3.11"
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import triton
8
+ import triton.language as tl
9
+
10
+
11
+ @triton.jit
12
+ def get_tid():
13
+ return tl.inline_asm_elementwise(
14
+ """
15
+ mov.u32 $0, %tid.x;
16
+ mov.u32 $1, %tid.y;
17
+ mov.u32 $2, %tid.z;
18
+ """,
19
+ "=r,=r,=r",
20
+ [],
21
+ dtype=(tl.uint32, tl.uint32, tl.uint32),
22
+ is_pure=True,
23
+ pack=1,
24
+ )
25
+
26
+
27
+ @triton.jit
28
+ def get_ntid():
29
+ return tl.inline_asm_elementwise(
30
+ """
31
+ mov.u32 $0, %ntid.x;
32
+ mov.u32 $1, %ntid.y;
33
+ mov.u32 $2, %ntid.z;
34
+ """,
35
+ "=r,=r,=r",
36
+ [],
37
+ dtype=(tl.uint32, tl.uint32, tl.uint32),
38
+ is_pure=True,
39
+ pack=1,
40
+ )
41
+
42
+
43
+ @triton.jit
44
+ def get_flat_tid():
45
+ tid_x, tid_y, tid_z = get_tid()
46
+ ntid_x, ntid_y, _ = get_ntid()
47
+ return tid_z * ntid_y * ntid_x + tid_y * ntid_x + tid_x
48
+
49
+
50
+ @triton.jit
51
+ def get_flat_bid():
52
+ return (
53
+ tl.program_id(2) * tl.num_programs(1) * tl.num_programs(0)
54
+ + tl.program_id(1) * tl.num_programs(0)
55
+ + tl.program_id(0)
56
+ )
57
+
58
+
59
+ @triton.jit
60
+ def sync_threads():
61
+ tl.inline_asm_elementwise(
62
+ "bar.sync 0;", "=r", [], dtype=tl.int32, is_pure=False, pack=1
63
+ )
torchtitan/experiments/flux/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FLUX model in torchtitan
2
+
3
+ ## Overview
4
+
5
+ ## Usage
6
+ First, download the autoencoder model from HuggingFace with your own access token:
7
+ ```bash
8
+ python torchtitan/experiments/flux/scripts/download_autoencoder.py --repo_id black-forest-labs/FLUX.1-dev --ae_path ae.safetensors --hf_token <your_access_token>
9
+ ```
10
+ This step will download the autoencoder model from HuggingFace and save it to the `torchtitan/experiments/flux/assets/autoencoder/ae.safetensors` file.
11
+
12
+ Run the following command to train the model on a single GPU:
13
+ ```bash
14
+ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node=1 torchtitan/experiments/flux/train.py --job.config_file torchtitan/experiments/flux/train_configs/debug_model.toml
15
+ ```
16
+
17
+ ## TODO
18
+ - [ ] Supporting for multiple GPUs is comming soon (FSDP, etc)
19
+ - [ ] Implement test cases in CI for FLUX model. Adding more unit tests for FLUX model (eg, unit test for preprocessor, etc)
20
+ - [ ] More parallesim support (Tensor Parallelism, Context Parallelism, etc)
21
+ - [ ] Support for distributed checkpointing and loading
22
+ - [ ] Implement init_weights() function to initialize the model weights
23
+ - [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function
torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (2.08 kB). View file
 
torchtitan/experiments/flux/dataset/flux_dataset.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import random
9
+ from dataclasses import dataclass
10
+ from typing import Any, Callable, Optional
11
+
12
+ import numpy as np
13
+
14
+ import torch
15
+
16
+ from datasets import Dataset, load_dataset
17
+ from datasets.distributed import split_dataset_by_node
18
+ from PIL import Image
19
+
20
+ from torch.distributed.checkpoint.stateful import Stateful
21
+
22
+ from torch.utils.data import IterableDataset
23
+ from torchtitan.components.dataloader import ParallelAwareDataloader
24
+
25
+ from torchtitan.config_manager import JobConfig
26
+ from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
27
+ from torchtitan.tools.logging import logger
28
+
29
+
30
+ def _process_cc12m_image(
31
+ img: Image.Image,
32
+ output_size: int = 256,
33
+ ) -> Optional[torch.Tensor]:
34
+ """Process CC12M image to the desired size."""
35
+
36
+ width, height = img.size
37
+ # Skip low resolution images
38
+ if width < output_size or height < output_size:
39
+ return None
40
+
41
+ if width >= height:
42
+ # resize height to be equal to output_size, then crop
43
+ new_width, new_height = math.ceil(output_size / height * width), output_size
44
+ img = img.resize((new_width, new_height))
45
+ left = random.randint(0, new_width - output_size)
46
+ resized_img = img.crop((left, 0, left + output_size, output_size))
47
+ else:
48
+ # resize width to be equal to output_size, the crop
49
+ new_width, new_height = (
50
+ output_size,
51
+ math.ceil(output_size / width * height),
52
+ )
53
+ img = img.resize((new_width, new_height))
54
+ lower = random.randint(0, new_width - output_size)
55
+ resized_img = img.crop((0, lower, output_size, lower + output_size))
56
+
57
+ assert resized_img.size[0] == resized_img.size[1] == output_size
58
+
59
+ # Skip grayscale images
60
+ if resized_img.mode == "L":
61
+ return None
62
+
63
+ np_img = np.array(resized_img).transpose((2, 0, 1))
64
+ tensor_img = torch.tensor(np_img).float() / 255.0
65
+
66
+ # NOTE: The following commented code is an alternative way
67
+ # img_transform = transforms.Compose(
68
+ # [
69
+ # transforms.Resize(max(output_size, output_size)),
70
+ # transforms.CenterCrop((output_size, output_size)),
71
+ # transforms.ToTensor(),
72
+ # ]
73
+ # )
74
+ # tensor_img = img_transform(img)
75
+
76
+ return tensor_img
77
+
78
+
79
+ def _flux_data_processor(
80
+ sample: dict[str, Any],
81
+ t5_tokenizer: FluxTokenizer,
82
+ clip_tokenizer: FluxTokenizer,
83
+ output_size: int = 256,
84
+ ) -> dict[str, Any]:
85
+ """
86
+ Preprocess CC12M dataset sample image and text for Flux model.
87
+
88
+ Args:
89
+ sample: A sample from dataset
90
+ t5_encoder: T5 encoder
91
+ clip_encoder: CLIP encoder
92
+ output_size: The output image size
93
+
94
+ """
95
+ img = _process_cc12m_image(sample["jpg"], output_size=output_size)
96
+ t5_tokens = t5_tokenizer.encode(sample["txt"])
97
+ clip_tokens = clip_tokenizer.encode(sample["txt"])
98
+
99
+ return {
100
+ "image": img,
101
+ "clip_tokens": clip_tokens, # type: List[int]
102
+ "t5_tokens": t5_tokens, # type: List[int]
103
+ }
104
+
105
+
106
+ @dataclass
107
+ class TextToImageDatasetConfig:
108
+ path: str
109
+ loader: Callable
110
+ data_processor: Callable
111
+
112
+
113
+ DATASETS = {
114
+ "cc12m": TextToImageDatasetConfig(
115
+ path="pixparse/cc12m-wds",
116
+ loader=lambda path: load_dataset(path, split="train", streaming=True),
117
+ data_processor=_flux_data_processor,
118
+ ),
119
+ }
120
+
121
+
122
+ def _validate_dataset(
123
+ dataset_name: str, dataset_path: Optional[str] = None
124
+ ) -> tuple[str, Callable, Callable]:
125
+ """Validate dataset name and path."""
126
+ if dataset_name not in DATASETS:
127
+ raise ValueError(
128
+ f"Dataset {dataset_name} is not supported. "
129
+ f"Supported datasets are: {list(DATASETS.keys())}"
130
+ )
131
+
132
+ config = DATASETS[dataset_name]
133
+ path = dataset_path or config.path
134
+ logger.info(f"Preparing {dataset_name} dataset from {path}")
135
+ return path, config.loader, config.data_processor
136
+
137
+
138
+ class FluxDataset(IterableDataset, Stateful):
139
+ """Dataset for FLUX text-to-image model.
140
+
141
+ Args:
142
+ dataset_name (str): Name of the dataset.
143
+ dataset_path (str): Path to the dataset.
144
+ model_transform (Transform): Callable that applies model-specific preprocessing to the sample.
145
+ dp_rank (int): Data parallel rank.
146
+ dp_world_size (int): Data parallel world size.
147
+ infinite (bool): Whether to loop over the dataset infinitely.
148
+ """
149
+
150
+ def __init__(
151
+ self,
152
+ dataset_name: str,
153
+ dataset_path: Optional[str],
154
+ t5_tokenizer: FluxTokenizer,
155
+ clip_tokenizer: FluxTokenizer,
156
+ job_config: Optional[JobConfig] = None,
157
+ dp_rank: int = 0,
158
+ dp_world_size: int = 1,
159
+ infinite: bool = False,
160
+ ) -> None:
161
+
162
+ # Force lowercase for consistent comparison
163
+ dataset_name = dataset_name.lower()
164
+
165
+ path, dataset_loader, data_processor = _validate_dataset(
166
+ dataset_name, dataset_path
167
+ )
168
+ ds = dataset_loader(path)
169
+
170
+ self.dataset_name = dataset_name
171
+ self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
172
+
173
+ self._t5_tokenizer = t5_tokenizer
174
+ self._clip_tokenizer = clip_tokenizer
175
+ self._data_processor = data_processor
176
+ self.job_config = job_config
177
+
178
+ self.infinite = infinite
179
+
180
+ # Variables for checkpointing
181
+ self._sample_idx = 0
182
+ self._all_samples: list[dict[str, Any]] = []
183
+
184
+ def _get_data_iter(self):
185
+ if isinstance(self._data, Dataset) and self._sample_idx == len(self._data):
186
+ return iter([])
187
+
188
+ it = iter(self._data)
189
+ for _ in range(self._sample_idx):
190
+ next(it)
191
+ return it
192
+
193
+ def __iter__(self):
194
+ while True:
195
+ for sample in self._get_data_iter():
196
+ # Use the dataset-specific preprocessor
197
+ sample_dict = self._data_processor(
198
+ sample, self._t5_tokenizer, self._clip_tokenizer, output_size=256
199
+ )
200
+
201
+ # skip low quality image or image with color channel = 1
202
+ if sample_dict["image"] is None:
203
+ logger.warning(
204
+ f"Low quality image {sample['__key__']} is skipped in Flux Dataloader"
205
+ )
206
+ continue
207
+
208
+ self._all_samples.extend(sample_dict)
209
+ self._sample_idx += 1
210
+
211
+ labels = sample_dict.pop("image")
212
+ yield sample_dict, labels
213
+
214
+ if not self.infinite:
215
+ logger.warning(f"Dataset {self.dataset_name} has run out of data")
216
+ break
217
+ else:
218
+ # Reset offset for the next iteration
219
+ self._sample_idx = 0
220
+ logger.warning(f"Dataset {self.dataset_name} is being re-looped")
221
+
222
+ def load_state_dict(self, state_dict):
223
+ self._sample_idx = state_dict["sample_idx"]
224
+ self._all_samples = state_dict["all_samples"]
225
+
226
+ def state_dict(self):
227
+ return {
228
+ "all_samples": self._all_samples,
229
+ "sample_idx": self._sample_idx,
230
+ }
231
+
232
+
233
+ def build_flux_dataloader(
234
+ dp_world_size: int,
235
+ dp_rank: int,
236
+ job_config: JobConfig,
237
+ # This parameter is not used, keep it for compatibility
238
+ tokenizer: FluxTokenizer | None,
239
+ infinite: bool = True,
240
+ ) -> ParallelAwareDataloader:
241
+ """Build a data loader for HuggingFace datasets."""
242
+ dataset_name = job_config.training.dataset
243
+ dataset_path = job_config.training.dataset_path
244
+ batch_size = job_config.training.batch_size
245
+
246
+ t5_encoder_name = job_config.encoder.t5_encoder
247
+ clip_encoder_name = job_config.encoder.clip_encoder
248
+ max_t5_encoding_len = job_config.encoder.max_t5_encoding_len
249
+
250
+ ds = FluxDataset(
251
+ dataset_name=dataset_name,
252
+ dataset_path=dataset_path,
253
+ t5_tokenizer=FluxTokenizer(t5_encoder_name, max_length=max_t5_encoding_len),
254
+ clip_tokenizer=FluxTokenizer(
255
+ clip_encoder_name, max_length=77
256
+ ), # fix max_length for CLIP
257
+ dp_rank=dp_rank,
258
+ dp_world_size=dp_world_size,
259
+ infinite=infinite,
260
+ )
261
+
262
+ return ParallelAwareDataloader(
263
+ dataset=ds,
264
+ dp_rank=dp_rank,
265
+ dp_world_size=dp_world_size,
266
+ batch_size=batch_size,
267
+ )
torchtitan/experiments/flux/dataset/tokenizer.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
8
+ # This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
9
+
10
+
11
+ from typing import List
12
+
13
+ from torchtitan.components.tokenizer import Tokenizer
14
+ from transformers import CLIPTokenizer, T5Tokenizer
15
+
16
+
17
+ class FluxTokenizer(Tokenizer):
18
+ """
19
+ Tokenizing and encoding/decoding text using the T5 or Clip tokenizer.
20
+
21
+ Args:
22
+ model_path (str): Path to the tokenzier from hugging face.
23
+
24
+ """
25
+
26
+ def __init__(self, model_path: str = "t5-small", max_length: int = 77):
27
+ super().__init__()
28
+ self._n_words = 8 # TODO(jianiw): check
29
+ self._max_length = max_length
30
+
31
+ self.is_clip = model_path.startswith("openai")
32
+
33
+ if self.is_clip:
34
+ self._tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
35
+ model_path, max_length=max_length
36
+ )
37
+ else:
38
+ self._tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
39
+ model_path, max_length=max_length
40
+ )
41
+
42
+ def encode(
43
+ self,
44
+ s: str,
45
+ ) -> List[int]:
46
+ """
47
+ Encode the prompt text into tokens.
48
+ """
49
+ tokens = self._tokenizer(
50
+ s,
51
+ truncation=True,
52
+ max_length=self._max_length,
53
+ return_length=False,
54
+ return_overflowing_tokens=False,
55
+ padding="max_length",
56
+ return_tensors="pt", # return pytorch tensors, default return List[int]
57
+ )["input_ids"]
58
+ return tokens
59
+
60
+ def decode(self, t: List[int]) -> str:
61
+ """
62
+ Decode function. This function will not be called.
63
+ """
64
+ return self._tokenizer.decode(t)
torchtitan/experiments/flux/model/__pycache__/layers.cpython-312.pyc ADDED
Binary file (17.7 kB). View file
 
torchtitan/experiments/flux/model/hf_embedder.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from torch import nn, Tensor
8
+ from transformers import CLIPTextModel, T5EncoderModel
9
+
10
+
11
+ class FluxEmbedder(nn.Module):
12
+ def __init__(self, version: str, **hf_kwargs):
13
+ super().__init__()
14
+ self.is_clip = version.startswith("openai")
15
+ self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
16
+
17
+ if self.is_clip:
18
+ self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
19
+ version, **hf_kwargs
20
+ )
21
+ else:
22
+ self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
23
+ version, **hf_kwargs
24
+ )
25
+
26
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
27
+
28
+ def forward(self, batch_tokens: Tensor) -> Tensor:
29
+ """
30
+ batch_tokens: [bsz, embedding_length]
31
+
32
+ For T5 Encoder, embeding_length is 768
33
+ For CLIP, embedding_length is 256
34
+ """
35
+ outputs = self.hf_module(
36
+ input_ids=batch_tokens.to(self.hf_module.device),
37
+ attention_mask=None,
38
+ output_hidden_states=False,
39
+ )
40
+ return outputs[self.output_key]
torchtitan/experiments/flux/model/model.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from dataclasses import dataclass, field
8
+
9
+ import torch
10
+
11
+ from torch import nn, Tensor
12
+ from torchtitan.components.tokenizer import Tokenizer
13
+ from torchtitan.config_manager import JobConfig
14
+
15
+ from torchtitan.experiments.flux.model.autoencoder import AutoEncoderParams
16
+ from torchtitan.experiments.flux.model.layers import (
17
+ DoubleStreamBlock,
18
+ EmbedND,
19
+ LastLayer,
20
+ MLPEmbedder,
21
+ SingleStreamBlock,
22
+ timestep_embedding,
23
+ )
24
+
25
+ from torchtitan.protocols.train_spec import BaseModelArgs, ModelProtocol
26
+ from torchtitan.tools.logging import logger
27
+
28
+
29
+ @dataclass
30
+ class FluxModelArgs(BaseModelArgs):
31
+ in_channels: int = 64
32
+ out_channels: int = 64
33
+ vec_in_dim: int = 768
34
+ context_in_dim: int = 512
35
+ hidden_size: int = 3072
36
+ mlp_ratio: float = 4.0
37
+ num_heads: int = 24
38
+ depth: int = 19
39
+ depth_single_blocks: int = 38
40
+ axes_dim: tuple = (16, 56, 56)
41
+ theta: int = 10_000
42
+ qkv_bias: bool = True
43
+ guidance_embed: bool = True
44
+ autoencoder_params: AutoEncoderParams = field(default_factory=AutoEncoderParams)
45
+
46
+ def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
47
+ # context_in_dim is the same as the T5 embedding dimension
48
+ self.context_in_dim = job_config.encoder.max_t5_encoding_len
49
+
50
+ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
51
+ # TODO(jianiw): Add the number of flops for the autoencoder
52
+ nparams = sum(p.numel() for p in model.parameters())
53
+ logger.warning("FLUX model haven't implement get_nparams_and_flops() function")
54
+ return nparams, 1
55
+
56
+
57
+ class FluxModel(nn.Module, ModelProtocol):
58
+ """
59
+ Transformer model for flow matching on sequences.
60
+
61
+ Agrs:
62
+ model_args: FluxModelArgs.
63
+
64
+ Attributes:
65
+ model_args (TransformerModelArgs): Model configuration arguments.
66
+ """
67
+
68
+ def __init__(self, model_args: FluxModelArgs):
69
+ super().__init__()
70
+
71
+ self.model_args = model_args
72
+ self.in_channels = model_args.in_channels
73
+ self.out_channels = model_args.out_channels
74
+ if model_args.hidden_size % model_args.num_heads != 0:
75
+ raise ValueError(
76
+ f"Hidden size {model_args.hidden_size} must be divisible by num_heads {model_args.num_heads}"
77
+ )
78
+ pe_dim = model_args.hidden_size // model_args.num_heads
79
+ if sum(model_args.axes_dim) != pe_dim:
80
+ raise ValueError(
81
+ f"Got {model_args.axes_dim} but expected positional dim {pe_dim}"
82
+ )
83
+ self.hidden_size = model_args.hidden_size
84
+ self.num_heads = model_args.num_heads
85
+ self.pe_embedder = EmbedND(
86
+ dim=pe_dim, theta=model_args.theta, axes_dim=model_args.axes_dim
87
+ )
88
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
89
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
90
+ self.vector_in = MLPEmbedder(model_args.vec_in_dim, self.hidden_size)
91
+ self.guidance_in = (
92
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
93
+ if model_args.guidance_embed
94
+ else nn.Identity()
95
+ )
96
+ self.txt_in = nn.Linear(model_args.context_in_dim, self.hidden_size)
97
+
98
+ self.double_blocks = nn.ModuleList(
99
+ [
100
+ DoubleStreamBlock(
101
+ self.hidden_size,
102
+ self.num_heads,
103
+ mlp_ratio=model_args.mlp_ratio,
104
+ qkv_bias=model_args.qkv_bias,
105
+ )
106
+ for _ in range(model_args.depth)
107
+ ]
108
+ )
109
+
110
+ self.single_blocks = nn.ModuleList(
111
+ [
112
+ SingleStreamBlock(
113
+ self.hidden_size, self.num_heads, mlp_ratio=model_args.mlp_ratio
114
+ )
115
+ for _ in range(model_args.depth_single_blocks)
116
+ ]
117
+ )
118
+
119
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
120
+
121
+ def init_weights(self, buffer_device=None):
122
+ # TODO(jianiw): replace placeholder with real weight init
123
+ for param in self.parameters():
124
+ param.data.uniform_(0, 0.1)
125
+
126
+ def forward(
127
+ self,
128
+ img: Tensor,
129
+ img_ids: Tensor,
130
+ txt: Tensor,
131
+ txt_ids: Tensor,
132
+ timesteps: Tensor,
133
+ y: Tensor,
134
+ guidance: Tensor | None = None,
135
+ ) -> Tensor:
136
+ if img.ndim != 3 or txt.ndim != 3:
137
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
138
+
139
+ # running on sequences img
140
+ img = self.img_in(img)
141
+ vec = self.time_in(timestep_embedding(timesteps, 256))
142
+ if self.model_args.guidance_embed:
143
+ if guidance is None:
144
+ raise ValueError(
145
+ "Didn't get guidance strength for guidance distilled model."
146
+ )
147
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
148
+ vec = vec + self.vector_in(y)
149
+ txt = self.txt_in(txt)
150
+
151
+ ids = torch.cat((txt_ids, img_ids), dim=1)
152
+ pe = self.pe_embedder(ids)
153
+
154
+ for block in self.double_blocks:
155
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
156
+
157
+ img = torch.cat((txt, img), 1)
158
+ for block in self.single_blocks:
159
+ img = block(img, vec=vec, pe=pe)
160
+ img = img[:, txt.shape[1] :, ...]
161
+
162
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
163
+ return img
164
+
165
+ @classmethod
166
+ def from_model_args(cls, model_args: FluxModelArgs) -> "FluxModel":
167
+ """
168
+ Initialize a Flux model from a FluxModelArgs object.
169
+
170
+ Args:
171
+ model_args (FluxModelArgs): Model configuration arguments.
172
+
173
+ Returns:
174
+ FluxModel: FluxModel model.
175
+
176
+ """
177
+ return cls(model_args)
torchtitan/experiments/flux/tests/test_flux_dataloader.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+ from torchtitan.config_manager import JobConfig
10
+ from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
11
+ from torchtitan.tools.profiling import (
12
+ maybe_enable_memory_snapshot,
13
+ maybe_enable_profiling,
14
+ )
15
+
16
+
17
class TestFluxDataLoader:
    """Smoke tests for the Flux dataloader (requires dataset + encoders)."""

    def test_flux_dataloader(self):
        """Iterate a few batches from the cc12m dataloader and check shapes."""
        dataset_name = "cc12m"
        batch_size = 32
        world_size = 4
        rank = 0

        num_steps = 10

        # NOTE(review): appending to sys.argv mutates global state and leaks
        # into any test run afterwards in the same process — confirm isolation.
        path = "torchtitan.experiments.flux.flux_argparser"
        sys.argv.append(f"--experimental.custom_args_module={path}")
        config = JobConfig()
        config.maybe_add_custom_args()
        config.parse_args(
            [
                # Profiling options
                # "--profiling.enable_profiling",
                # "--profiling.profile_freq",
                # "5",
                # "--profiling.enable_memory_snapshot",
                # "--profiling.save_memory_snapshot_folder",
                # "memory_snapshot_flux",
                "--training.dataset",
                dataset_name,
                "--training.batch_size",
                str(batch_size),
                "--encoder.t5_encoder",
                "google/t5-v1_1-small",
                "--encoder.clip_encoder",
                "openai/clip-vit-large-patch14",
                "--encoder.max_t5_encoding_len",
                "512",
            ]
        )

        with maybe_enable_profiling(
            config, global_step=0
        ) as torch_profiler, maybe_enable_memory_snapshot(
            config, global_step=0
        ) as memory_profiler:
            dl = self._build_dataloader(
                config,
                world_size,
                rank,
            )
            dl = iter(dl)

            for i in range(0, num_steps):
                input_data, labels = next(dl)
                print(f"Step {i} image size: {labels.shape}")
                if torch_profiler:
                    torch_profiler.step()
                if memory_profiler:
                    memory_profiler.step()

            # `i` and `input_data`/`labels` below refer to the last loop
            # iteration (loop-variable leakage is intentional here).
            print(len(input_data["clip_tokens"]))
            for k, v in input_data.items():
                print(f"Step {i} {k} value: {type(v), v.shape}")

            assert len(input_data) == 2  # (clip_encodings, t5_encodings)
            assert labels.shape == (batch_size, 3, 256, 256)
            # assert input_data["clip_tokens"].shape[0] == batch_size
            # assert input_data["t5_tokens"].shape == (batch_size, 512, 512)

            if torch_profiler:
                torch_profiler.step()
            if memory_profiler:
                memory_profiler.step(exit_ctx=True)

    def test_preprocess(self):
        # TODO
        pass

    def _build_dataloader(
        self,
        job_config,
        world_size,
        rank,
    ):
        # Helper: builds a non-infinite dataloader for one data-parallel rank.
        # `tokenizer=None` — the Flux dataloader constructs its own tokenizers
        # from the encoder config (presumably; verify in build_flux_dataloader).
        return build_flux_dataloader(
            dp_world_size=world_size,
            dp_rank=rank,
            job_config=job_config,
            tokenizer=None,
            infinite=False,
        )
torchtitan/experiments/flux/tests/test_generate_image.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import os
9
+ import time
10
+ from typing import Callable
11
+
12
+ import torch
13
+ from einops import rearrange
14
+
15
+ from PIL import ExifTags, Image
16
+
17
+ from torch import Tensor
18
+
19
+ from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
20
+
21
+ from torchtitan.experiments.flux.model.autoencoder import (
22
+ AutoEncoder,
23
+ AutoEncoderParams,
24
+ load_ae,
25
+ )
26
+ from torchtitan.experiments.flux.model.hf_embedder import FluxEmbedder
27
+
28
+ from torchtitan.experiments.flux.model.model import FluxModel, FluxModelArgs
29
+ from torchtitan.experiments.flux.utils import (
30
+ create_position_encoding_for_latents,
31
+ generate_noise_latent,
32
+ pack_latents,
33
+ preprocess_flux_data,
34
+ unpack_latents,
35
+ )
36
+
37
+
38
+ def time_shift(mu: float, sigma: float, t: Tensor):
39
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
40
+
41
+
42
+ def get_lin_function(
43
+ x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
44
+ ) -> Callable[[float], float]:
45
+ m = (y2 - y1) / (x2 - x1)
46
+ b = y1 - m * x1
47
+ return lambda x: m * x + b
48
+
49
+
50
+ def get_schedule(
51
+ num_steps: int,
52
+ image_seq_len: int,
53
+ base_shift: float = 0.5,
54
+ max_shift: float = 1.15,
55
+ shift: bool = True,
56
+ ) -> list[float]:
57
+ # extra step for zero
58
+ timesteps = torch.linspace(1, 0, num_steps + 1)
59
+
60
+ # shifting the schedule to favor high timesteps for higher signal images
61
+ if shift:
62
+ # estimate mu based on linear estimation between two points
63
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
64
+ timesteps = time_shift(mu, 1.0, timesteps)
65
+
66
+ return timesteps.tolist()
67
+
68
+
69
+ class TestGenerateImage:
70
+ def test_generate_image(self):
71
+ """
72
+ Run a forward pass of flux model to generate an image.
73
+ """
74
+ name = "flux-dev"
75
+ img_width = 512
76
+ img_height = 512
77
+ seed = None
78
+ prompt = (
79
+ "a photo of a forest with mist swirling around the tree trunks. The word "
80
+ '"FLUX" is painted over it in big, red brush strokes with visible texture'
81
+ )
82
+ device = "cuda"
83
+ num_steps = None
84
+ loop = False
85
+ guidance = 3.5
86
+ output_dir = "output"
87
+ add_sampling_metadata = True
88
+
89
+ prompt = prompt.split("|")
90
+ if len(prompt) == 1:
91
+ prompt = prompt[0]
92
+ additional_prompts = None
93
+ else:
94
+ additional_prompts = prompt[1:]
95
+ prompt = prompt[0]
96
+
97
+ assert not (
98
+ (additional_prompts is not None) and loop
99
+ ), "Do not provide additional prompts and set loop to True"
100
+
101
+ torch_device = torch.device(device)
102
+ if num_steps is None:
103
+ num_steps = 30
104
+
105
+ # allow for packing and conversion to latent space
106
+ img_height = 16 * (img_height // 16)
107
+ img_width = 16 * (img_width // 16)
108
+
109
+ # init all components
110
+ model = FluxModel(FluxModelArgs()).to(device=torch_device, dtype=torch.bfloat16)
111
+
112
+ ae = load_ae(
113
+ ckpt_path="assets/autoencoder/ae.safetensors",
114
+ autoencoder_params=AutoEncoderParams(),
115
+ device=torch_device,
116
+ dtype=torch.bfloat16,
117
+ )
118
+ clip_tokenizer = FluxTokenizer(
119
+ model_path="openai/clip-vit-large-patch14", max_length=77
120
+ )
121
+ t5_tokenizer = FluxTokenizer(model_path="google/t5-v1_1-small", max_length=512)
122
+ clip_encoder = FluxEmbedder(version="openai/clip-vit-large-patch14").to(
123
+ torch_device, dtype=torch.bfloat16
124
+ )
125
+ t5_encoder = FluxEmbedder(version="google/t5-v1_1-small").to(
126
+ torch_device, dtype=torch.bfloat16
127
+ )
128
+
129
+ rng = torch.Generator(device="cpu")
130
+
131
+ if seed is None:
132
+ seed = rng.seed()
133
+ print(f"Generating with seed {seed}:\n{prompt}")
134
+ t0 = time.perf_counter()
135
+ output_name = os.path.join(output_dir, f"img_{seed}.jpg")
136
+
137
+ # Tokenize the prompt, on CPU
138
+ clip_tokens = clip_tokenizer.encode(prompt)
139
+ t5_tokens = t5_tokenizer.encode(prompt)
140
+
141
+ batch = preprocess_flux_data(
142
+ device=torch_device,
143
+ dtype=torch.bfloat16,
144
+ autoencoder=None,
145
+ clip_encoder=clip_encoder,
146
+ t5_encoder=t5_encoder,
147
+ batch={
148
+ "clip_tokens": clip_tokens,
149
+ "t5_tokens": t5_tokens,
150
+ },
151
+ )
152
+
153
+ img = self._generate_images(
154
+ device=torch_device,
155
+ dtype=torch.bfloat16,
156
+ model=model,
157
+ decoder=ae,
158
+ img_width=img_width,
159
+ img_height=img_height,
160
+ denoising_steps=num_steps,
161
+ seed=seed,
162
+ clip_encodings=batch["clip_encodings"],
163
+ t5_encodings=batch["t5_encodings"],
164
+ guidance=guidance,
165
+ )
166
+
167
+ if torch.cuda.is_available():
168
+ torch.cuda.synchronize()
169
+ t1 = time.perf_counter()
170
+
171
+ print(f"Done in {t1 - t0:.1f}s.")
172
+
173
+ self._save_image(name, output_name, img, add_sampling_metadata, prompt)
174
+
175
+ def _generate_images(
176
+ self,
177
+ device: torch.device,
178
+ dtype: torch.dtype,
179
+ model: FluxModel,
180
+ decoder: AutoEncoder,
181
+ # image params:
182
+ img_width: int,
183
+ img_height: int,
184
+ # sampling params:
185
+ denoising_steps: int,
186
+ seed: int,
187
+ clip_encodings: torch.Tensor,
188
+ t5_encodings: torch.Tensor,
189
+ guidance: float = 4.0,
190
+ ):
191
+
192
+ bsz = clip_encodings.shape[0]
193
+ latents = generate_noise_latent(bsz, img_height, img_width, device, dtype, seed)
194
+ _, latent_channels, latent_height, latent_width = latents.shape
195
+
196
+ # create denoising schedule
197
+ timesteps = get_schedule(denoising_steps, latent_channels, shift=True)
198
+
199
+ # create positional encodings
200
+ POSITION_DIM = 3 # constant for Flux flow model
201
+ latent_pos_enc = create_position_encoding_for_latents(
202
+ bsz, latent_height, latent_width, POSITION_DIM
203
+ ).to(latents)
204
+ text_pos_enc = torch.zeros(bsz, t5_encodings.shape[1], POSITION_DIM).to(latents)
205
+
206
+ # convert img-like latents into sequences of patches
207
+ latents = pack_latents(latents)
208
+
209
+ # this is ignored for schnell
210
+ guidance_vec = torch.full((bsz,), guidance, device=device, dtype=dtype)
211
+ for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
212
+ t_vec = torch.full((bsz,), t_curr, dtype=dtype, device=device)
213
+ pred = model(
214
+ img=latents,
215
+ img_ids=latent_pos_enc,
216
+ txt=t5_encodings,
217
+ txt_ids=text_pos_enc,
218
+ y=clip_encodings,
219
+ timesteps=t_vec,
220
+ guidance=guidance_vec,
221
+ )
222
+
223
+ latents = latents + (t_prev - t_curr) * pred
224
+
225
+ # convert sequences of patches into img-like latents
226
+ latents = unpack_latents(latents, latent_height, latent_width)
227
+
228
+ img = decoder.decode(latents)
229
+ return img
230
+
231
+ def _save_image(
232
+ self,
233
+ name: str,
234
+ output_name: str,
235
+ x: torch.Tensor,
236
+ add_sampling_metadata: bool,
237
+ prompt: str,
238
+ ):
239
+ print(f"Saving {output_name}")
240
+ # bring into PIL format and save
241
+ x = x.clamp(-1, 1)
242
+ x = rearrange(x[0], "c h w -> h w c")
243
+
244
+ img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
245
+
246
+ exif_data = Image.Exif()
247
+ exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
248
+ exif_data[ExifTags.Base.Make] = "Black Forest Labs"
249
+ exif_data[ExifTags.Base.Model] = name
250
+ if add_sampling_metadata:
251
+ exif_data[ExifTags.Base.ImageDescription] = prompt
252
+ img.save(output_name, exif=exif_data, quality=95, subsampling=0)
torchtitan/experiments/flux/train_configs/debug_model.toml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ [job]
3
+ dump_folder = "./outputs"
4
+ description = "Flux debug model"
5
+ print_args = false
6
+ use_for_integration_test = true
7
+
8
+ [profiling]
9
+ enable_profiling = false
10
+ save_traces_folder = "profile_trace"
11
+ profile_freq = 10
12
+ enable_memory_snapshot = false
13
+ save_memory_snapshot_folder = "memory_snapshot"
14
+
15
+ [metrics]
16
+ log_freq = 1
17
+ disable_color_printing = false
18
+ enable_tensorboard = false
19
+ save_tb_folder = "tb"
20
+ enable_wandb = false
21
+
22
+ [model]
23
+ name = "flux"
24
+ flavor = "flux-debug"
25
+ norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
26
+ # test tokenizer.model, for debug purpose only
27
+ # tokenizer_path = "./tests/assets/test_tiktoken.model"
28
+ # converters = "float8"
29
+
30
+
31
+ [optimizer]
32
+ name = "AdamW"
33
+ lr = 8e-4
34
+ eps = 1e-8
35
+
36
+ [lr_scheduler]
37
+ warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
38
+ decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps
39
+ decay_type = "linear"
40
+ lr_min = 0.0
41
+
42
+ [training]
43
+ batch_size = 32
44
+ seq_len = 512
45
+ max_norm = 1.0 # grad norm clipping
46
+ steps = 10
47
+ compile = false
48
+ dataset = "cc12m"
49
+ guidance = 3.5
50
+ seed = 0
51
+
52
+ [encoder]
53
+ t5_encoder="google/t5-v1_1-small"
54
+ clip_encoder="openai/clip-vit-large-patch14"
55
+ max_t5_encoding_len=512
56
+ auto_encoder_path="torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image
57
+
58
+ [parallelism]
59
+ data_parallel_replicate_degree = 1
60
+ data_parallel_shard_degree = 1
61
+ fsdp_reshard_after_forward = "default" # default / never / always
62
+ tensor_parallel_degree = 1
63
+ enable_async_tensor_parallel = false
64
+ pipeline_parallel_degree = 1
65
+ context_parallel_degree = 1
66
+
67
+ [experimental]
68
+ custom_args_module = "torchtitan.experiments.flux.flux_argparser"
torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
8
+ # All rights reserved.
9
+ #
10
+ # Benchmark comparing reference PyTorch vs optimized M*G group GEMM implementation
11
+
12
+ import argparse
13
+ import logging
14
+ import time
15
+
16
+ # from typing import Dict, List, Optional, Tuple
17
+
18
+ import matplotlib.pyplot as plt
19
+ import numpy as np
20
+ import torch
21
+ import triton
22
+
23
+ # import triton.language as tl
24
+
25
+ # Configure logging
26
+ logging.basicConfig(
27
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
28
+ )
29
+
30
+ # Try to import the optimized implementations
31
+ try:
32
+ from torchao_pr.mg_grouped_gemm import grouped_gemm_forward
33
+
34
+ except ImportError:
35
+ logging.error(
36
+ "Error importing MG grouped GEMM modules. Make sure the implementation files are in the correct path."
37
+ )
38
+ raise
39
+
40
+
41
+ def compute_reference_forward(x, w, m_sizes):
42
+ """
43
+ Reference PyTorch implementation of M*G grouped GEMM forward pass.
44
+
45
+ Args:
46
+ x (torch.Tensor): Input tensor of shape (M, K)
47
+ w (torch.Tensor): Weight tensor of shape (N, K)
48
+ m_sizes (torch.Tensor): Group sizes tensor of shape (G)
49
+
50
+ Returns:
51
+ torch.Tensor: Output tensor of shape (M, N)
52
+ """
53
+ result = torch.zeros((x.shape[0], w.shape[0]), dtype=x.dtype, device=x.device)
54
+
55
+ m_start = 0
56
+ for g in range(len(m_sizes)):
57
+ m_size = m_sizes[g].item()
58
+ if m_size > 0:
59
+ m_end = m_start + m_size
60
+
61
+ # Extract group input
62
+ x_g = x[m_start:m_end]
63
+
64
+ # Compute group output
65
+ y_g = torch.matmul(x_g, w.T)
66
+
67
+ # Store result
68
+ result[m_start:m_end] = y_g
69
+
70
+ # Update start index
71
+ m_start = m_end
72
+
73
+ return result
74
+
75
+
76
+ @triton.testing.perf_report(
77
+ triton.testing.Benchmark(
78
+ x_names=["N"], # We'll vary the output dimension
79
+ x_vals=[1024, 2048, 4096, 8192, 16384], # Different output dimensions to test
80
+ # x_vals=[8192, 16384],
81
+ line_arg="provider", # We'll compare different providers
82
+ line_vals=["pytorch_reference", "M*G grouped GEMM"],
83
+ line_names=["PyTorch Reference", "M*G grouped Kernel"],
84
+ styles=[("blue", "-"), ("red", "-")],
85
+ ylabel="TFLOPS", # We'll measure TFLOPS
86
+ plot_name="mg_grouped_gemm_comparison",
87
+ args={
88
+ "M": 8192, # Batch dimension, fixed for all tests
89
+ "K": 7168, # Hidden dimension, fixed for all tests
90
+ "G": 8, # Number of groups
91
+ "dtype": torch.float16,
92
+ "device": "cuda",
93
+ },
94
+ )
95
+ )
96
+ def benchmark_forward(M, K, N, G, provider, dtype=torch.float16, device="cuda"):
97
+ """
98
+ Benchmark the forward pass of the grouped GEMM implementation.
99
+
100
+ Args:
101
+ M (int): Total batch size dimension
102
+ K (int): Hidden dimension
103
+ N (int): Output dimension
104
+ G (int): Number of groups
105
+ provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel')
106
+ dtype (torch.dtype): Data type to use
107
+ device (str): Device to use
108
+
109
+ Returns:
110
+ float: Performance in TFLOPS
111
+ """
112
+ # Create group sizes for M dimension (balanced across groups)
113
+ base_size = M // G
114
+ remainder = M % G
115
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
116
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
117
+
118
+ print(f"N: {N}, M: {M}, K: {K}, G: {G}, dtype: {dtype}, device: {device}")
119
+
120
+ # Create input and weight tensors
121
+ x = torch.randn(M, K, dtype=dtype, device=device)
122
+ w = torch.randn(N, K, dtype=dtype, device=device)
123
+
124
+ # Pre-compute for PyTorch reference to ensure fair comparison
125
+ if provider == "pytorch_reference":
126
+ # Warmup
127
+ torch.cuda.synchronize()
128
+ compute_reference_forward(x, w, m_sizes)
129
+ torch.cuda.synchronize()
130
+
131
+ # Benchmark
132
+ start_time = time.time()
133
+ for _ in range(10): # Average over 10 runs
134
+ compute_reference_forward(x, w, m_sizes)
135
+ torch.cuda.synchronize()
136
+ end_time = time.time()
137
+ else: # Optimized kernel
138
+ # Warmup
139
+ torch.cuda.synchronize()
140
+ grouped_gemm_forward(x, w, m_sizes)
141
+ torch.cuda.synchronize()
142
+
143
+ # Benchmark
144
+ start_time = time.time()
145
+ for _ in range(10): # Average over 10 runs
146
+ grouped_gemm_forward(x, w, m_sizes)
147
+ torch.cuda.synchronize()
148
+ end_time = time.time()
149
+
150
+ # Calculate FLOPs
151
+ # For GEMM: 2 * M * N * K FLOPs (multiply-add counts as 2 FLOPs)
152
+ flops = 2 * M * N * K
153
+
154
+ # Convert to TFLOPS (tera-FLOPS)
155
+ avg_time = (end_time - start_time) / 10 # Average time per run
156
+ tflops = flops / avg_time / 1e12
157
+
158
+ return tflops
159
+
160
+
161
+ @triton.testing.perf_report(
162
+ triton.testing.Benchmark(
163
+ x_names=["G"], # We'll vary the number of groups
164
+ x_vals=[1, 2, 4, 8, 16], # Different numbers of groups to test
165
+ line_arg="provider", # We'll compare different providers
166
+ line_vals=["pytorch_reference", "optimized_kernel"],
167
+ line_names=["PyTorch Reference", "Optimized Kernel"],
168
+ styles=[("blue", "-"), ("red", "-")],
169
+ ylabel="TFLOPS", # We'll measure TFLOPS
170
+ plot_name="mg_grouped_gemm_group_scaling",
171
+ args={
172
+ "M": 8192, # Batch dimension, fixed for all tests
173
+ "K": 4096, # Hidden dimension, fixed for all tests
174
+ "N": 8192, # Output dimension, fixed for all tests
175
+ "dtype": torch.float16,
176
+ "device": "cuda",
177
+ },
178
+ )
179
+ )
180
+ def benchmark_forward_groups(M, K, N, G, provider, dtype=torch.float16, device="cuda"):
181
+ """
182
+ Benchmark how performance scales with number of groups.
183
+
184
+ Args:
185
+ M (int): Total batch size dimension
186
+ K (int): Hidden dimension
187
+ N (int): Output dimension
188
+ G (int): Number of groups
189
+ provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel')
190
+ dtype (torch.dtype): Data type to use
191
+ device (str): Device to use
192
+
193
+ Returns:
194
+ float: Performance in TFLOPS
195
+ """
196
+ # Create group sizes for M dimension (balanced across groups)
197
+ base_size = M // G
198
+ remainder = M % G
199
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
200
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
201
+
202
+ # Create input and weight tensors
203
+ x = torch.randn(M, K, dtype=dtype, device=device)
204
+ w = torch.randn(N, K, dtype=dtype, device=device)
205
+
206
+ # Benchmark logic - same as previous function
207
+ if provider == "pytorch_reference":
208
+ torch.cuda.synchronize()
209
+ compute_reference_forward(x, w, m_sizes)
210
+ torch.cuda.synchronize()
211
+
212
+ start_time = time.time()
213
+ for _ in range(10):
214
+ compute_reference_forward(x, w, m_sizes)
215
+ torch.cuda.synchronize()
216
+ end_time = time.time()
217
+ else:
218
+ torch.cuda.synchronize()
219
+ grouped_gemm_forward(x, w, m_sizes)
220
+ torch.cuda.synchronize()
221
+
222
+ start_time = time.time()
223
+ for _ in range(10):
224
+ grouped_gemm_forward(x, w, m_sizes)
225
+ torch.cuda.synchronize()
226
+ end_time = time.time()
227
+
228
+ # Calculate FLOPs and TFLOPS
229
+ flops = 2 * M * N * K
230
+ avg_time = (end_time - start_time) / 10
231
+ tflops = flops / avg_time / 1e12
232
+
233
+ return tflops
234
+
235
+
236
+ @triton.testing.perf_report(
237
+ triton.testing.Benchmark(
238
+ x_names=["group_balance"], # We'll vary the group balance factor
239
+ x_vals=[
240
+ 0.0,
241
+ 0.25,
242
+ 0.5,
243
+ 0.75,
244
+ 0.9,
245
+ ], # Different imbalance factors (0 = balanced, 1 = max imbalance)
246
+ line_arg="provider", # We'll compare different providers
247
+ line_vals=["pytorch_reference", "optimized_kernel"],
248
+ line_names=["PyTorch Reference", "Optimized Kernel"],
249
+ styles=[("blue", "-"), ("red", "-")],
250
+ ylabel="TFLOPS", # We'll measure TFLOPS
251
+ plot_name="mg_grouped_gemm_imbalance",
252
+ args={
253
+ "M": 8192, # Batch dimension, fixed for all tests
254
+ "K": 4096, # Hidden dimension, fixed for all tests
255
+ "N": 8192, # Output dimension, fixed for all tests
256
+ "G": 4, # Number of groups
257
+ "dtype": torch.float16,
258
+ "device": "cuda",
259
+ },
260
+ )
261
+ )
262
+ def benchmark_imbalance(
263
+ M, K, N, G, group_balance, provider, dtype=torch.float16, device="cuda"
264
+ ):
265
+ """
266
+ Benchmark how performance is affected by imbalanced group sizes.
267
+
268
+ Args:
269
+ M (int): Total batch size dimension
270
+ K (int): Hidden dimension
271
+ N (int): Output dimension
272
+ G (int): Number of groups
273
+ group_balance (float): Balance factor from 0 to 1 (0 = balanced, 1 = max imbalance)
274
+ provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel')
275
+ dtype (torch.dtype): Data type to use
276
+ device (str): Device to use
277
+
278
+ Returns:
279
+ float: Performance in TFLOPS
280
+ """
281
+ # Create imbalanced group sizes for M dimension
282
+ if group_balance == 0:
283
+ # Balanced case
284
+ base_size = M // G
285
+ remainder = M % G
286
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
287
+ else:
288
+ # Imbalanced case
289
+ # First group gets more elements, last group gets fewer
290
+ # The imbalance is controlled by the group_balance factor
291
+ remaining = M
292
+ M_sizes = []
293
+ for g in range(G):
294
+ # Interpolate from balanced to imbalanced based on group_balance
295
+ # For balanced (group_balance=0), each group gets M/G
296
+ # For imbalanced (group_balance=1), first group gets much more than last group
297
+ balanced_size = remaining // (G - g)
298
+
299
+ # Adjusting size based on position and imbalance factor
300
+ # First groups get more, last groups get less
301
+ if g < G // 2:
302
+ # First half of groups get more
303
+ adjustment = int(balanced_size * group_balance * (1 - g / (G - 1)))
304
+ size = balanced_size + adjustment
305
+ else:
306
+ # Second half of groups get less
307
+ adjustment = int(balanced_size * group_balance * ((g / (G - 1)) - 0.5))
308
+ size = balanced_size - adjustment
309
+
310
+ # Ensure we don't go below 1 or take more than remaining
311
+ size = max(1, min(size, remaining))
312
+ M_sizes.append(size)
313
+ remaining -= size
314
+
315
+ # Handle any remaining elements
316
+ if remaining > 0:
317
+ M_sizes[-1] += remaining
318
+
319
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
320
+
321
+ # Create input and weight tensors
322
+ x = torch.randn(M, K, dtype=dtype, device=device)
323
+ w = torch.randn(N, K, dtype=dtype, device=device)
324
+
325
+ # Benchmark logic
326
+ if provider == "pytorch_reference":
327
+ torch.cuda.synchronize()
328
+ compute_reference_forward(x, w, m_sizes)
329
+ torch.cuda.synchronize()
330
+
331
+ start_time = time.time()
332
+ for _ in range(10):
333
+ compute_reference_forward(x, w, m_sizes)
334
+ torch.cuda.synchronize()
335
+ end_time = time.time()
336
+ else:
337
+ torch.cuda.synchronize()
338
+ grouped_gemm_forward(x, w, m_sizes)
339
+ torch.cuda.synchronize()
340
+
341
+ start_time = time.time()
342
+ for _ in range(10):
343
+ grouped_gemm_forward(x, w, m_sizes)
344
+ torch.cuda.synchronize()
345
+ end_time = time.time()
346
+
347
+ # Calculate FLOPs and TFLOPS
348
+ flops = 2 * M * N * K
349
+ avg_time = (end_time - start_time) / 10
350
+ tflops = flops / avg_time / 1e12
351
+
352
+ return tflops
353
+
354
+
355
+ def benchmark_model_configs():
356
+ """
357
+ Benchmark common model configurations used in DeepSeek-like models.
358
+ """
359
+ # Model configurations: (M, K, N, G)
360
+ configs = [
361
+ (8192, 7168, 4096, 4), # Config 1
362
+ (8192, 2048, 7168, 4), # Config 2
363
+ (4096, 7168, 4096, 8), # Config 3
364
+ (4096, 2048, 7168, 8), # Config 4
365
+ ]
366
+
367
+ results = []
368
+
369
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
370
+ dtype = torch.float16
371
+
372
+ for config_idx, (M, K, N, G) in enumerate(configs):
373
+ logging.info(f"\n===== Benchmarking DeepSeek Config {config_idx + 1} =====")
374
+ logging.info(f"M={M}, K={K}, N={N}, G={G}")
375
+
376
+ # Create group sizes for M dimension
377
+ base_size = M // G
378
+ remainder = M % G
379
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
380
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
381
+
382
+ # Create tensors
383
+ x = torch.randn(M, K, dtype=dtype, device=device)
384
+ w = torch.randn(N, K, dtype=dtype, device=device)
385
+
386
+ # Benchmark PyTorch reference
387
+ torch.cuda.synchronize()
388
+ compute_reference_forward(x, w, m_sizes) # Warmup
389
+ torch.cuda.synchronize()
390
+
391
+ logging.info("Benchmarking PyTorch reference...")
392
+ torch.cuda.reset_peak_memory_stats()
393
+ start_time = time.time()
394
+ for _ in range(10):
395
+ compute_reference_forward(x, w, m_sizes)
396
+ torch.cuda.synchronize()
397
+ end_time = time.time()
398
+ pt_time = (end_time - start_time) / 10
399
+ pt_memory = torch.cuda.max_memory_allocated() / (1024**2) # MB
400
+
401
+ # Benchmark optimized kernel
402
+ torch.cuda.synchronize()
403
+ grouped_gemm_forward(x, w, m_sizes) # Warmup
404
+ torch.cuda.synchronize()
405
+
406
+ logging.info("Benchmarking optimized kernel...")
407
+ torch.cuda.reset_peak_memory_stats()
408
+ start_time = time.time()
409
+ for _ in range(10):
410
+ grouped_gemm_forward(x, w, m_sizes)
411
+ torch.cuda.synchronize()
412
+ end_time = time.time()
413
+ opt_time = (end_time - start_time) / 10
414
+ opt_memory = torch.cuda.max_memory_allocated() / (1024**2) # MB
415
+
416
+ # Calculate FLOPs and speedup
417
+ flops = 2 * M * N * K
418
+ pt_tflops = flops / pt_time / 1e12
419
+ opt_tflops = flops / opt_time / 1e12
420
+ speedup = pt_time / opt_time
421
+
422
+ # Store results
423
+ results.append(
424
+ {
425
+ "config": f"Config {config_idx + 1}",
426
+ "dimensions": f"M={M}, K={K}, N={N}, G={G}",
427
+ "pt_time_ms": pt_time * 1000,
428
+ "opt_time_ms": opt_time * 1000,
429
+ "pt_tflops": pt_tflops,
430
+ "opt_tflops": opt_tflops,
431
+ "speedup": speedup,
432
+ "pt_memory_mb": pt_memory,
433
+ "opt_memory_mb": opt_memory,
434
+ "memory_savings": (
435
+ (pt_memory - opt_memory) / pt_memory * 100 if pt_memory > 0 else 0
436
+ ),
437
+ }
438
+ )
439
+
440
+ logging.info(
441
+ f"PyTorch Reference: {pt_time * 1000:.2f} ms, {pt_tflops:.2f} TFLOPS, {pt_memory:.2f} MB"
442
+ )
443
+ logging.info(
444
+ f"Optimized Kernel: {opt_time * 1000:.2f} ms, {opt_tflops:.2f} TFLOPS, {opt_memory:.2f} MB"
445
+ )
446
+ logging.info(
447
+ f"Speedup: {speedup:.2f}x, Memory savings: {results[-1]['memory_savings']:.2f}%"
448
+ )
449
+
450
+ # Print summary table
451
+ logging.info("\n===== Benchmark Results Summary =====")
452
+ logging.info(
453
+ f"{'Config':<10} | {'Time (ms)':<20} | {'TFLOPS':<20} | {'Speedup':<10} | {'Memory (MB)':<20} | {'Memory Saved':<12}"
454
+ )
455
+ logging.info(
456
+ f"{'':<10} | {'PyTorch':<9} {'Kernel':<9} | {'PyTorch':<9} {'Kernel':<9} | {'':<10} | "
457
+ f"{'PyTorch':<9} {'Kernel':<9} | {'':<12}"
458
+ )
459
+ logging.info("-" * 100)
460
+
461
+ for result in results:
462
+ logging.info(
463
+ f"{result['config']:<10} | "
464
+ f"{result['pt_time_ms']:<9.2f} {result['opt_time_ms']:<9.2f} | "
465
+ f"{result['pt_tflops']:<9.2f} {result['opt_tflops']:<9.2f} | "
466
+ f"{result['speedup']:<10.2f} | "
467
+ f"{result['pt_memory_mb']:<9.2f} {result['opt_memory_mb']:<9.2f} | "
468
+ f"{result['memory_savings']:<12.2f}%"
469
+ )
470
+
471
+ return results
472
+
473
+
474
+ def plot_benchmark_results(results):
475
+ """
476
+ Plot benchmark results as bar charts.
477
+ """
478
+ # Extract data
479
+ configs = [r["config"] for r in results]
480
+ pt_tflops = [r["pt_tflops"] for r in results]
481
+ opt_tflops = [r["opt_tflops"] for r in results]
482
+ speedups = [r["speedup"] for r in results]
483
+
484
+ # Create figure with subplots
485
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
486
+
487
+ # Plot TFLOPS comparison
488
+ x = np.arange(len(configs))
489
+ width = 0.35
490
+ ax1.bar(x - width / 2, pt_tflops, width, label="PyTorch Reference")
491
+ ax1.bar(x + width / 2, opt_tflops, width, label="Optimized Kernel")
492
+ ax1.set_xlabel("Model Configuration")
493
+ ax1.set_ylabel("TFLOPS")
494
+ ax1.set_title("Performance Comparison (Higher is Better)")
495
+ ax1.set_xticks(x)
496
+ ax1.set_xticklabels(configs)
497
+ ax1.legend()
498
+ ax1.grid(axis="y", linestyle="--", alpha=0.7)
499
+
500
+ # Plot speedup
501
+ ax2.bar(x, speedups, width=0.6, color="green")
502
+ ax2.set_xlabel("Model Configuration")
503
+ ax2.set_ylabel("Speedup (x)")
504
+ ax2.set_title("Speedup Factor (Higher is Better)")
505
+ ax2.set_xticks(x)
506
+ ax2.set_xticklabels(configs)
507
+ ax2.grid(axis="y", linestyle="--", alpha=0.7)
508
+
509
+ # Add speedup values on top of bars
510
+ for i, v in enumerate(speedups):
511
+ ax2.text(i, v + 0.1, f"{v:.2f}x", ha="center")
512
+
513
+ plt.tight_layout()
514
+ plt.savefig("mg_grouped_gemm_benchmark_results.png")
515
+ logging.info(
516
+ "Benchmark results plot saved to 'mg_grouped_gemm_benchmark_results.png'"
517
+ )
518
+
519
+
520
def compare_mg_implementations():
    """
    Combine the M*G and N*G benchmark results for comparison.

    Loads the CSV result files produced by the two benchmark runs, plots
    (1) mean speedup per configuration and (2) mean TFLOPS of the optimized
    kernels side by side, and saves the figure to 'mg_vs_ng_comparison.png'.
    If either CSV is missing or malformed the error is logged and the
    function returns without raising.
    """
    # Only run this if both NG and MG benchmarks have been run
    try:
        import pandas as pd

        # Try to load previous benchmark results
        mg_results = pd.read_csv("mg_grouped_gemm_benchmark_results.csv")
        ng_results = pd.read_csv("ng_grouped_gemm_benchmark_results.csv")

        # Create comparison plot
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # FIX: groupby(...).mean() returns a Series sorted by group key, which
        # need not match the order of unique() (file order) — and the N*G frame
        # may not contain exactly the same configs. Reindex every aggregate by
        # `configs` so each bar lines up with its x-tick label; configs absent
        # from a frame become NaN (a gap) instead of silently shifting bars.
        configs = mg_results["config"].unique()
        mg_speedups = mg_results.groupby("config")["speedup"].mean().reindex(configs)
        ng_speedups = ng_results.groupby("config")["speedup"].mean().reindex(configs)

        x = np.arange(len(configs))
        width = 0.35

        # Plot speedup comparison
        axes[0].bar(x - width / 2, mg_speedups, width, label="M*G Grouping")
        axes[0].bar(x + width / 2, ng_speedups, width, label="N*G Grouping")
        axes[0].set_xlabel("Model Configuration")
        axes[0].set_ylabel("Speedup (x)")
        axes[0].set_title("Speedup Comparison: M*G vs N*G")
        axes[0].set_xticks(x)
        axes[0].set_xticklabels(configs)
        axes[0].legend()
        axes[0].grid(axis="y", linestyle="--", alpha=0.7)

        # Plot TFLOPS comparison for optimized kernels (reindexed for the
        # same label-alignment reason as above).
        mg_tflops = (
            mg_results[mg_results["implementation"] == "optimized"]
            .groupby("config")["tflops"]
            .mean()
            .reindex(configs)
        )
        ng_tflops = (
            ng_results[ng_results["implementation"] == "optimized"]
            .groupby("config")["tflops"]
            .mean()
            .reindex(configs)
        )

        axes[1].bar(x - width / 2, mg_tflops, width, label="M*G Grouping")
        axes[1].bar(x + width / 2, ng_tflops, width, label="N*G Grouping")
        axes[1].set_xlabel("Model Configuration")
        axes[1].set_ylabel("TFLOPS")
        axes[1].set_title("Performance Comparison: M*G vs N*G")
        axes[1].set_xticks(x)
        axes[1].set_xticklabels(configs)
        axes[1].legend()
        axes[1].grid(axis="y", linestyle="--", alpha=0.7)

        plt.tight_layout()
        plt.savefig("mg_vs_ng_comparison.png")
        logging.info("Comparison plot saved to 'mg_vs_ng_comparison.png'")

    except Exception as e:
        logging.error(f"Could not create comparison plot: {e}")
        logging.info(
            "Run both M*G and N*G benchmarks first to generate comparison plots"
        )
584
+
585
+
586
+ if __name__ == "__main__":
587
+ parser = argparse.ArgumentParser(
588
+ description="Benchmark M*G Grouped GEMM implementations"
589
+ )
590
+ parser.add_argument("--run-all", action="store_true", help="Run all benchmarks")
591
+ parser.add_argument(
592
+ "--triton-bench", action="store_true", help="Run Triton performance reports"
593
+ )
594
+ parser.add_argument(
595
+ "--model-configs", action="store_true", help="Benchmark model configurations"
596
+ )
597
+ parser.add_argument(
598
+ "--compare-mg-ng",
599
+ action="store_true",
600
+ help="Compare M*G and N*G implementations",
601
+ )
602
+ args = parser.parse_args()
603
+
604
+ # Check if CUDA is available
605
+ if not torch.cuda.is_available():
606
+ logging.error(
607
+ "CUDA is not available. This benchmark requires a CUDA-capable GPU."
608
+ )
609
+ exit(1)
610
+
611
+ if args.run_all or args.model_configs:
612
+ # Benchmark model configurations
613
+ logging.info("Running benchmark for model configurations...")
614
+ results = benchmark_model_configs()
615
+ plot_benchmark_results(results)
616
+
617
+ if args.run_all or args.triton_bench:
618
+ # Run Triton performance reports
619
+ logging.info("Running Triton performance reports...")
620
+ benchmark_forward.run(save_path="mg_grouped_gemm_benchmark_results")
621
+ benchmark_forward_groups.run(save_path="mg_grouped_gemm_benchmark_results")
622
+ benchmark_imbalance.run(save_path="mg_grouped_gemm_benchmark_results")
623
+ logging.info(
624
+ "Triton performance reports saved to 'mg_grouped_gemm_benchmark_results' directory"
625
+ )
626
+
627
+ if args.run_all or args.compare_mg_ng:
628
+ # Compare M*G and N*G implementations
629
+ logging.info("Comparing M*G and N*G implementations...")
630
+ compare_mg_implementations()
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import logging
9
+ import unittest
10
+ from typing import Tuple
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+
15
+ from mg_grouped_gemm import grouped_gemm_forward
16
+
17
+
18
class TestMG_GroupedGEMM(unittest.TestCase):
    """Forward-pass tests for the M*G grouped GEMM kernel against a
    per-group ``torch.matmul`` reference."""

    def setUp(self) -> None:
        # Fixed seed so the random operands are reproducible across runs.
        torch.manual_seed(2020)

    def _run_grouped_gemm_test(
        self,
        shape: Tuple[int, int, int, int],
        device: torch.device,
        dtype: torch.dtype = torch.bfloat16,
        atol: float = 1e-5,
        rtol: float = 1.6e-2,
    ) -> None:
        """Run one forward pass for ``shape = (G, M, N, K)`` and compare the
        kernel output against a group-by-group matmul reference."""
        G, M, N, K = shape
        # M*G grouping: activations are [M*G, K], weights are [N*G, K].
        lhs = torch.randn(M * G, K, dtype=dtype, device=device)
        rhs = torch.randn(N * G, K, dtype=dtype, device=device)

        # Equal-sized groups keep the reference computation simple.
        group_rows = torch.full((G,), M, device=device, dtype=torch.int32)

        out = grouped_gemm_forward(lhs, rhs, group_rows)
        self.assertTrue(out.shape == (M * G, N))

        # Reference: for each group g, multiply its row slab of `lhs` by the
        # transposed weight slab rhs[N*g : N*(g+1)].
        reference = torch.zeros(M * G, N, dtype=dtype, device=device)
        row = 0
        for g in range(G):
            nxt = row + group_rows[g]
            weight_slab = rhs[N * g : N * (g + 1), :]
            reference[row:nxt, :] = lhs[row:nxt, :] @ weight_slab.T
            row = nxt

        # Cast the kernel output to the input dtype before comparing.
        torch.testing.assert_close(out.to(dtype), reference, atol=atol, rtol=rtol)

    def test_MG_grouped_gemm_bf16(self) -> None:
        group_counts = (1, 4, 16)
        row_counts = (128, 512, 1024)
        for G in group_counts:
            for M in row_counts:
                print(f"Testing BF16 M*G GroupGeMM with G={G}, M={M}")
                self._run_grouped_gemm_test(
                    (G, M, 1024, 1024),
                    torch.device("cuda"),
                    dtype=torch.bfloat16,
                    atol=1e-5,
                    rtol=1.6e-2,
                )

    def test_MG_grouped_gemm_deepseek_shapes(self) -> None:
        """Test with shapes from Deepseek model."""
        deepseek_shapes = [
            (4, 2048, 4096, 7168),  # G, M, N, K
            (4, 2048, 7168, 2048),
            (8, 512, 4096, 7168),
            (8, 512, 7168, 2048),
        ]

        device = torch.device("cuda")

        for G, M, N, K in deepseek_shapes:
            print(f"Testing BF16 M*G Deepseek shape: G={G}, M={M}, N={N}, K={K}")
            self._run_grouped_gemm_test(
                (G, M, N, K), device, dtype=torch.bfloat16, atol=1e-5, rtol=1.6e-2
            )