Add files using upload-large-folder tool
Browse files- profile_trace/iteration_10752/rank5_trace.json +0 -0
- profile_trace/iteration_11776/rank3_trace.json +0 -0
- profile_trace/iteration_11776/rank4_trace.json +0 -0
- profile_trace/iteration_11776/rank7_trace.json +0 -0
- profile_trace/iteration_12288/rank5_trace.json +0 -0
- profile_trace/iteration_13824/rank2_trace.json +0 -0
- profile_trace/iteration_13824/rank4_trace.json +0 -0
- profile_trace/iteration_13824/rank6_trace.json +0 -0
- profile_trace/iteration_14848/rank2_trace.json +0 -0
- profile_trace/iteration_14848/rank4_trace.json +0 -0
- profile_trace/iteration_14848/rank7_trace.json +0 -0
- profile_trace/iteration_21504/rank0_trace.json +0 -0
- profile_trace/iteration_21504/rank3_trace.json +0 -0
- profile_trace/iteration_28160/rank2_trace.json +0 -0
- profile_trace/iteration_28160/rank4_trace.json +0 -0
- profile_trace/iteration_28160/rank6_trace.json +0 -0
- profile_trace/iteration_31744/rank3_trace.json +0 -0
- profile_trace/iteration_33792/rank1_trace.json +0 -0
- profile_trace/iteration_33792/rank2_trace.json +0 -0
- profile_trace/iteration_33792/rank4_trace.json +0 -0
- profile_trace/iteration_33792/rank5_trace.json +0 -0
- profile_trace/iteration_33792/rank6_trace.json +0 -0
- profile_trace/iteration_33792/rank7_trace.json +0 -0
- profile_trace/iteration_512/rank0_trace.json +0 -0
- profile_trace/iteration_512/rank1_trace.json +0 -0
- profile_trace/iteration_512/rank3_trace.json +0 -0
- profile_trace/iteration_8192/rank0_trace.json +0 -0
- profile_trace/iteration_8192/rank1_trace.json +0 -0
- profile_trace/iteration_8192/rank5_trace.json +0 -0
- torchtitan/components/__pycache__/checkpoint.cpython-312.pyc +0 -0
- torchtitan/components/__pycache__/dataloader.cpython-312.pyc +0 -0
- torchtitan/components/__pycache__/float8.cpython-312.pyc +0 -0
- torchtitan/components/loss.py +29 -0
- torchtitan/experiments/deepseek_v3/indices.py +195 -0
- torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_barrier.py +159 -0
- torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc +0 -0
- torchtitan/experiments/flux/__pycache__/utils.cpython-312.pyc +0 -0
- torchtitan/experiments/flux/dataset/__pycache__/tokenizer.cpython-312.pyc +0 -0
- torchtitan/experiments/flux/dataset/tokenizer.py +64 -0
- torchtitan/experiments/flux/model/__pycache__/hf_embedder.cpython-312.pyc +0 -0
- torchtitan/experiments/flux/model/__pycache__/layers.cpython-312.pyc +0 -0
- torchtitan/experiments/flux/train_configs/debug_model.toml +68 -0
- torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_backwards.py +174 -0
- torchtitan/experiments/llama4/__pycache__/__init__.cpython-312.pyc +0 -0
- torchtitan/experiments/llama4/model/__pycache__/moe.cpython-312.pyc +0 -0
- torchtitan/experiments/llama4/model/args.py +109 -0
- torchtitan/experiments/multimodal/__init__.py +37 -0
- torchtitan/experiments/multimodal/tests/test_multimodal_model.py +128 -0
- torchtitan/experiments/simple_fsdp/__pycache__/__init__.cpython-312.pyc +0 -0
- torchtitan/models/__pycache__/norms.cpython-312.pyc +0 -0
profile_trace/iteration_10752/rank5_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_11776/rank3_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_11776/rank4_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_11776/rank7_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_12288/rank5_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_13824/rank2_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_13824/rank4_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_13824/rank6_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_14848/rank2_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_14848/rank4_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_14848/rank7_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_21504/rank0_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_21504/rank3_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_28160/rank2_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_28160/rank4_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_28160/rank6_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_31744/rank3_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_33792/rank1_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_33792/rank2_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_33792/rank4_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_33792/rank5_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_33792/rank6_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_33792/rank7_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_512/rank0_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_512/rank1_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_512/rank3_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_8192/rank0_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_8192/rank1_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
profile_trace/iteration_8192/rank5_trace.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
torchtitan/components/__pycache__/checkpoint.cpython-312.pyc
ADDED
|
Binary file (33.1 kB). View file
|
|
|
torchtitan/components/__pycache__/dataloader.cpython-312.pyc
ADDED
|
Binary file (3.79 kB). View file
|
|
|
torchtitan/components/__pycache__/float8.cpython-312.pyc
ADDED
|
Binary file (6.2 kB). View file
|
|
|
torchtitan/components/loss.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
from typing import Callable, TypeAlias
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
|
| 11 |
+
from torchtitan.config_manager import JobConfig
|
| 12 |
+
from torchtitan.tools.logging import logger
|
| 13 |
+
|
| 14 |
+
LossFunction: TypeAlias = Callable[..., torch.Tensor]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def cross_entropy_loss(pred: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
|
| 18 |
+
"""Common cross-entropy loss function for Transformer models training."""
|
| 19 |
+
return torch.nn.functional.cross_entropy(
|
| 20 |
+
pred.flatten(0, 1).float(), labels.flatten(0, 1)
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def build_cross_entropy_loss(job_config: JobConfig):
|
| 25 |
+
loss_fn = cross_entropy_loss
|
| 26 |
+
if job_config.training.compile:
|
| 27 |
+
logger.info("Compiling the loss function with torch.compile")
|
| 28 |
+
loss_fn = torch.compile(loss_fn)
|
| 29 |
+
return loss_fn
|
torchtitan/experiments/deepseek_v3/indices.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import triton
|
| 9 |
+
import triton.language as tl
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
__all__ = ["generate_permute_indices"]
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@triton.jit
|
| 16 |
+
def fill_indices_kernel(
|
| 17 |
+
tokens_per_expert_group_ptr, # *Pointer* to first input vector.
|
| 18 |
+
start_index_values_ptr, # *Pointer* to second input vector.
|
| 19 |
+
write_offsets_ptr, # *Pointer* to third input vector.
|
| 20 |
+
output_ptr, # *Pointer* to output vector.
|
| 21 |
+
experts_per_rank, # Number of experts per rank.
|
| 22 |
+
num_ranks, # Number of expert ranks.
|
| 23 |
+
):
|
| 24 |
+
# There are multiple 'programs' processing different data. We identify which program
|
| 25 |
+
# we are here:
|
| 26 |
+
pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0.
|
| 27 |
+
# The total number of programs in the launch grid.
|
| 28 |
+
num_programs = tl.num_programs(axis=0)
|
| 29 |
+
# We map the programs (blocks) to the experts.
|
| 30 |
+
for expert_id in tl.range(pid, experts_per_rank, step=num_programs):
|
| 31 |
+
# Read this expert's write offset.
|
| 32 |
+
write_offset = tl.load(write_offsets_ptr + expert_id)
|
| 33 |
+
# Loop over the ranks.
|
| 34 |
+
for r in tl.range(num_ranks):
|
| 35 |
+
# Slot in the tokens_per_expert_group array.
|
| 36 |
+
i = r * experts_per_rank + expert_id
|
| 37 |
+
start_index = tl.load(start_index_values_ptr + i)
|
| 38 |
+
length = tl.load(tokens_per_expert_group_ptr + i)
|
| 39 |
+
# Write the indices.
|
| 40 |
+
for l in tl.range(length):
|
| 41 |
+
val = start_index + l
|
| 42 |
+
tl.store(output_ptr + write_offset + l, val)
|
| 43 |
+
write_offset += length
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def fill_indices(
|
| 47 |
+
tokens_per_expert_group: torch.Tensor,
|
| 48 |
+
start_index_values: torch.Tensor,
|
| 49 |
+
write_offsets: torch.Tensor,
|
| 50 |
+
experts_per_rank: int,
|
| 51 |
+
num_ranks: int,
|
| 52 |
+
max_len: int,
|
| 53 |
+
):
|
| 54 |
+
# We need to preallocate the output.
|
| 55 |
+
permuted_indices = torch.full(
|
| 56 |
+
(max_len,), -1, dtype=torch.int32, device=tokens_per_expert_group.device
|
| 57 |
+
)
|
| 58 |
+
# Analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int].
|
| 59 |
+
# In this case, we use a 1D grid where the size is the number of blocks (TODO: bump this value).
|
| 60 |
+
grid = lambda meta: (1,)
|
| 61 |
+
# Each torch.tensor object is implicitly converted into a pointer to its first element.
|
| 62 |
+
fill_indices_kernel[grid](
|
| 63 |
+
tokens_per_expert_group,
|
| 64 |
+
start_index_values,
|
| 65 |
+
write_offsets,
|
| 66 |
+
permuted_indices,
|
| 67 |
+
experts_per_rank,
|
| 68 |
+
num_ranks,
|
| 69 |
+
)
|
| 70 |
+
return permuted_indices
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def fill_indices_cpu(
|
| 74 |
+
tokens_per_expert_group: torch.Tensor,
|
| 75 |
+
start_index_values: torch.Tensor,
|
| 76 |
+
write_offsets: torch.Tensor,
|
| 77 |
+
experts_per_rank: int,
|
| 78 |
+
num_ranks: int,
|
| 79 |
+
max_len: int,
|
| 80 |
+
):
|
| 81 |
+
# We need to preallocate the output.
|
| 82 |
+
permuted_indices = torch.full((max_len,), -1, dtype=torch.int32)
|
| 83 |
+
# Fill the permuted indices
|
| 84 |
+
# For each local expert
|
| 85 |
+
for e in range(experts_per_rank):
|
| 86 |
+
write_start = write_offsets[e]
|
| 87 |
+
# For each remote rank
|
| 88 |
+
for r in range(num_ranks):
|
| 89 |
+
i = r * experts_per_rank + e
|
| 90 |
+
start_index = start_index_values[i]
|
| 91 |
+
length = tokens_per_expert_group[i]
|
| 92 |
+
# Fill in the indices
|
| 93 |
+
permuted_indices[write_start : write_start + length] = torch.arange(
|
| 94 |
+
start_index, start_index + length
|
| 95 |
+
)
|
| 96 |
+
write_start += length
|
| 97 |
+
return permuted_indices
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def generate_permute_indices(
|
| 101 |
+
tokens_per_expert_group: torch.Tensor,
|
| 102 |
+
experts_per_rank: int,
|
| 103 |
+
num_ranks: int,
|
| 104 |
+
max_len: int,
|
| 105 |
+
alignment: int,
|
| 106 |
+
use_cpu: bool = False,
|
| 107 |
+
):
|
| 108 |
+
# Prepare permutation indices and the number of tokens for each expert. The
|
| 109 |
+
# permutation indices are the indices of the tokens for each expert. The
|
| 110 |
+
# number of tokens for each expert is the sum of the number of tokens for
|
| 111 |
+
# such experts from all ranks. This number is aligned to the provided
|
| 112 |
+
# alignment requirement (usually comes from group gemm).
|
| 113 |
+
|
| 114 |
+
# Args:
|
| 115 |
+
# tokens_per_expert_group: number of tokens for each expert from all ranks.
|
| 116 |
+
# experts_per_rank: number of experts per rank.
|
| 117 |
+
# num_ranks: number of ranks.
|
| 118 |
+
# max_len: maximum length of the output index vector. If greater than
|
| 119 |
+
# total number of tokens, the remaining indices are set to -1.
|
| 120 |
+
# alignment: alignment for each returned element in `m_sizes`.
|
| 121 |
+
# use_cpu: whether to use cpu or gpu.
|
| 122 |
+
# Returns:
|
| 123 |
+
# permuted_indices: permutation indices.
|
| 124 |
+
# m_sizes: number of tokens for each expert.
|
| 125 |
+
|
| 126 |
+
# `tokens_per_expert_group` is of shape (num_ranks * experts_per_rank,), for example:
|
| 127 |
+
# From: | rank 0 | rank 1 |
|
| 128 |
+
# To: | E0 | E1 | E2 | E3 | E0 | E1 | E2 | E3 |
|
| 129 |
+
# | 4 | 2 | 1 | 3 | 1 | 2 | 3 | 4 |
|
| 130 |
+
|
| 131 |
+
# Prefix sum to get the start index value of each expert
|
| 132 |
+
start_index_values = (
|
| 133 |
+
torch.cumsum(tokens_per_expert_group, 0) - tokens_per_expert_group
|
| 134 |
+
)
|
| 135 |
+
# Chunk sizes for each expert
|
| 136 |
+
chunk_size_per_expert = tokens_per_expert_group.view(num_ranks, -1).sum(0)
|
| 137 |
+
# Align the chunk sizes to the given alignment
|
| 138 |
+
m_sizes = ((chunk_size_per_expert + alignment - 1) // alignment * alignment).to(
|
| 139 |
+
torch.int32
|
| 140 |
+
)
|
| 141 |
+
# Perform another prefix sum to get the write offset of each expert in `permuted_indices`
|
| 142 |
+
write_offsets = torch.cumsum(m_sizes, 0) - m_sizes
|
| 143 |
+
# Select the method to fill the permuted indices
|
| 144 |
+
fill_fn = fill_indices_cpu if use_cpu else fill_indices
|
| 145 |
+
# Fill the permuted indices
|
| 146 |
+
permuted_indices = fill_fn(
|
| 147 |
+
tokens_per_expert_group,
|
| 148 |
+
start_index_values,
|
| 149 |
+
write_offsets,
|
| 150 |
+
experts_per_rank,
|
| 151 |
+
num_ranks,
|
| 152 |
+
max_len,
|
| 153 |
+
)
|
| 154 |
+
return permuted_indices, m_sizes
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# Below is for testing only
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def test():
|
| 161 |
+
device = torch.device("cuda", 0)
|
| 162 |
+
experts_per_rank = 4
|
| 163 |
+
num_ranks = 4
|
| 164 |
+
tokens_per_expert_group = torch.full(
|
| 165 |
+
(num_ranks * experts_per_rank,), 4, dtype=torch.int32, device=device
|
| 166 |
+
)
|
| 167 |
+
max_len = 128
|
| 168 |
+
alignment = 32
|
| 169 |
+
# Use the GPU kernel
|
| 170 |
+
permuted_indices_gpu, m_sizes = generate_permute_indices(
|
| 171 |
+
tokens_per_expert_group, experts_per_rank, num_ranks, max_len, alignment
|
| 172 |
+
)
|
| 173 |
+
# Use the CPU method
|
| 174 |
+
permuted_indices_cpu, _ = generate_permute_indices(
|
| 175 |
+
tokens_per_expert_group,
|
| 176 |
+
experts_per_rank,
|
| 177 |
+
num_ranks,
|
| 178 |
+
max_len,
|
| 179 |
+
alignment,
|
| 180 |
+
use_cpu=True,
|
| 181 |
+
)
|
| 182 |
+
# Check that the results are the same
|
| 183 |
+
assert torch.equal(permuted_indices_gpu.cpu(), permuted_indices_cpu)
|
| 184 |
+
assert torch.equal(
|
| 185 |
+
torch.remainder(m_sizes, alignment),
|
| 186 |
+
torch.zeros(experts_per_rank, device=device),
|
| 187 |
+
)
|
| 188 |
+
# Print the results
|
| 189 |
+
print(permuted_indices_gpu)
|
| 190 |
+
print(m_sizes)
|
| 191 |
+
print("Success")
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
if __name__ == "__main__":
|
| 195 |
+
test()
|
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_barrier.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import triton
|
| 8 |
+
import triton.language as tl
|
| 9 |
+
|
| 10 |
+
from .triton_utils import get_flat_bid, get_flat_tid
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@triton.jit
|
| 14 |
+
def send_signal(addrs, sem: tl.constexpr):
|
| 15 |
+
if sem == "relaxed":
|
| 16 |
+
tl.inline_asm_elementwise(
|
| 17 |
+
"""
|
| 18 |
+
{
|
| 19 |
+
.reg .u32 %tmp32_<1>;
|
| 20 |
+
.reg .pred %p<1>;
|
| 21 |
+
|
| 22 |
+
send_signal:
|
| 23 |
+
atom.global.relaxed.sys.cas.b32 %tmp32_0, [$1], 0, 1;
|
| 24 |
+
setp.eq.u32 %p0, %tmp32_0, 0;
|
| 25 |
+
@!%p0 bra send_signal;
|
| 26 |
+
}
|
| 27 |
+
""",
|
| 28 |
+
"=r, l",
|
| 29 |
+
[addrs],
|
| 30 |
+
dtype=tl.int32,
|
| 31 |
+
is_pure=False,
|
| 32 |
+
pack=1,
|
| 33 |
+
)
|
| 34 |
+
elif sem == "acq_rel":
|
| 35 |
+
tl.inline_asm_elementwise(
|
| 36 |
+
"""
|
| 37 |
+
{
|
| 38 |
+
.reg .u32 %tmp32_<1>;
|
| 39 |
+
.reg .pred %p<1>;
|
| 40 |
+
|
| 41 |
+
send_signal:
|
| 42 |
+
atom.global.release.sys.cas.b32 %tmp32_0, [$1], 0, 1;
|
| 43 |
+
setp.eq.u32 %p0, %tmp32_0, 0;
|
| 44 |
+
@!%p0 bra send_signal;
|
| 45 |
+
}
|
| 46 |
+
""",
|
| 47 |
+
"=r, l",
|
| 48 |
+
[addrs],
|
| 49 |
+
dtype=tl.int32,
|
| 50 |
+
is_pure=False,
|
| 51 |
+
pack=1,
|
| 52 |
+
)
|
| 53 |
+
else:
|
| 54 |
+
raise RuntimeError(f"Unrecognized sem: {sem}")
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@triton.jit
|
| 58 |
+
def wait_signal(addrs, sem: tl.constexpr):
|
| 59 |
+
if sem == "relaxed":
|
| 60 |
+
tl.inline_asm_elementwise(
|
| 61 |
+
"""
|
| 62 |
+
{
|
| 63 |
+
.reg .u32 %tmp32_<1>;
|
| 64 |
+
.reg .pred %p<1>;
|
| 65 |
+
|
| 66 |
+
wait_signal:
|
| 67 |
+
atom.global.sys.relaxed.cas.b32 %tmp32_0, [$1], 1, 0;
|
| 68 |
+
setp.eq.u32 %p0, %tmp32_0, 1;
|
| 69 |
+
@!%p0 bra wait_signal;
|
| 70 |
+
}
|
| 71 |
+
""",
|
| 72 |
+
"=r, l",
|
| 73 |
+
[addrs],
|
| 74 |
+
dtype=tl.int32,
|
| 75 |
+
is_pure=False,
|
| 76 |
+
pack=1,
|
| 77 |
+
)
|
| 78 |
+
elif sem == "acq_rel":
|
| 79 |
+
tl.inline_asm_elementwise(
|
| 80 |
+
"""
|
| 81 |
+
{
|
| 82 |
+
.reg .u32 %tmp32_<1>;
|
| 83 |
+
.reg .pred %p<1>;
|
| 84 |
+
|
| 85 |
+
wait_signal:
|
| 86 |
+
atom.global.sys.acquire.cas.b32 %tmp32_0, [$1], 1, 0;
|
| 87 |
+
setp.eq.u32 %p0, %tmp32_0, 1;
|
| 88 |
+
@!%p0 bra wait_signal;
|
| 89 |
+
}
|
| 90 |
+
""",
|
| 91 |
+
"=r, l",
|
| 92 |
+
[addrs],
|
| 93 |
+
dtype=tl.int32,
|
| 94 |
+
is_pure=False,
|
| 95 |
+
pack=1,
|
| 96 |
+
)
|
| 97 |
+
else:
|
| 98 |
+
raise RuntimeError(f"Unrecognized sem: {sem}")
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@triton.jit
|
| 102 |
+
def blockwise_barrier(
|
| 103 |
+
signal_pad_ptrs,
|
| 104 |
+
block_id,
|
| 105 |
+
rank: tl.constexpr,
|
| 106 |
+
world_size: tl.constexpr,
|
| 107 |
+
sem: tl.constexpr,
|
| 108 |
+
):
|
| 109 |
+
"""
|
| 110 |
+
Synchronizes blocks with matching block_id across participating devices.
|
| 111 |
+
|
| 112 |
+
Note: the function itself is not a system level barrier/fence. It is a
|
| 113 |
+
building block for expressing different synchronization patterns.
|
| 114 |
+
|
| 115 |
+
Pattern 0: Ensures that all writes to symm_mem buffers from previous
|
| 116 |
+
kernels across all devices are visible to the current kernel:
|
| 117 |
+
|
| 118 |
+
blockwise_barrier(..., sem="relaxed")
|
| 119 |
+
sync_threads()
|
| 120 |
+
|
| 121 |
+
Pattern 1: Ensures that all writes to symm_mem buffers from the current
|
| 122 |
+
block are visible to all remote blocks with matching blockIdx:
|
| 123 |
+
|
| 124 |
+
sync_threads()
|
| 125 |
+
blockwise_barrier(..., sem="acq_rel")
|
| 126 |
+
sync_threads()
|
| 127 |
+
|
| 128 |
+
Pattern 2: Ensures that symm_mem buffers read by the current kernel are safe
|
| 129 |
+
for writing by subsequent kernels across all devices.
|
| 130 |
+
|
| 131 |
+
sync_threads()
|
| 132 |
+
blockwise_barrier(..., sem="relaxed")
|
| 133 |
+
|
| 134 |
+
CUDA graph friendliness:
|
| 135 |
+
|
| 136 |
+
This barrier operates through atomic operations on a zero-filled signal
|
| 137 |
+
pad, which resets to a zero-filled state after each successful
|
| 138 |
+
synchronization. This design eliminates the need for incrementing a
|
| 139 |
+
flag from host.
|
| 140 |
+
"""
|
| 141 |
+
if block_id is None:
|
| 142 |
+
block_id = get_flat_bid()
|
| 143 |
+
flat_tid = get_flat_tid()
|
| 144 |
+
|
| 145 |
+
remote_ranks = tl.arange(0, world_size)
|
| 146 |
+
signal_pad_ptrs = signal_pad_ptrs.to(tl.pointer_type(tl.uint64))
|
| 147 |
+
remote_signal_pad_addrs = tl.load(signal_pad_ptrs + remote_ranks).to(
|
| 148 |
+
tl.pointer_type(tl.uint32)
|
| 149 |
+
)
|
| 150 |
+
send_addrs = remote_signal_pad_addrs + block_id * world_size + rank
|
| 151 |
+
|
| 152 |
+
local_signal_pad_addr = tl.load(signal_pad_ptrs + rank).to(
|
| 153 |
+
tl.pointer_type(tl.uint32)
|
| 154 |
+
)
|
| 155 |
+
wait_addrs = local_signal_pad_addr + block_id * world_size + remote_ranks
|
| 156 |
+
|
| 157 |
+
if flat_tid < world_size:
|
| 158 |
+
send_signal(send_addrs, sem)
|
| 159 |
+
wait_signal(wait_addrs, sem)
|
torchtitan/experiments/flux/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (2.08 kB). View file
|
|
|
torchtitan/experiments/flux/__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (7.31 kB). View file
|
|
|
torchtitan/experiments/flux/dataset/__pycache__/tokenizer.cpython-312.pyc
ADDED
|
Binary file (2.21 kB). View file
|
|
|
torchtitan/experiments/flux/dataset/tokenizer.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 8 |
+
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
from typing import List
|
| 12 |
+
|
| 13 |
+
from torchtitan.components.tokenizer import Tokenizer
|
| 14 |
+
from transformers import CLIPTokenizer, T5Tokenizer
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class FluxTokenizer(Tokenizer):
|
| 18 |
+
"""
|
| 19 |
+
Tokenizing and encoding/decoding text using the T5 or Clip tokenizer.
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
model_path (str): Path to the tokenzier from hugging face.
|
| 23 |
+
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
def __init__(self, model_path: str = "t5-small", max_length: int = 77):
|
| 27 |
+
super().__init__()
|
| 28 |
+
self._n_words = 8 # TODO(jianiw): check
|
| 29 |
+
self._max_length = max_length
|
| 30 |
+
|
| 31 |
+
self.is_clip = model_path.startswith("openai")
|
| 32 |
+
|
| 33 |
+
if self.is_clip:
|
| 34 |
+
self._tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
|
| 35 |
+
model_path, max_length=max_length
|
| 36 |
+
)
|
| 37 |
+
else:
|
| 38 |
+
self._tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
|
| 39 |
+
model_path, max_length=max_length
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
def encode(
|
| 43 |
+
self,
|
| 44 |
+
s: str,
|
| 45 |
+
) -> List[int]:
|
| 46 |
+
"""
|
| 47 |
+
Encode the prompt text into tokens.
|
| 48 |
+
"""
|
| 49 |
+
tokens = self._tokenizer(
|
| 50 |
+
s,
|
| 51 |
+
truncation=True,
|
| 52 |
+
max_length=self._max_length,
|
| 53 |
+
return_length=False,
|
| 54 |
+
return_overflowing_tokens=False,
|
| 55 |
+
padding="max_length",
|
| 56 |
+
return_tensors="pt", # return pytorch tensors, default return List[int]
|
| 57 |
+
)["input_ids"]
|
| 58 |
+
return tokens
|
| 59 |
+
|
| 60 |
+
def decode(self, t: List[int]) -> str:
|
| 61 |
+
"""
|
| 62 |
+
Decode function. This function will not be called.
|
| 63 |
+
"""
|
| 64 |
+
return self._tokenizer.decode(t)
|
torchtitan/experiments/flux/model/__pycache__/hf_embedder.cpython-312.pyc
ADDED
|
Binary file (1.95 kB). View file
|
|
|
torchtitan/experiments/flux/model/__pycache__/layers.cpython-312.pyc
ADDED
|
Binary file (17.7 kB). View file
|
|
|
torchtitan/experiments/flux/train_configs/debug_model.toml
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
[job]
|
| 3 |
+
dump_folder = "./outputs"
|
| 4 |
+
description = "Flux debug model"
|
| 5 |
+
print_args = false
|
| 6 |
+
use_for_integration_test = true
|
| 7 |
+
|
| 8 |
+
[profiling]
|
| 9 |
+
enable_profiling = false
|
| 10 |
+
save_traces_folder = "profile_trace"
|
| 11 |
+
profile_freq = 10
|
| 12 |
+
enable_memory_snapshot = false
|
| 13 |
+
save_memory_snapshot_folder = "memory_snapshot"
|
| 14 |
+
|
| 15 |
+
[metrics]
|
| 16 |
+
log_freq = 1
|
| 17 |
+
disable_color_printing = false
|
| 18 |
+
enable_tensorboard = false
|
| 19 |
+
save_tb_folder = "tb"
|
| 20 |
+
enable_wandb = false
|
| 21 |
+
|
| 22 |
+
[model]
|
| 23 |
+
name = "flux"
|
| 24 |
+
flavor = "flux-debug"
|
| 25 |
+
norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
|
| 26 |
+
# test tokenizer.model, for debug purpose only
|
| 27 |
+
# tokenizer_path = "./tests/assets/test_tiktoken.model"
|
| 28 |
+
# converters = "float8"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
[optimizer]
|
| 32 |
+
name = "AdamW"
|
| 33 |
+
lr = 8e-4
|
| 34 |
+
eps = 1e-8
|
| 35 |
+
|
| 36 |
+
[lr_scheduler]
|
| 37 |
+
warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
|
| 38 |
+
decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps
|
| 39 |
+
decay_type = "linear"
|
| 40 |
+
lr_min = 0.0
|
| 41 |
+
|
| 42 |
+
[training]
|
| 43 |
+
batch_size = 32
|
| 44 |
+
seq_len = 512
|
| 45 |
+
max_norm = 1.0 # grad norm clipping
|
| 46 |
+
steps = 10
|
| 47 |
+
compile = false
|
| 48 |
+
dataset = "cc12m"
|
| 49 |
+
guidance = 3.5
|
| 50 |
+
seed = 0
|
| 51 |
+
|
| 52 |
+
[encoder]
|
| 53 |
+
t5_encoder="google/t5-v1_1-small"
|
| 54 |
+
clip_encoder="openai/clip-vit-large-patch14"
|
| 55 |
+
max_t5_encoding_len=512
|
| 56 |
+
auto_encoder_path="torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image
|
| 57 |
+
|
| 58 |
+
[parallelism]
|
| 59 |
+
data_parallel_replicate_degree = 1
|
| 60 |
+
data_parallel_shard_degree = 1
|
| 61 |
+
fsdp_reshard_after_forward = "default" # default / never / always
|
| 62 |
+
tensor_parallel_degree = 1
|
| 63 |
+
enable_async_tensor_parallel = false
|
| 64 |
+
pipeline_parallel_degree = 1
|
| 65 |
+
context_parallel_degree = 1
|
| 66 |
+
|
| 67 |
+
[experimental]
|
| 68 |
+
custom_args_module = "torchtitan.experiments.flux.flux_argparser"
|
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_backwards.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# pyre-unsafe
|
| 8 |
+
import logging
|
| 9 |
+
import unittest
|
| 10 |
+
from typing import Tuple
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
|
| 15 |
+
from mg_grouped_gemm import (
|
| 16 |
+
grouped_gemm_backward,
|
| 17 |
+
grouped_gemm_dw_tma,
|
| 18 |
+
grouped_gemm_dx_tma,
|
| 19 |
+
grouped_gemm_forward,
|
| 20 |
+
mg_grouped_gemm,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
from reference_utils import (
|
| 24 |
+
analyze_tensor_differences,
|
| 25 |
+
compute_reference_backward,
|
| 26 |
+
compute_reference_forward,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class TestMG_GroupedGEMM_Backward(unittest.TestCase):
    """Tests for the backward pass of the M*G grouped GEMM kernels.

    Each test runs the custom grouped-GEMM forward/backward and compares the
    outputs and gradients against the pure-PyTorch reference implementations
    from ``reference_utils``.
    """

    def setUp(self) -> None:
        # Fixed seed so the randomly generated inputs are reproducible.
        torch.manual_seed(2020)

    def _run_grouped_gemm_backward_test(
        self,
        shape: Tuple[int, int, int, int],
        device: torch.device,
        dtype: torch.dtype = torch.bfloat16,
        atol: float = 1e-5,
        rtol: float = 1.6e-2,
    ) -> None:
        """Run one forward + backward correctness check.

        Args:
            shape: ``(G, M, N, K)`` — group count, rows per group, output
                features, reduction dimension.
            device: device to run on (these tests use CUDA).
            dtype: dtype of the input and weight tensors.
            atol: absolute tolerance.
            rtol: relative tolerance.

        NOTE(review): ``atol``/``rtol`` are accepted but never forwarded to
        ``analyze_tensor_differences`` — confirm whether that helper applies
        its own tolerances, otherwise these parameters are silently ignored.
        """
        G, M, N, K = shape
        # In M*G grouping, the input is [M*G, K] and the weights are [N, K].
        a = torch.randn(M * G, K, dtype=dtype, device=device, requires_grad=True)
        b = torch.randn(N, K, dtype=dtype, device=device, requires_grad=True)

        # Create equal-sized groups for simplicity.
        m_sizes = torch.full((G,), M, device=device, dtype=torch.int32)

        # Forward pass with our implementation; output is [M*G, N].
        result = grouped_gemm_forward(a, b, m_sizes)
        self.assertTrue(result.shape == (M * G, N))

        # The forward result must match the reference before gradients are
        # worth comparing.
        expected_result = compute_reference_forward(a, b, m_sizes)
        forward_close = analyze_tensor_differences(
            result, expected_result, "Forward output"
        )
        self.assertTrue(forward_close)

        # Backward: compare the custom gradients against the reference.
        grad_output = torch.randn_like(result)
        grad_a, grad_b = grouped_gemm_backward(grad_output, a, b, m_sizes)
        expected_grad_a, expected_grad_b = compute_reference_backward(
            a, b, m_sizes, grad_output
        )

        grad_a_close = analyze_tensor_differences(grad_a, expected_grad_a, "grad_x")
        grad_b_close = analyze_tensor_differences(grad_b, expected_grad_b, "grad_w")

        self.assertTrue(grad_a_close)
        self.assertTrue(grad_b_close)

    def test_MG_grouped_gemm_backward_bf16(self) -> None:
        """Backward pass in bf16 across a sweep of group counts and sizes."""
        for G in (1, 8, 16):
            for M in (512, 1024):
                print(f"Testing BF16 M*G GroupGeMM Backward with G={G}, M={M}")
                self._run_grouped_gemm_backward_test(
                    (G, M, 1024, 1024),
                    torch.device("cuda"),
                    # BUG FIX: this test is named and logged as BF16 but
                    # previously ran with torch.float16; use bfloat16 so the
                    # dtype matches the test's stated intent.
                    dtype=torch.bfloat16,
                    atol=1e-2,
                    rtol=1e-2,
                )

    def test_MG_grouped_gemm_backward_deepseek_shapes(self) -> None:
        """Test backward pass with shapes from Deepseek model."""
        deepseek_shapes = [
            (4, 2048, 4096, 7168),  # G, M, N, K
            (4, 2048, 7168, 2048),
            (8, 512, 4096, 7168),
            (8, 512, 7168, 2048),
        ]

        device = torch.device("cuda")

        for shape in deepseek_shapes:
            G, M, N, K = shape
            print(
                f"Testing BF16 M*G Deepseek Backward shape: G={G}, M={M}, N={N}, K={K}"
            )
            # BUG FIX: the log line above says BF16, but the run previously
            # used torch.float16; use bfloat16 to match.
            self._run_grouped_gemm_backward_test(
                shape, device, dtype=torch.bfloat16, atol=1e-2, rtol=1e-2
            )

    def test_MG_dx(self) -> None:
        """Test specifically the dx (gradient w.r.t. input) computation."""
        G, M, N, K = 4, 512, 1024, 2048
        device = torch.device("cuda")
        dtype = torch.bfloat16

        # Set up inputs: [M*G, K] activations, [N, K] weights.
        a = torch.randn(M * G, K, dtype=dtype, device=device, requires_grad=True)
        b = torch.randn(N, K, dtype=dtype, device=device, requires_grad=True)

        # Create equal-sized groups.
        m_sizes = torch.full((G,), M, device=device, dtype=torch.int32)

        # Forward pass, then a random upstream gradient.
        result = grouped_gemm_forward(a, b, m_sizes)
        grad_output = torch.randn_like(result)

        # Only the input gradient is checked here.
        grad_a, _ = grouped_gemm_backward(grad_output, a, b, m_sizes)
        expected_grad_a, _ = compute_reference_backward(a, b, m_sizes, grad_output)

        dx_close = analyze_tensor_differences(grad_a, expected_grad_a, "grad_a (dx)")
        self.assertTrue(dx_close)

    def test_MG_dw(self) -> None:
        """Test specifically the dw (gradient w.r.t. weights) computation."""
        G, M, N, K = 4, 512, 1024, 2048
        device = torch.device("cuda")
        dtype = torch.bfloat16

        # Set up inputs: [M*G, K] activations, [N, K] weights.
        a = torch.randn(M * G, K, dtype=dtype, device=device, requires_grad=True)
        b = torch.randn(N, K, dtype=dtype, device=device, requires_grad=True)

        # Create equal-sized groups.
        m_sizes = torch.full((G,), M, device=device, dtype=torch.int32)

        # Forward pass, then a random upstream gradient.
        result = grouped_gemm_forward(a, b, m_sizes)
        grad_output = torch.randn_like(result)

        # Only the weight gradient is checked here.
        _, grad_b = grouped_gemm_backward(grad_output, a, b, m_sizes)
        _, expected_grad_b = compute_reference_backward(a, b, m_sizes, grad_output)

        dw_close = analyze_tensor_differences(grad_b, expected_grad_b, "grad_b (dw)")
        self.assertTrue(dw_close)
|
torchtitan/experiments/llama4/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (1.66 kB). View file
|
|
|
torchtitan/experiments/llama4/model/__pycache__/moe.cpython-312.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
torchtitan/experiments/llama4/model/args.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from torch import nn
|
| 12 |
+
from torchtitan.components.tokenizer import Tokenizer
|
| 13 |
+
from torchtitan.config_manager import JobConfig
|
| 14 |
+
|
| 15 |
+
from torchtitan.protocols.train_spec import BaseModelArgs
|
| 16 |
+
from torchtitan.tools.logging import logger
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
class TransformerModelArgs(BaseModelArgs):
    """Model hyperparameters for the llama4 experiment, including MoE knobs."""

    dim: int = 4096
    n_layers: int = 32
    n_heads: int = 32
    n_kv_heads: Optional[int] = None
    vocab_size: int = -1  # defined later by tokenizer
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    ffn_dim_multiplier: Optional[float] = None
    norm_eps: float = 1e-5
    rope_theta: float = 10000

    max_seq_len: int = 2048
    # If `True`, then each transformer block init uses its layer ID, and if
    # `False`, each uses the total number of transformer blocks
    depth_init: bool = True
    norm_type: str = "rmsnorm"

    use_flex_attn: bool = False
    attn_mask_type: str = "causal"
    eos_id: int = 0

    # MoE args
    moe_enabled: bool = True
    num_experts: int = 8
    use_shared_expert: bool = True
    auto_scale_hidden_dim: bool = True
    # frequency of using MoE layer instead of feedforward layer in a transformer block
    interleave_moe_layer_step: int = 2
    # token-choice
    top_k: int = 1

    def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
        """Overwrite fields that depend on the runtime job config / tokenizer."""
        model_cfg = job_config.model
        self.norm_type = model_cfg.norm_type
        self.use_flex_attn = model_cfg.use_flex_attn
        self.vocab_size = tokenizer.n_words
        self.max_seq_len = job_config.training.seq_len

    def get_nparams_and_flops(
        self, model: nn.Module, seq_len: int
    ) -> tuple[int, float]:
        """Count parameters and estimate FLOPs per token (for MFU logging).

        Returns a tuple of (total parameter count, FLOPs per token).
        """
        nparams_embedding = 0
        nparams_moe_router = 0
        nparams_shared_expert = 0
        nparams_experts = 0
        nparams_dense = 0

        # Bucket every parameter by its role; embeddings are counted as
        # dense but also tracked separately so the FLOP estimate can
        # exclude them.
        for name, p in model.named_parameters():
            numel = p.numel()
            if "embedding" in name:
                nparams_embedding += numel
                nparams_dense += numel
            elif "moe.shared_expert" in name:
                nparams_shared_expert += numel
            elif "moe.router" in name:
                nparams_moe_router += numel
            elif "moe.experts" in name:
                nparams_experts += numel
            else:
                nparams_dense += numel

        nparams_sparse = nparams_moe_router + nparams_shared_expert + nparams_experts
        nparams = nparams_dense + nparams_sparse
        # Per token, only top_k of num_experts expert sets are active.
        nparams_sparse_active = (
            nparams_moe_router
            + nparams_shared_expert
            + nparams_experts * self.top_k // self.num_experts
        )

        logger.info(
            f"Total parameter count: dense {nparams_dense:,}, "
            f"sparse {nparams_sparse:,}, active {nparams_dense + nparams_sparse_active:,}"
        )

        layers = self.n_layers
        heads = self.n_heads
        head_dim = self.dim // self.n_heads
        tokens = seq_len
        # Reasoning behind the factor of 12 for the self-attention part:
        # 1. each self-attention has 2 matmuls in the forward and 4 in the
        #    backward (6)
        # 2. flash attention does 1 extra matmul recomputation in the
        #    backward, but recomputation is not counted toward MFU (+0)
        # 3. each matmul performs 1 multiply and 1 add (*2)
        # 4. by convention, sparsity of causal attention is not accounted for
        num_flops_per_token = (
            6 * (nparams_dense - nparams_embedding + nparams_sparse_active)
            + 12 * layers * heads * head_dim * tokens
        )

        return nparams, num_flops_per_token
|
torchtitan/experiments/multimodal/__init__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# BUG FIX: this module is a package __init__, so the sibling dataloader
# module must be imported relatively (matching `from .model import ...`
# below). A bare `from mm_dataset import ...` only resolves if the package
# directory itself happens to be on sys.path.
from .mm_dataset import build_mm_dataloader

from torchtitan.components.loss import build_cross_entropy_loss
from torchtitan.components.lr_scheduler import build_lr_schedulers
from torchtitan.components.optimizer import build_optimizers
from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer
from torchtitan.models.llama3 import parallelize_llama, pipeline_llama
from torchtitan.protocols.train_spec import register_train_spec, TrainSpec

from .model import ModelArgs, MultimodalDecoder, VisionEncoder

__all__ = ["VisionEncoder", "ModelArgs", "MultimodalDecoder"]

llama4_mm_configs = {
    # TODO: add configs for llama4 multimodal
}

# Register the llama4 multimodal train spec; parallelize/pipeline functions
# are reused from the llama3 model.
register_train_spec(
    TrainSpec(
        name="llama4_multimodal",
        cls=MultimodalDecoder,
        config=llama4_mm_configs,
        parallelize_fn=parallelize_llama,
        pipelining_fn=pipeline_llama,
        build_optimizers_fn=build_optimizers,
        build_lr_schedulers_fn=build_lr_schedulers,
        build_dataloader_fn=build_mm_dataloader,
        build_tokenizer_fn=build_tiktoken_tokenizer,
        build_loss_fn=build_cross_entropy_loss,
    )
)
|
torchtitan/experiments/multimodal/tests/test_multimodal_model.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
from torchtitan.experiments.llama_multimodal import (
|
| 11 |
+
ModelArgs,
|
| 12 |
+
MultimodalDecoder,
|
| 13 |
+
VisionEncoder,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
from .test_utils import fixed_init_model, fixed_init_tensor
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@pytest.fixture
def encoder_config():
    """Deliberately tiny VisionEncoder config so the test runs fast."""
    settings = dict(
        encoder_embed_dim=32,
        encoder_num_layers=2,
        encoder_num_heads=4,
        tile_size=49,
        patch_size=9,
        max_num_tiles=4,
        in_channels=3,
        return_intermediates=[0, 1],
        num_layers_projection=2,
        decoder_embed_dim=128,
    )
    return ModelArgs(**settings)
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@pytest.fixture
def decoder_config():
    """Deliberately small MultimodalDecoder config so the test runs fast."""
    settings = dict(
        decoder_embed_dim=512,
        vocab_size=10000,
        fusion_interval=2,
        num_special_tokens=3,
        decoder_num_layers=6,
        decoder_num_heads=8,
        decoder_num_kv_heads=4,
        max_seq_len=512,
        rope_theta=50000.0,
    )
    return ModelArgs(**settings)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class TestMultimodalModelVisionEncoder:
    """Shape smoke test for the multimodal VisionEncoder."""

    @pytest.fixture(autouse=True)
    def setup_class(self, encoder_config):
        args = encoder_config
        self.model_args = args
        self.batch_size = 1
        self.num_imgs = 2
        self.num_tiles = 4
        # Two images with different tile layouts: 1x3 and 2x2.
        self.aspect_ratio = torch.tensor([[1, 3], [2, 2]]).reshape(
            self.batch_size, self.num_imgs, 2
        )
        img_shape = (
            self.batch_size,
            self.num_imgs,
            self.num_tiles,
            args.in_channels,
            args.tile_size,
            args.tile_size,
        )
        # Deterministic fixed-initialized input in [-1, 1].
        self.image = fixed_init_tensor(torch.Size(img_shape), min_val=-1, max_val=1)

    def test_llama_mm_vision_encoder(self):
        encoder = VisionEncoder(self.model_args)
        fixed_init_model(encoder, min_val=-1, max_val=1)
        actual = encoder(self.image, self.aspect_ratio)
        # Each tile yields patches_per_tile patch tokens plus one CLS token.
        tokens_per_tile = encoder.vit.patches_per_tile + 1
        expected_shape = (
            self.batch_size,
            self.num_imgs * self.num_tiles * tokens_per_tile,
            self.model_args.decoder_embed_dim,
        )
        assert (
            actual.shape == expected_shape
        ), f"Expected shape {expected_shape}, but got {actual.shape}"

        # TODO: enable a numerical check once stability is understood.
        # Observed output.mean() = 3.994, which does not match 5.28800,
        # the value from the original torchtune test:
        # assert torch.allclose(
        #     output.mean(), torch.tensor(5.28800), atol=1e-3, rtol=1e-3
        # )
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class TestMultimodalModelDecoder:
    """Shape smoke test for the MultimodalDecoder."""

    @pytest.fixture(autouse=True)
    def setup_class(self, decoder_config):
        args = decoder_config
        self.model_args = args
        self.batch_size = 1
        self.seq_len = 128
        self.decoder_embed_dim = args.decoder_embed_dim
        self.vocab_size = args.vocab_size
        token_ids = torch.arange(self.batch_size * self.seq_len)
        self.input = {
            "tokens": token_ids.reshape(self.batch_size, self.seq_len),
            # Deterministic fixed-initialized encoder output in [-1, 1].
            "encoder_input": fixed_init_tensor(
                (self.batch_size, self.seq_len, self.decoder_embed_dim),
                min_val=-1,
                max_val=1,
            ),
            "encoder_mask": None,
        }

    @torch.no_grad()
    def test_llama_mm_decoder(self):
        decoder = MultimodalDecoder(self.model_args)
        fixed_init_model(decoder, min_val=-1, max_val=1)
        logits = decoder(**self.input)
        expected_shape = (self.batch_size, self.seq_len, self.vocab_size)
        assert (
            logits.shape == expected_shape
        ), f"Expected shape {expected_shape}, but got {logits.shape}"

        # TODO: enable a numerical check once stability is understood.
        # Observed output.mean() = -0.0134, which does not match -9.47548e-5,
        # the value from the original torchtune test:
        # assert torch.allclose(
        #     output.mean(), torch.tensor(-9.47548e-5), atol=1e-3, rtol=1e-3
        # )
|
torchtitan/experiments/simple_fsdp/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (1.11 kB). View file
|
|
|
torchtitan/models/__pycache__/norms.cpython-312.pyc
ADDED
|
Binary file (1.39 kB). View file
|
|
|