zaydzuhri commited on
Commit
22487a1
·
verified ·
1 Parent(s): 271d75e

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. fla/models/delta_net/__pycache__/__init__.cpython-312.pyc +0 -0
  2. fla/models/delta_net/__pycache__/configuration_delta_net.cpython-312.pyc +0 -0
  3. fla/models/forgetting_transformer/__pycache__/configuration_forgetting_transformer.cpython-312.pyc +0 -0
  4. fla/models/gla/__pycache__/modeling_gla.cpython-312.pyc +0 -0
  5. fla/models/gla/configuration_gla.py +95 -0
  6. fla/models/linear_attn/__pycache__/modeling_linear_attn.cpython-312.pyc +0 -0
  7. fla/models/retnet/__pycache__/configuration_retnet.cpython-312.pyc +0 -0
  8. fla/models/rwkv6/__pycache__/__init__.cpython-312.pyc +0 -0
  9. fla/models/rwkv7/__pycache__/__init__.cpython-312.pyc +0 -0
  10. fla/models/samba/__pycache__/configuration_samba.cpython-312.pyc +0 -0
  11. fla/models/samba/__pycache__/modeling_samba.cpython-312.pyc +0 -0
  12. fla/models/transformer/__pycache__/__init__.cpython-312.pyc +0 -0
  13. fla/models/transformer_top/__pycache__/__init__.cpython-312.pyc +0 -0
  14. fla/models/transformer_top/__pycache__/configuration_transformer.cpython-312.pyc +0 -0
  15. fla/modules/__pycache__/activations.cpython-312.pyc +0 -0
  16. fla/modules/__pycache__/convolution.cpython-312.pyc +0 -0
  17. fla/modules/__pycache__/fused_bitlinear.cpython-312.pyc +0 -0
  18. fla/modules/__pycache__/mlp.cpython-312.pyc +0 -0
  19. fla/modules/__pycache__/rotary.cpython-312.pyc +0 -0
  20. tb/20260101-0922/wandb/run-20260101_092219--dsmtp.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202601010919/files/requirements.txt +169 -0
  21. tb/20260101-0922/wandb/run-20260101_092219--dsmtp.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202601010919/files/wandb-metadata.json +146 -0
  22. torchtitan/experiments/deepseek_v3/attn_mask_utils.py +397 -0
  23. torchtitan/experiments/deepseek_v3/inference.sh +15 -0
  24. torchtitan/experiments/deepseek_v3/model.py +1325 -0
  25. torchtitan/experiments/deepseek_v3/symm_mem_recipes/__init__.py +11 -0
  26. torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_barrier.py +159 -0
  27. torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_on_device_all_to_all_v.py +260 -0
  28. torchtitan/experiments/deepseek_v3/train.py +142 -0
  29. torchtitan/experiments/flux/README.md +23 -0
  30. torchtitan/experiments/flux/__init__.py +122 -0
  31. torchtitan/experiments/flux/flux_argparser.py +42 -0
  32. torchtitan/experiments/flux/loss.py +27 -0
  33. torchtitan/experiments/flux/parallelize_flux.py +26 -0
  34. torchtitan/experiments/flux/requirements.txt +2 -0
  35. torchtitan/experiments/flux/scripts/download_autoencoder.py +61 -0
  36. torchtitan/experiments/flux/tests/test_flux_dataloader.py +103 -0
  37. torchtitan/experiments/flux/train.py +224 -0
  38. torchtitan/experiments/flux/train_configs/debug_model.toml +68 -0
  39. torchtitan/experiments/flux/utils.py +203 -0
  40. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/__init__.py +13 -0
  41. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/fast_debug_ao.py +299 -0
  42. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/tma_autotuning.py +240 -0
  43. torchtitan/experiments/llama4/README.md +29 -0
  44. torchtitan/experiments/llama4/__pycache__/__init__.cpython-312.pyc +0 -0
  45. torchtitan/experiments/llama4/infra/__pycache__/parallelize_llama.cpython-312.pyc +0 -0
  46. torchtitan/experiments/llama4/infra/expert_parallel.py +145 -0
  47. torchtitan/experiments/llama4/infra/parallelize_llama.py +159 -0
  48. torchtitan/experiments/llama4/model/__pycache__/args.cpython-312.pyc +0 -0
  49. torchtitan/experiments/llama4/model/__pycache__/model.cpython-312.pyc +0 -0
  50. torchtitan/experiments/llama4/model/__pycache__/moe.cpython-312.pyc +0 -0
fla/models/delta_net/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (729 Bytes). View file
 
fla/models/delta_net/__pycache__/configuration_delta_net.cpython-312.pyc ADDED
Binary file (3.62 kB). View file
 
fla/models/forgetting_transformer/__pycache__/configuration_forgetting_transformer.cpython-312.pyc ADDED
Binary file (2.52 kB). View file
 
fla/models/gla/__pycache__/modeling_gla.cpython-312.pyc ADDED
Binary file (18.6 kB). View file
 
fla/models/gla/configuration_gla.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Dict, Optional
4
+
5
+ from transformers.configuration_utils import PretrainedConfig
6
+
7
+
8
class GLAConfig(PretrainedConfig):
    """
    Configuration class for GLA (Gated Linear Attention) models.

    Notable parameters:
        hidden_size: model width (embedding dimension).
        expand_k / expand_v: expansion ratios for the key / value projections;
            fractional values are allowed (e.g. the default ``expand_k=0.5``),
            hence the ``float`` annotation.
        attn: optional dict configuring hybrid (softmax) attention layers.
            Must contain ``'layers'`` (indices of the hybrid layers) and
            ``'num_heads'``; missing optional keys are filled with defaults.
        **kwargs: forwarded to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: if ``attn`` is provided but is not a dict or is missing
            the required ``'layers'`` / ``'num_heads'`` keys.
    """

    model_type = 'gla'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        hidden_size: int = 2048,
        # NOTE: annotated as float — the default (0.5) and typical usage are
        # fractional expansion ratios, the original `int` annotation was wrong.
        expand_k: float = 0.5,
        expand_v: float = 1,
        hidden_ratio: Optional[int] = 4,
        intermediate_size: Optional[int] = None,
        num_hidden_layers: int = 24,
        num_heads: int = 4,
        num_kv_heads: Optional[int] = None,
        feature_map: Optional[str] = None,
        attn_mode: str = "chunk",
        use_short_conv: bool = False,
        conv_size: int = 4,
        use_output_gate: bool = True,
        clamp_min: Optional[float] = None,
        hidden_act: str = "swish",
        max_position_embeddings: int = 2048,
        elementwise_affine: Optional[bool] = True,
        norm_eps: float = 1e-6,
        use_gk: bool = True,
        use_gv: bool = False,
        attn: Optional[Dict] = None,
        use_cache: bool = True,
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        initializer_range: float = 0.006,
        fuse_norm: bool = True,
        fuse_swiglu: bool = True,
        fuse_cross_entropy: bool = True,
        vocab_size: int = 32000,
        **kwargs
    ):
        self.hidden_size = hidden_size
        self.expand_k = expand_k
        self.expand_v = expand_v
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.feature_map = feature_map
        self.attn_mode = attn_mode
        self.use_short_conv = use_short_conv
        self.conv_size = conv_size
        self.use_output_gate = use_output_gate
        self.clamp_min = clamp_min
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.elementwise_affine = elementwise_affine
        self.norm_eps = norm_eps
        self.use_gk = use_gk
        self.use_gv = use_gv
        self.attn = attn
        self.use_cache = use_cache
        self.initializer_range = initializer_range

        self.fuse_norm = fuse_norm
        self.fuse_swiglu = fuse_swiglu
        self.fuse_cross_entropy = fuse_cross_entropy
        self.vocab_size = vocab_size

        if attn is not None:
            # Validate required hybrid-attention keys before filling defaults.
            # Use the builtin `dict` for the check (typing.Dict is deprecated
            # as an isinstance target).
            if not isinstance(attn, dict):
                raise ValueError("attn must be a dictionary")
            if 'layers' not in attn:
                raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
            if 'num_heads' not in attn:
                raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
            # Fill optional settings with defaults (MHA by default, no sliding
            # window, standard RoPE base).
            attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads'])
            attn['qkv_bias'] = attn.get('qkv_bias', False)
            attn['window_size'] = attn.get('window_size', None)
            attn['rope_theta'] = attn.get('rope_theta', 10000.)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
fla/models/linear_attn/__pycache__/modeling_linear_attn.cpython-312.pyc ADDED
Binary file (18.5 kB). View file
 
fla/models/retnet/__pycache__/configuration_retnet.cpython-312.pyc ADDED
Binary file (3.76 kB). View file
 
fla/models/rwkv6/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (715 Bytes). View file
 
fla/models/rwkv7/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (715 Bytes). View file
 
fla/models/samba/__pycache__/configuration_samba.cpython-312.pyc ADDED
Binary file (3.42 kB). View file
 
fla/models/samba/__pycache__/modeling_samba.cpython-312.pyc ADDED
Binary file (20.9 kB). View file
 
fla/models/transformer/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (756 Bytes). View file
 
fla/models/transformer_top/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (777 Bytes). View file
 
fla/models/transformer_top/__pycache__/configuration_transformer.cpython-312.pyc ADDED
Binary file (2.83 kB). View file
 
fla/modules/__pycache__/activations.cpython-312.pyc ADDED
Binary file (23 kB). View file
 
fla/modules/__pycache__/convolution.cpython-312.pyc ADDED
Binary file (21.1 kB). View file
 
fla/modules/__pycache__/fused_bitlinear.cpython-312.pyc ADDED
Binary file (23.7 kB). View file
 
fla/modules/__pycache__/mlp.cpython-312.pyc ADDED
Binary file (6.26 kB). View file
 
fla/modules/__pycache__/rotary.cpython-312.pyc ADDED
Binary file (23.2 kB). View file
 
tb/20260101-0922/wandb/run-20260101_092219--dsmtp.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202601010919/files/requirements.txt ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ flame==0.1.0
2
+ fsspec==2025.10.0
3
+ aiohappyeyeballs==2.6.1
4
+ ipykernel==7.1.0
5
+ smmap==5.0.2
6
+ pybind11==3.0.1
7
+ tabulate==0.9.0
8
+ parso==0.8.5
9
+ yarl==1.22.0
10
+ asttokens==3.0.1
11
+ pandas==2.3.3
12
+ xxhash==3.6.0
13
+ pathvalidate==3.3.1
14
+ Werkzeug==3.1.4
15
+ regex==2025.11.3
16
+ inquirerpy==0.3.4
17
+ click==8.3.1
18
+ idna==3.11
19
+ pydantic==2.12.5
20
+ pexpect==4.9.0
21
+ typepy==1.3.4
22
+ certifi==2025.11.12
23
+ wcwidth==0.2.14
24
+ triton==3.2.0
25
+ hf-xet==1.2.0
26
+ joblib==1.5.3
27
+ tqdm==4.67.1
28
+ nvidia-nvtx-cu12==12.4.127
29
+ setuptools==80.9.0
30
+ lxml==6.0.2
31
+ nvidia-cufft-cu12==11.2.1.3
32
+ evaluate==0.4.6
33
+ Markdown==3.10
34
+ chardet==5.2.0
35
+ multiprocess==0.70.18
36
+ tensorboard==2.20.0
37
+ nvidia-nvjitlink-cu12==12.4.127
38
+ flame==0.1.0
39
+ matplotlib-inline==0.2.1
40
+ Cython==3.2.3
41
+ tensorboard-data-server==0.7.2
42
+ nvidia-cusparse-cu12==12.3.1.170
43
+ lm_eval==0.4.9.1
44
+ pure_eval==0.2.3
45
+ protobuf==6.33.2
46
+ DataProperty==1.1.0
47
+ nvidia-cudnn-cu12==9.1.0.70
48
+ accelerate==1.12.0
49
+ psutil==7.1.3
50
+ Jinja2==3.1.6
51
+ scikit-learn==1.8.0
52
+ nvidia-nccl-cu12==2.21.5
53
+ typing_extensions==4.15.0
54
+ pyzmq==27.1.0
55
+ mpmath==1.3.0
56
+ annotated-types==0.7.0
57
+ propcache==0.4.1
58
+ wandb==0.23.1
59
+ requests==2.32.5
60
+ ipython==9.8.0
61
+ more-itertools==10.8.0
62
+ nvidia-cuda-runtime-cu12==12.4.127
63
+ sacrebleu==2.5.1
64
+ httpx==0.28.1
65
+ huggingface-hub==0.36.0
66
+ MarkupSafe==3.0.3
67
+ nvidia-cusolver-cu12==11.6.1.9
68
+ gitdb==4.0.12
69
+ torchdata==0.11.0
70
+ sentry-sdk==2.48.0
71
+ sympy==1.13.1
72
+ safetensors==0.7.0
73
+ httpcore==1.0.9
74
+ portalocker==3.2.0
75
+ attrs==25.4.0
76
+ typing-inspection==0.4.2
77
+ ptyprocess==0.7.0
78
+ nvidia-cublas-cu12==12.4.5.8
79
+ numexpr==2.14.1
80
+ executing==2.2.1
81
+ networkx==3.6.1
82
+ threadpoolctl==3.6.0
83
+ nvidia-cusparselt-cu12==0.6.2
84
+ einops==0.8.1
85
+ zstandard==0.25.0
86
+ comm==0.2.3
87
+ six==1.17.0
88
+ packaging==25.0
89
+ tqdm-multiprocess==0.0.11
90
+ numpy==2.3.5
91
+ colorama==0.4.6
92
+ nvidia-cuda-cupti-cu12==12.4.127
93
+ jupyter_client==8.7.0
94
+ scipy==1.16.3
95
+ tornado==6.5.4
96
+ nltk==3.9.2
97
+ antlr4-python3-runtime==4.11.0
98
+ jupyter_core==5.9.1
99
+ sqlitedict==2.1.0
100
+ tzdata==2025.3
101
+ pytz==2025.2
102
+ Pygments==2.19.2
103
+ python-dotenv==1.2.1
104
+ cmake==4.2.0
105
+ tiktoken==0.12.0
106
+ PyYAML==6.0.3
107
+ datasets==4.4.1
108
+ pillow==12.0.0
109
+ math-verify==0.8.0
110
+ dill==0.4.0
111
+ nvidia-cuda-nvrtc-cu12==12.4.127
112
+ anyio==4.12.0
113
+ prompt_toolkit==3.0.52
114
+ filelock==3.20.1
115
+ jedi==0.19.2
116
+ frozenlist==1.8.0
117
+ tokenizers==0.21.4
118
+ grpcio==1.76.0
119
+ ninja==1.13.0
120
+ mbstrdecoder==1.1.4
121
+ flash-attn==2.7.3
122
+ aiosignal==1.4.0
123
+ tabledata==1.3.4
124
+ h11==0.16.0
125
+ absl-py==2.3.1
126
+ latex2sympy2_extended==1.10.2
127
+ torch==2.6.0
128
+ nest_asyncio==1.6.0
129
+ pip==25.3
130
+ aiohttp==3.13.2
131
+ pfzy==0.3.4
132
+ platformdirs==4.5.1
133
+ wheel==0.45.1
134
+ peft==0.17.0
135
+ debugpy==1.8.19
136
+ ipython_pygments_lexers==1.1.1
137
+ rouge_score==0.1.2
138
+ multidict==6.7.0
139
+ tcolorpy==0.1.7
140
+ nvidia-curand-cu12==10.3.5.147
141
+ pydantic_core==2.41.5
142
+ pytablewriter==1.2.1
143
+ charset-normalizer==3.4.4
144
+ transformers==4.51.3
145
+ word2number==1.1
146
+ jsonlines==4.0.0
147
+ stack_data==0.6.3
148
+ urllib3==2.6.2
149
+ decorator==5.2.1
150
+ python-dateutil==2.9.0.post0
151
+ pyarrow==22.0.0
152
+ traitlets==5.14.3
153
+ GitPython==3.1.45
154
+ tomli==2.0.1
155
+ more-itertools==10.3.0
156
+ inflect==7.3.1
157
+ zipp==3.19.2
158
+ jaraco.functools==4.0.1
159
+ autocommand==2.2.2
160
+ jaraco.collections==5.1.0
161
+ platformdirs==4.2.2
162
+ backports.tarfile==1.2.0
163
+ importlib_metadata==8.0.0
164
+ jaraco.text==3.12.1
165
+ typing_extensions==4.12.2
166
+ jaraco.context==5.3.0
167
+ typeguard==4.3.0
168
+ packaging==24.2
169
+ wheel==0.45.1
tb/20260101-0922/wandb/run-20260101_092219--dsmtp.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine-202601010919/files/wandb-metadata.json ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-85-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.12",
4
+ "startedAt": "2026-01-01T09:22:19.321743Z",
5
+ "args": [
6
+ "--job.config_file",
7
+ "flame/models/fla.toml",
8
+ "--job.dump_folder",
9
+ "exp/dsmtp.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine",
10
+ "--model.config",
11
+ "configs/dsmtp_transformer_7B.json",
12
+ "--model.tokenizer_path",
13
+ "fla-hub/transformer-1.3B-100B",
14
+ "--optimizer.name",
15
+ "AdamW",
16
+ "--optimizer.eps",
17
+ "1e-15",
18
+ "--optimizer.lr",
19
+ "2e-5",
20
+ "--lr_scheduler.warmup_steps",
21
+ "400",
22
+ "--lr_scheduler.lr_min",
23
+ "0.1",
24
+ "--lr_scheduler.decay_type",
25
+ "cosine",
26
+ "--training.batch_size",
27
+ "8",
28
+ "--training.seq_len",
29
+ "4096",
30
+ "--training.context_len",
31
+ "4096",
32
+ "--training.gradient_accumulation_steps",
33
+ "2",
34
+ "--training.steps",
35
+ "40000",
36
+ "--training.max_norm",
37
+ "1.0",
38
+ "--training.skip_nan_inf",
39
+ "--training.dataset",
40
+ "/root/.cache/zaydzuhri___stack-edu-python/default",
41
+ "--training.dataset_split",
42
+ "train",
43
+ "--training.num_workers",
44
+ "32",
45
+ "--training.prefetch_factor",
46
+ "2",
47
+ "--training.seed",
48
+ "79",
49
+ "--training.compile",
50
+ "--checkpoint.interval",
51
+ "8000",
52
+ "--checkpoint.load_step",
53
+ "-1",
54
+ "--metrics.log_freq",
55
+ "5",
56
+ "--checkpoint.hf_upload_enabled",
57
+ "--checkpoint.hf_repo_base_name",
58
+ "zaydzuhri/dsmtp-code-7B-4096-batch8x2-steps40000",
59
+ "--comm.init_timeout_seconds",
60
+ "6000",
61
+ "--comm.train_timeout_seconds",
62
+ "6000"
63
+ ],
64
+ "program": "-m flame.train",
65
+ "git": {
66
+ "remote": "https://github.com/zaydzuhri/flame.git",
67
+ "commit": "5bcd6b6423606e07b92dd2644ecc24d908d2c7a4"
68
+ },
69
+ "email": "zaydzuhri@gmail.com",
70
+ "root": "exp/dsmtp.code.7B.batch8.seqlen4096.context4096.warmup400.update2.steps40000.lr2e-5.cosine/tb/20260101-0922",
71
+ "host": "rentals-6z3zwezo0sfapf3y-697b4fc787-gh86h",
72
+ "executable": "/root/miniconda3/envs/flame-env/bin/python3.12",
73
+ "cpu_count": 64,
74
+ "cpu_count_logical": 128,
75
+ "gpu": "NVIDIA H200",
76
+ "gpu_count": 8,
77
+ "disk": {
78
+ "/": {
79
+ "total": "3246163542016",
80
+ "used": "1652645769216"
81
+ }
82
+ },
83
+ "memory": {
84
+ "total": "1913835118592"
85
+ },
86
+ "gpu_nvidia": [
87
+ {
88
+ "name": "NVIDIA H200",
89
+ "memoryTotal": "150754820096",
90
+ "cudaCores": 16896,
91
+ "architecture": "Hopper",
92
+ "uuid": "GPU-bf7aa6f4-2ee0-0ff7-3851-1b40dafcde1f"
93
+ },
94
+ {
95
+ "name": "NVIDIA H200",
96
+ "memoryTotal": "150754820096",
97
+ "cudaCores": 16896,
98
+ "architecture": "Hopper",
99
+ "uuid": "GPU-24e3a14c-3196-7560-5e54-cd031aa25f76"
100
+ },
101
+ {
102
+ "name": "NVIDIA H200",
103
+ "memoryTotal": "150754820096",
104
+ "cudaCores": 16896,
105
+ "architecture": "Hopper",
106
+ "uuid": "GPU-3e484efe-97e7-7b5b-e6b7-d1dc17ed2765"
107
+ },
108
+ {
109
+ "name": "NVIDIA H200",
110
+ "memoryTotal": "150754820096",
111
+ "cudaCores": 16896,
112
+ "architecture": "Hopper",
113
+ "uuid": "GPU-7b9f4a41-11cd-03b7-0065-5e4dab09ddd4"
114
+ },
115
+ {
116
+ "name": "NVIDIA H200",
117
+ "memoryTotal": "150754820096",
118
+ "cudaCores": 16896,
119
+ "architecture": "Hopper",
120
+ "uuid": "GPU-3f34c938-ca85-e68c-f501-59e20f64e14c"
121
+ },
122
+ {
123
+ "name": "NVIDIA H200",
124
+ "memoryTotal": "150754820096",
125
+ "cudaCores": 16896,
126
+ "architecture": "Hopper",
127
+ "uuid": "GPU-9d38c94f-6fc0-6735-7f0d-3e359e2562cb"
128
+ },
129
+ {
130
+ "name": "NVIDIA H200",
131
+ "memoryTotal": "150754820096",
132
+ "cudaCores": 16896,
133
+ "architecture": "Hopper",
134
+ "uuid": "GPU-a7fc49a5-17e0-6e8f-feee-4ffa6e526637"
135
+ },
136
+ {
137
+ "name": "NVIDIA H200",
138
+ "memoryTotal": "150754820096",
139
+ "cudaCores": 16896,
140
+ "architecture": "Hopper",
141
+ "uuid": "GPU-731d225c-6930-54d4-db0f-f53e16eaeb2e"
142
+ }
143
+ ],
144
+ "cudaVersion": "12.8",
145
+ "writerId": "allj4tgslt7j35odul2j6cl35jpfxh7k"
146
+ }
torchtitan/experiments/deepseek_v3/attn_mask_utils.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This code is based on src/transformers/modeling_attn_mask_utils.py of
8
+ # huggingface/transformers. It has been modified from its original forms to
9
+ # contain only the necessary utilities.
10
+
11
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
12
+ #
13
+ # Licensed under the Apache License, Version 2.0 (the "License");
14
+ # you may not use this file except in compliance with the License.
15
+ # You may obtain a copy of the License at
16
+ #
17
+ # http://www.apache.org/licenses/LICENSE-2.0
18
+ #
19
+ # Unless required by applicable law or agreed to in writing, software
20
+ # distributed under the License is distributed on an "AS IS" BASIS,
21
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
+ # See the License for the specific language governing permissions and
23
+ # limitations under the License.
24
+ from dataclasses import dataclass
25
+ from typing import List, Optional, Tuple, Union
26
+
27
+ import torch
28
+
29
+
30
+ @dataclass
31
+ class AttentionMaskConverter:
32
+ """
33
+ A utility attention mask class that allows one to:
34
+ - Create a causal 4d mask
35
+ - Create a causal 4d mask with slided window
36
+ - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
37
+ key_value_length) that can be multiplied with attention scores
38
+
39
+ Examples:
40
+
41
+ ```python
42
+ >>> import torch
43
+ >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter
44
+
45
+ >>> converter = AttentionMaskConverter(True)
46
+ >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
47
+ tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
48
+ [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
49
+ [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
50
+ [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, -3.4028e+38],
51
+ [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, 0.0000e+00]]]])
52
+ ```
53
+
54
+ Parameters:
55
+ is_causal (`bool`):
56
+ Whether the attention mask should be a uni-directional (causal) or bi-directional mask.
57
+
58
+ sliding_window (`int`, *optional*):
59
+ Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer.
60
+ """
61
+
62
+ is_causal: bool
63
+ sliding_window: int
64
+
65
+ def __init__(self, is_causal: bool, sliding_window: Optional[int] = None):
66
+ self.is_causal = is_causal
67
+ self.sliding_window = sliding_window
68
+
69
+ if self.sliding_window is not None and self.sliding_window <= 0:
70
+ raise ValueError(
71
+ "Make sure that when passing `sliding_window` that its value is a strictly positive integer, "
72
+ f"not `{self.sliding_window}`"
73
+ )
74
+
75
+ def to_causal_4d(
76
+ self,
77
+ batch_size: int,
78
+ query_length: int,
79
+ key_value_length: int,
80
+ dtype: torch.dtype,
81
+ device: Union[torch.device, "str"] = "cpu",
82
+ ) -> Optional[torch.Tensor]:
83
+ """
84
+ Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
85
+ bias to upper right hand triangular matrix (causal mask).
86
+ """
87
+ if not self.is_causal:
88
+ raise ValueError(
89
+ f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True."
90
+ )
91
+
92
+ # If shape is not cached, create a new causal mask and cache it
93
+ input_shape = (batch_size, query_length)
94
+ past_key_values_length = key_value_length - query_length
95
+
96
+ # create causal mask
97
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
98
+ causal_4d_mask = None
99
+ if input_shape[-1] > 1 or self.sliding_window is not None:
100
+ causal_4d_mask = self._make_causal_mask(
101
+ input_shape,
102
+ dtype,
103
+ device=device,
104
+ past_key_values_length=past_key_values_length,
105
+ sliding_window=self.sliding_window,
106
+ )
107
+
108
+ return causal_4d_mask
109
+
110
+ def to_4d(
111
+ self,
112
+ attention_mask_2d: torch.Tensor,
113
+ query_length: int,
114
+ dtype: torch.dtype,
115
+ key_value_length: Optional[int] = None,
116
+ ) -> torch.Tensor:
117
+ """
118
+ Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
119
+ key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
120
+ causal, a causal mask will be added.
121
+ """
122
+ input_shape = (attention_mask_2d.shape[0], query_length)
123
+
124
+ # create causal mask
125
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
126
+ causal_4d_mask = None
127
+ if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
128
+ if key_value_length is None:
129
+ raise ValueError(
130
+ "This attention mask converter is causal. Make sure to pass "
131
+ "`key_value_length` to correctly create a causal mask."
132
+ )
133
+
134
+ past_key_values_length = key_value_length - query_length
135
+ causal_4d_mask = self._make_causal_mask(
136
+ input_shape,
137
+ dtype,
138
+ device=attention_mask_2d.device,
139
+ past_key_values_length=past_key_values_length,
140
+ sliding_window=self.sliding_window,
141
+ )
142
+ elif self.sliding_window is not None:
143
+ raise NotImplementedError(
144
+ "Sliding window is currently only implemented for causal masking"
145
+ )
146
+
147
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
148
+ expanded_attn_mask = self._expand_mask(
149
+ attention_mask_2d, dtype, tgt_len=input_shape[-1]
150
+ ).to(attention_mask_2d.device)
151
+
152
+ if causal_4d_mask is not None:
153
+ expanded_attn_mask = causal_4d_mask.masked_fill(
154
+ expanded_attn_mask.bool(), torch.finfo(dtype).min
155
+ )
156
+
157
+ # expanded_attn_mask + causal_4d_mask can cause some overflow
158
+ expanded_4d_mask = expanded_attn_mask
159
+
160
+ return expanded_4d_mask
161
+
162
+ @staticmethod
163
+ def _make_causal_mask(
164
+ input_ids_shape: torch.Size,
165
+ dtype: torch.dtype,
166
+ device: torch.device,
167
+ past_key_values_length: int = 0,
168
+ sliding_window: Optional[int] = None,
169
+ ):
170
+ """
171
+ Make causal mask used for bi-directional self-attention.
172
+ """
173
+ bsz, tgt_len = input_ids_shape
174
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
175
+ mask_cond = torch.arange(mask.size(-1), device=device)
176
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
177
+
178
+ mask = mask.to(dtype)
179
+
180
+ if past_key_values_length > 0:
181
+ mask = torch.cat(
182
+ [
183
+ torch.zeros(
184
+ tgt_len, past_key_values_length, dtype=dtype, device=device
185
+ ),
186
+ mask,
187
+ ],
188
+ dim=-1,
189
+ )
190
+
191
+ # add lower triangular sliding window mask if necessary
192
+ if sliding_window is not None:
193
+ diagonal = past_key_values_length - sliding_window - 1
194
+
195
+ context_mask = torch.tril(
196
+ torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal
197
+ )
198
+ mask.masked_fill_(context_mask, torch.finfo(dtype).min)
199
+
200
+ return mask[None, None, :, :].expand(
201
+ bsz, 1, tgt_len, tgt_len + past_key_values_length
202
+ )
203
+
204
+ @staticmethod
205
+ def _expand_mask(
206
+ mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
207
+ ):
208
+ """
209
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
210
+ """
211
+ bsz, src_len = mask.size()
212
+ tgt_len = tgt_len if tgt_len is not None else src_len
213
+
214
+ expanded_mask = (
215
+ mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
216
+ )
217
+
218
+ inverted_mask = 1.0 - expanded_mask
219
+
220
+ return inverted_mask.masked_fill(
221
+ inverted_mask.to(torch.bool), torch.finfo(dtype).min
222
+ )
223
+
224
+ @staticmethod
225
+ def _unmask_unattended(
226
+ expanded_mask: torch.FloatTensor,
227
+ min_dtype: float,
228
+ ):
229
+ # fmt: off
230
+ """
231
+ Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
232
+ using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
233
+ Details: https://github.com/pytorch/pytorch/issues/110213
234
+
235
+ `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
236
+ `attention_mask` is [bsz, src_seq_len].
237
+
238
+ The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case
239
+ of alibi attention bias.
240
+
241
+ For example, if `expanded_mask` is (e.g. here left-padding case)
242
+ ```
243
+ [[[[0, 0, 0],
244
+ [0, 0, 0],
245
+ [0, 0, 1]]],
246
+ [[[1, 0, 0],
247
+ [1, 1, 0],
248
+ [1, 1, 1]]],
249
+ [[[0, 0, 0],
250
+ [0, 1, 0],
251
+ [0, 1, 1]]]]
252
+ ```
253
+ then the modified `expanded_mask` will be
254
+ ```
255
+ [[[[1, 1, 1], <-- modified
256
+ [1, 1, 1], <-- modified
257
+ [0, 0, 1]]],
258
+ [[[1, 0, 0],
259
+ [1, 1, 0],
260
+ [1, 1, 1]]],
261
+ [[[1, 1, 1], <-- modified
262
+ [0, 1, 0],
263
+ [0, 1, 1]]]]
264
+ ```
265
+ """
266
+ # fmt: on
267
+ if expanded_mask.dtype == torch.bool:
268
+ raise ValueError(
269
+ "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor."
270
+ )
271
+
272
+ return expanded_mask.mul(
273
+ ~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True)
274
+ )
275
+
276
+ @staticmethod
277
+ def _ignore_causal_mask_sdpa(
278
+ attention_mask: Optional[torch.Tensor],
279
+ inputs_embeds: torch.Tensor,
280
+ past_key_values_length: int,
281
+ sliding_window: Optional[int] = None,
282
+ is_training: bool = False,
283
+ ) -> bool:
284
+ """
285
+ Detects whether the optional user-specified attention_mask & the automatically created causal mask can be
286
+ ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.
287
+
288
+ In case no token is masked in the `attention_mask` argument, if `query_length == 1` or
289
+ `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks,
290
+ allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
291
+ passed).
292
+ """
293
+
294
+ _, query_length = inputs_embeds.shape[0], inputs_embeds.shape[1]
295
+ key_value_length = query_length + past_key_values_length
296
+
297
+ is_tracing = (
298
+ torch.jit.is_tracing()
299
+ or isinstance(inputs_embeds, torch.fx.Proxy)
300
+ or is_torchdynamo_compiling()
301
+ )
302
+
303
+ ignore_causal_mask = False
304
+
305
+ if attention_mask is None:
306
+ # TODO: When tracing with TorchDynamo with fullgraph=True, the model is recompiled depending on the input
307
+ # shape, thus SDPA's `is_causal` argument is rightfully updated
308
+ # (see https://gist.github.com/fxmarty/1313f39037fc1c112508989628c57363). However, when using
309
+ # `torch.export` or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is
310
+ # hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True`
311
+ # which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108).
312
+ # Thus, we only set `ignore_causal_mask = True` if the model is set to training.
313
+ #
314
+ # Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal`
315
+ # ("TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor").
316
+ if (
317
+ (is_training or not is_tracing)
318
+ and (query_length == 1 or key_value_length == query_length)
319
+ and (sliding_window is None or key_value_length < sliding_window)
320
+ ):
321
+ ignore_causal_mask = True
322
+ elif sliding_window is None or key_value_length < sliding_window:
323
+ if len(attention_mask.shape) == 4:
324
+ return False
325
+ elif not is_tracing and torch.all(attention_mask == 1):
326
+ if query_length == 1 or key_value_length == query_length:
327
+ # For query_length == 1, causal attention and bi-directional attention are the same.
328
+ ignore_causal_mask = True
329
+
330
+ # Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore
331
+ # the attention mask, as SDPA causal mask generation may be wrong. We will set `is_causal=False` in
332
+ # SDPA and rely on Transformers attention_mask instead, hence not setting it to None here.
333
+ # Reference: https://github.com/pytorch/pytorch/issues/108108
334
+ # TODO: maybe revisit this with https://github.com/pytorch/pytorch/pull/114823 in PyTorch 2.3.
335
+
336
+ return ignore_causal_mask
337
+
338
+
339
def _prepare_4d_causal_attention_mask(
    attention_mask: Optional[torch.Tensor],
    input_shape: Union[torch.Size, Tuple, List],
    inputs_embeds: torch.Tensor,
    past_key_values_length: int,
    sliding_window: Optional[int] = None,
):
    """
    Build the `(batch_size, 1, query_length, key_value_length)` additive causal
    mask consumed by the attention layers.

    Behavior by input:
    - 2D `attention_mask` of shape `(batch_size, key_value_length)`: expanded
      to 4D and combined with a causal (optionally sliding-window) mask.
    - 4D `attention_mask`: validated against the expected shape, then inverted
      and filled with the dtype minimum (0 where attended).
    - `None` (or any other rank): a pure causal 4D mask is created from
      `input_shape`, `inputs_embeds` (dtype/device) and the cache length.

    Args:
        attention_mask: optional 2D or 4D mask tensor.
        input_shape: `(batch_size, query_length)`.
        inputs_embeds: embedded inputs; supplies dtype and device.
        past_key_values_length: length of the key/value cache.
        sliding_window: optional window size for windowed attention.

    Raises:
        ValueError: when a 4D mask does not have the expected shape.
    """
    converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)
    key_value_length = input_shape[-1] + past_key_values_length

    mask_rank = attention_mask.dim() if attention_mask is not None else None

    if mask_rank == 2:
        # Expand the padding mask to 4D and merge with the causal mask.
        return converter.to_4d(
            attention_mask,
            input_shape[-1],
            key_value_length=key_value_length,
            dtype=inputs_embeds.dtype,
        )

    if mask_rank == 4:
        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
        if tuple(attention_mask.shape) != expected_shape:
            raise ValueError(
                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
            )
        # Shape is valid: invert (1 -> attend) and fill blocked positions
        # with the most negative representable value.
        flipped = 1.0 - attention_mask
        return flipped.masked_fill(
            flipped.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
        )

    # No mask supplied (or unsupported rank): fall back to a plain causal mask.
    return converter.to_causal_4d(
        input_shape[0],
        input_shape[-1],
        key_value_length,
        dtype=inputs_embeds.dtype,
        device=inputs_embeds.device,
    )
torchtitan/experiments/deepseek_v3/inference.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
#!/usr/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Launch distributed inference for the DeepSeek-V3 model via generate.py.
# Usage: [NGPU=<n>] ./inference.sh ["your prompt here"]

# Number of GPUs (ranks) on this node; override with the NGPU env var.
NGPU=${NGPU:-"4"}

# Get the prompt from command line argument or use a default
prompt="${1:-What is 2+2?}"

# Run the model with the prompt; --standalone runs a single-node rendezvous.
torchrun --standalone --nproc-per-node ${NGPU} generate.py "$prompt"
torchtitan/experiments/deepseek_v3/model.py ADDED
@@ -0,0 +1,1325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This code is based on model definition of `deepseek-ai/DeepSeek-V3-Base` on
8
+ # Hugging Face Model Hub. Url:
9
+ # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/modeling_deepseek.py
10
+ # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/resolve/main/configuration_deepseek.py
11
+ #
12
+ # It has been modified from its original forms to accommodate naming convention
13
+ # and usage patterns of the TorchTitan project.
14
+
15
+ # Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
16
+ #
17
+ # Licensed under the Apache License, Version 2.0 (the "License");
18
+ # you may not use this file except in compliance with the License.
19
+ # You may obtain a copy of the License at
20
+ #
21
+ # http://www.apache.org/licenses/LICENSE-2.0
22
+ #
23
+ # Unless required by applicable law or agreed to in writing, software
24
+ # distributed under the License is distributed on an "AS IS" BASIS,
25
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26
+ # See the License for the specific language governing permissions and
27
+ # limitations under the License.
28
+ """ PyTorch DeepSeek model."""
29
+ import math
30
+ from typing import Optional, Tuple
31
+
32
+ import torch
33
+ import torch.distributed as dist
34
+
35
+ import torch.distributed._symmetric_memory as symm_mem
36
+ import torch.nn.functional as F
37
+ import torch.utils.checkpoint
38
+
39
+ from attn_mask_utils import _prepare_4d_causal_attention_mask
40
+ from indices import generate_permute_indices
41
+ from model_config import ModelArgs
42
+ from symm_mem_recipes import OnDeviceAllToAllV
43
+ from torch import nn
44
+ from torch.distributed._functional_collectives import all_to_all_single_autograd
45
+
46
+ from torchtitan.experiments.kernels.triton_mg_group_gemm.torchao_pr import (
47
+ ALIGN_SIZE_M,
48
+ grouped_gemm_forward,
49
+ )
50
+
51
+ # Get model parallel subgroup by name:
52
+ # e.g. "pp", "ep", None
53
+ def get_group(dim_name: Optional[str] = None) -> dist.ProcessGroup:
54
+ glob = torch.distributed.device_mesh._mesh_resources.get_current_mesh()
55
+ return glob.get_group(dim_name)
56
+
57
+
58
class RMSNorm(nn.Module):
    """Root-mean-square layer norm (no mean subtraction), computed in fp32.

    Normalizes the last dimension by its RMS and applies a learned
    per-channel scale, returning the result in the input dtype.
    """

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        orig_dtype = hidden_states.dtype
        # Do the reduction in fp32 for numerical stability.
        x = hidden_states.to(torch.float32)
        inv_rms = torch.rsqrt(
            x.pow(2).mean(dim=-1, keepdim=True) + self.variance_epsilon
        )
        normalized = (x * inv_rms).to(orig_dtype)
        return self.weight * normalized
70
+
71
+
72
class RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) with a lazily-grown cos/sin cache.

    Precomputes `cos_cached`/`sin_cached` of shape `[seq_len, dim]` for
    positions up to `max_position_embeddings`, regrowing the cache on demand
    when a longer sequence is seen.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        # Per-channel inverse frequencies: base^(-2i/dim) for i in [0, dim/2).
        inv_freq = 1.0 / (
            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings,
            device=self.inv_freq.device,
            dtype=torch.get_default_dtype(),
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables for positions [0, seq_len)."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq.to(t.device))
        # Different from paper, but it uses a different permutation in order
        # to obtain the same calculation.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            # Bug fix: previously a `None` seq_len raised a TypeError on the
            # `>` comparison below; default to the input's sequence dim.
            seq_len = x.shape[-2]
        if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
112
+
113
+
114
class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        # Must be set before super().__init__(), which builds the cache.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild cos/sin tables with positions compressed by `scaling_factor`."""
        self.max_seq_len_cached = seq_len
        positions = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )
        # Linear position interpolation: divide indices to stretch the context.
        positions = positions / self.scaling_factor

        angles = torch.outer(positions, self.inv_freq)
        # Duplicate along the channel dim so cos/sin span the full head size.
        table = torch.cat((angles, angles), dim=-1)
        self.register_buffer("cos_cached", table.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", table.sin().to(dtype), persistent=False)
140
+
141
+
142
# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
    ):
        # Must be set before super().__init__(), which calls
        # _set_cos_sin_cache() and reads scaling_factor.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Rebuild cos/sin tables; enlarge the rotary `base` (NTK-aware)
        when seq_len exceeds the trained context length."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            # NTK scaling: grow the base so low-frequency channels cover the
            # extended context instead of compressing position indices.
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings)
                - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (
                base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
            )
            # NOTE(review): `inv_freq` stays rescaled even if a later call
            # passes a shorter seq_len — confirm this matches the upstream
            # HF implementation's intent.
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(
            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
        )

        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
179
+
180
+
181
# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    """Return the (fractional) rotary dim index whose wavelength completes
    `num_rotations` full turns over `max_position_embeddings` positions."""
    numerator = dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
    denominator = 2 * math.log(base)
    return numerator / denominator
188
+
189
+
190
# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    """Return the integer [low, high] rotary-dim indices bracketing the
    given rotation counts, clamped to [0, dim - 1]."""
    raw_low = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    raw_high = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    # Clamp values just in case
    return max(math.floor(raw_low), 0), min(math.ceil(raw_high), dim - 1)
201
+
202
+
203
def yarn_get_mscale(scale=1, mscale=1):
    """Attention-magnitude correction factor for YaRN; identity when the
    context is not extended (scale <= 1)."""
    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
207
+
208
+
209
def yarn_linear_ramp_mask(min, max, dim):
    """Length-`dim` ramp rising linearly from 0 at index `min` to 1 at
    index `max`, clamped to [0, 1].

    NOTE(review): parameter names shadow the builtins; kept for
    keyword-call compatibility.
    """
    if min == max:
        max += 0.001  # Prevent singularity

    positions = torch.arange(dim, dtype=torch.float32)
    return torch.clamp((positions - min) / (max - min), 0, 1)
216
+
217
+
218
class YarnRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding with YaRN scaling: per-dimension blend of original
    (extrapolated) and interpolated frequencies, plus a magnitude rescale
    of the cos/sin tables."""

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        device=None,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        # All YaRN hyper-parameters must be set before super().__init__(),
        # which immediately builds the cos/sin cache.
        self.scaling_factor = scaling_factor
        self.original_max_position_embeddings = original_max_position_embeddings
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        self.mscale = mscale
        self.mscale_all_dim = mscale_all_dim
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """Build YaRN-interpolated cos/sin tables for positions [0, seq_len)."""
        self.max_seq_len_cached = seq_len
        dim = self.dim

        # Unscaled (extrapolated) inverse frequencies ...
        freq_extra = 1.0 / (
            self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )
        # ... and position-interpolated ones (divided by scaling_factor).
        freq_inter = 1.0 / (
            self.scaling_factor
            * self.base
            ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
        )

        # Dim-index window over which to blend the two frequency sets.
        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            dim,
            self.base,
            self.original_max_position_embeddings,
        )
        inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
            device=device, dtype=torch.float32
        )
        # Blend: mask==1 keeps freq_extra (dims below `low`), mask==0 uses
        # freq_inter (dims above `high`), linear in between.
        inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(seq_len, device=device, dtype=torch.float32)

        freqs = torch.outer(t, inv_freq)

        # Magnitude correction factor (ratio of the two mscale settings).
        _mscale = float(
            yarn_get_mscale(self.scaling_factor, self.mscale)
            / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
        )

        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer(
            "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
        )
        self.register_buffer(
            "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
        )
283
+
284
+
285
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input: (a, b) -> (-b, a)."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
291
+
292
+
293
# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)

    # De-interleave channel pairs: [x0, x1, x2, x3, ...] ->
    # [x0, x2, ..., x1, x3, ...], converting the interleaved layout into the
    # half-split layout that rotate_half expects.
    b, h, s, d = q.shape
    q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    b, h, s, d = k.shape
    k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)

    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
327
+
328
+
329
class MLP(nn.Module):
    """SwiGLU-style feed-forward block: down_proj(SiLU(gate_proj(x)) * up_proj(x)).

    Sizes default to `config.hidden_size` / `config.intermediate_size` but can
    be overridden (e.g. for per-expert intermediate sizes in MoE layers).
    """

    # Shared activation instance; also referenced externally as `MLP.act_fn`.
    act_fn = nn.SiLU()

    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        self.hidden_size = hidden_size if hidden_size is not None else config.hidden_size
        self.intermediate_size = (
            intermediate_size
            if intermediate_size is not None
            else config.intermediate_size
        )

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)

    def forward(self, x):
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
347
+
348
+
349
class MoEGate(nn.Module):
    """Token-to-expert router.

    Scores every token against `n_routed_experts` (sigmoid or softmax),
    optionally restricts the choice to the best `topk_group` expert groups
    ("noaux_tc"), and returns the top-k expert indices plus their (optionally
    normalized, always scaled) weights.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.scoring_func = config.scoring_func
        # NOTE(review): `seq_aux` is stored but not used in this forward —
        # presumably consumed elsewhere or vestigial; confirm.
        self.seq_aux = config.seq_aux
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group

        # topk selection algorithm
        self.norm_topk_prob = config.norm_topk_prob
        self.gating_dim = config.hidden_size
        # Router projection: [n_routed_experts, hidden_size].
        self.weight = nn.Parameter(
            torch.empty((self.n_routed_experts, self.gating_dim))
        )
        if self.topk_method == "noaux_tc":
            self.e_score_correction_bias = nn.Parameter(
                # Changed from torch.empty to torch.rand to avoid non-even
                # distribution for runs without actual weights
                torch.rand((self.n_routed_experts))
            )
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Kaiming-initialize the router weight."""
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states):
        """Route a [bsz, seq_len, hidden] batch.

        Returns:
            topk_idx: [bsz * seq_len, top_k] selected expert indices.
            topk_weight: [bsz * seq_len, top_k] routing weights (scaled by
                `routed_scaling_factor`).
        """
        bsz, seq_len, h = hidden_states.shape
        # compute gating score (router math always in fp32)
        hidden_states = hidden_states.view(-1, h)
        logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        if self.scoring_func == "sigmoid":
            scores = logits.sigmoid()
        elif self.scoring_func == "softmax":
            scores = logits.softmax(dim=-1, dtype=torch.float32)
        else:
            raise NotImplementedError(
                f"insupportable scoring function for MoE gating: {self.scoring_func}"
            )

        # select top-k experts
        if self.topk_method == "noaux_tc":
            # Bias-corrected scores used only for *selection*; the returned
            # weights below are gathered from the uncorrected `scores`.
            scores_for_choice = scores.view(
                bsz * seq_len, -1
            ) + self.e_score_correction_bias.unsqueeze(0)
            # Group score = sum of the 2 best experts within each group.
            group_scores = (
                scores_for_choice.view(bsz * seq_len, self.n_group, -1)
                .topk(2, dim=-1)[0]
                .sum(dim=-1)
            )  # [n, n_group]
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[
                1
            ]  # [n, top_k_group]
            group_mask = torch.zeros_like(group_scores)  # [n, n_group]
            group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
            # Expand the group mask to a per-expert mask.
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
                )
                .reshape(bsz * seq_len, -1)
            )  # [n, e]
            # Zero out experts in non-selected groups, then pick top-k.
            tmp_scores = scores_for_choice.masked_fill(
                ~score_mask.bool(), 0.0
            )  # [n, e]
            _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
            topk_weight = scores.gather(1, topk_idx)
        elif self.topk_method == "greedy":
            topk_weight, topk_idx = torch.topk(
                scores, k=self.top_k, dim=-1, sorted=False
            )
        else:
            raise NotImplementedError(
                f"insupportable TopK function for MoE gating: {self.topk_method}"
            )

        # norm gate to sum 1
        if self.top_k > 1 and self.norm_topk_prob:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        topk_weight = (
            topk_weight * self.routed_scaling_factor
        )  # must multiply the scaling factor

        return topk_idx, topk_weight
444
+
445
+
446
+ class MoE(nn.Module):
447
+ """
448
+ A mixed expert module containing shared experts.
449
+ """
450
+
451
+ # Class attributes:
452
+ # Two shuffle method supported:
453
+ # 1. "torch_all_to_all"
454
+ # 2. "symm_mem" (see `setup_symm_mem` below)
455
+ shuffle_method = "torch_all_to_all"
456
+
457
+ # Symmetric memory buffers shared by all MoE instances across layers
458
+ token_send_buf: Optional[torch.Tensor] = None
459
+ token_gather_buf: Optional[torch.Tensor] = None
460
+
461
    def __init__(self, config):
        """Build this rank's shard of routed experts plus the gate and
        (optionally) shared experts.

        Requires an "ep" (expert-parallel) process group of size
        `config.ep_size`; each rank owns `n_routed_experts // ep_size`
        experts.
        """
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok

        # ep_size is the number of ranks in expert dimension
        if config.ep_size <= 1:
            raise ValueError(
                "For code simplicity, this model only supports distributed experts, "
                "thus EP size must be > 1, please modify your model config"
            )
        self.ep_group = get_group("ep")
        assert config.ep_size == self.ep_group.size()
        self.ep_size = config.ep_size
        self.ep_rank = self.ep_group.rank()
        self.experts_per_rank = config.n_routed_experts // config.ep_size
        # Use ModuleDict instead of ModuleList to preserve absolute expert
        # IDs while avoiding `None` experts. The absolute expert IDs match
        # with checkpoint FQNs.
        self.experts = nn.ModuleDict()
        for i in range(self.experts_per_rank):
            abs_expert_id = self.ep_rank * self.experts_per_rank + i
            self.experts[str(abs_expert_id)] = MLP(
                config, intermediate_size=config.moe_intermediate_size
            )
        self.gate = MoEGate(config)
        if config.n_shared_experts is not None:
            # Shared experts are replicated (not routed); their combined
            # width is n_shared_experts * moe_intermediate_size.
            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
            self.shared_experts = MLP(
                config=config, intermediate_size=intermediate_size
            )
492
+
493
    def combine_experts(self, submod_name):
        """Concatenate the named linear's weight across all local experts
        into a single `{submod_name}_weight` parameter (for grouped GEMM),
        clearing the per-expert weights in the process."""
        all_weights = []
        for expert in self.experts.values():
            lin = expert.get_submodule(submod_name)
            all_weights.append(lin.weight)
            # Detach the weight from the per-expert module so it is only
            # owned by the combined parameter below.
            lin.weight = None

        concat_weight = torch.cat(all_weights)
        self.register_parameter(f"{submod_name}_weight", nn.Parameter(concat_weight))
502
+
503
    # This function is used to create a symm mem buffer for MoE's. It is for
    # shuffling tokens fully "on-device", as compared to traditional torch
    # all_to_all APIs which require a GPU-to-CPU sync of the splits. If a user
    # calls this function, the `shuffle_method` would switch from
    # `torch_all_to_all` to `symm_mem`.
    def setup_symm_mem(self, dtype: torch.dtype, device: torch.device):
        """Switch to symmetric-memory token shuffling and prepare the
        grouped-GEMM weights and class-level send/gather buffers."""
        # Switch shuffle method
        self.shuffle_method = "symm_mem"

        # Combine expert weights so the grouped GEMM path can run one GEMM
        # per projection instead of one per expert.
        print("Combining expert weights for Group GEMM")
        self.combine_experts("gate_proj")
        self.combine_experts("up_proj")
        self.combine_experts("down_proj")

        # Assuming worst case, 2x tokens are routed to one EP rank
        overflow = 2
        OnDeviceAllToAllV.max_output_len = (
            self.config.max_seq_len * self.num_experts_per_tok * overflow
        )

        # Symmetric memory buffers are shared by all MoE instances across
        # layers, we only need to initialize them once
        if MoE.token_send_buf is not None:
            return

        # Input buffer for DP-to-EP shuffle
        MoE.token_send_buf = symm_mem.empty(
            self.config.max_seq_len
            * self.num_experts_per_tok,  # seq len * top k (flattened)
            self.config.hidden_size,  # hidden dim
            dtype=dtype,
            device=device,
        )
        # Input buffer for EP-to-DP shuffle (sized with the overflow factor)
        MoE.token_gather_buf = symm_mem.empty(
            self.config.max_seq_len
            * self.num_experts_per_tok  # seq len * top k (flattened)
            * overflow,
            self.config.hidden_size,  # hidden dim
            dtype=dtype,
            device=device,
        )
        print(f"EP rank [{self.ep_rank}]: Created Symmetric Memory for MoE")
547
+
548
    def get_send_buf(self):
        """Return the shared DP-to-EP send buffer, detached from autograd."""
        # [Why detach?] During a first forward-backward step, the buffer would
        # be included in a computational graph. In a second step, autograd will
        # return an error saying "Trying to backward through the graph a second
        # time (or directly access saved tensors more than once)". This is
        # because the buffer is still in the graph, and autograd is trying to
        # backward through the graph a second time. To avoid this, we detach the
        # buffer from the graph. `detach()` returns a new tensor, which shares
        # the same storage with the original one.
        self.token_send_buf.grad = None
        return self.token_send_buf.detach()
559
+
560
    def get_gather_buf(self):
        """Return the shared EP-to-DP gather buffer, detached from autograd."""
        # See [Why detach?] in `get_send_buf`
        self.token_gather_buf.grad = None
        return self.token_gather_buf.detach()
564
+
565
+ def forward(self, hidden_states):
566
+ identity = hidden_states
567
+ orig_shape = hidden_states.shape
568
+ # for each token, select top-k experts, and compute the weight for each expert
569
+ topk_idx, topk_weight = self.gate(hidden_states)
570
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
571
+ if self.shuffle_method == "symm_mem":
572
+ y = self.moe_on_device(hidden_states, topk_idx, topk_weight)
573
+ else: # "torch_all_to_all"
574
+ y = self.moe_forward(hidden_states, topk_idx, topk_weight)
575
+
576
+ y = y.view(*orig_shape)
577
+ if self.config.n_shared_experts is not None:
578
+ y = y + self.shared_experts(identity)
579
+ return y
580
+
581
    def moe_forward(self, x, topk_ids, topk_weight):
        """Shuffle tokens DP->EP, run local experts, shuffle back EP->DP,
        and combine expert outputs with the routing weights.

        Args:
            x: flattened tokens, [num_tokens, hidden].
            topk_ids: [num_tokens, top_k] expert indices from the gate.
            topk_weight: [num_tokens, top_k] routing weights.
        """
        # This part sorts the token indices so that tokens routed to the same expert reside consecutively.
        # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive.
        # Since this is an "artificial" index creation (final outcome being
        # `idxs`), we don't need gradients here.
        with torch.no_grad():
            # [seq_len, n_routed_experts]
            cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts))
            # Fill 1 to the selected experts
            cnts.scatter_(1, topk_ids, 1)
            tokens_per_expert = cnts.sum(dim=0)
            # Token indices for each expert
            idxs = topk_ids.view(-1).argsort()
            sorted_tokens_shape = idxs.shape + x.shape[1:]

        # Each token appears top_k times in `idxs`; dividing recovers the
        # original token index.
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        assert sorted_tokens.shape == sorted_tokens_shape

        # This part exchanges the information about the number of tokens sent
        # and received by each expert. We can understand this information as
        # "side band", which is not part of the actual data. Thus no gradient
        # is needed.
        with torch.no_grad():
            # Sum the tokens over local experts, then we get tokens per EP rank,
            # which is the input splits
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(
                tokens_per_expert_group, tokens_per_expert, group=self.ep_group
            )
            input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)

        # DP to EP token shuffle. This part needs gradient.
        if self.shuffle_method == "symm_mem":
            # Move input to the `token_send_buf` symm mem
            token_send_buf = self.get_send_buf()
            token_send_buf[: idxs.shape[0]].copy_(sorted_tokens)
            # Note: `out=` avoids copy, but it is not differentiable
            # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]])
            token_gather_buf, output_splits = OnDeviceAllToAllV.apply(
                token_send_buf,
                input_splits,
                self.ep_group,
            )
            with torch.no_grad():
                # Received tokens from all other ranks. TODO: use mask instead
                received = output_splits.sum()
            # TODO: don't use `received`
            gathered_tokens = token_gather_buf[:received]
        else:  # "torch_all_to_all"
            # Prepare input and output splits
            with torch.no_grad():
                output_splits = tokens_per_expert_group.view(self.ep_size, -1).sum(
                    dim=1
                )
            gathered_tokens = all_to_all_single_autograd(
                sorted_tokens,
                output_splits.tolist(),
                input_splits.tolist(),
                self.ep_group,
            )

        # This part prepares a 1D tensor with the same length as
        # `gathered_tokens`. The 1D tensor is filled with local expert IDs which
        # the tokens in `gathered_tokens` are headed for. This part doesn't need
        # gradient.
        with torch.no_grad():
            gatherd_idxs = (
                torch.arange(
                    tokens_per_expert_group.numel(),
                    device=tokens_per_expert_group.device,
                )
                % self.experts_per_rank
            )
            gatherd_idxs = gatherd_idxs.repeat_interleave(tokens_per_expert_group)

        # Prepare buffer for tokens processed by experts
        if self.shuffle_method == "symm_mem":
            # Take necessary space from `token_gather_buf` symm mem because we are
            # going to send them out after expert processing
            processed_tokens = self.get_gather_buf()[: gathered_tokens.shape[0]]
        else:  # "torch_all_to_all"
            processed_tokens = torch.empty_like(gathered_tokens)

        # This part processes the tokens routed to the local experts.
        # TODO: can we use group GEMM here?
        for i, expert in enumerate(self.experts.values()):
            processed_tokens[gatherd_idxs == i] = expert(
                gathered_tokens[gatherd_idxs == i]
            )

        # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle.
        # The input/output splits are just a reverse of the previous shuffle.
        if self.shuffle_method == "symm_mem":
            token_return_buf, _ = OnDeviceAllToAllV.apply(
                processed_tokens,
                output_splits,
                self.ep_group,
            )
            returned_tokens = token_return_buf[: sorted_tokens_shape[0]]
        else:  # "torch_all_to_all"
            returned_tokens = all_to_all_single_autograd(
                processed_tokens,
                input_splits.tolist(),
                output_splits.tolist(),
                self.ep_group,
            )

        # Undo the expert sort, then weight and sum the top-k expert outputs
        # per token.
        output_tokens = torch.empty_like(returned_tokens)
        output_tokens[idxs] = returned_tokens
        final_out = (
            output_tokens.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(returned_tokens.dtype)
        )
        return final_out
700
+
701
    def moe_on_device(self, x, topk_ids, topk_weight):
        """Single-call MoE forward using on-device (symmetric-memory) all-to-all.

        Args:
            x: [num_tokens, dim] input activations.
            topk_ids: [num_tokens, top_k] expert indices chosen per token.
            topk_weight: [num_tokens, top_k] routing weights per chosen expert.

        Returns:
            [num_tokens, dim] tensor: expert outputs combined by `topk_weight`.

        Flow: sort tokens by expert -> A2A dispatch to expert-owning ranks ->
        permute into per-expert contiguous groups -> 3 grouped GEMMs (SwiGLU)
        -> A2A return shuffle -> unsort and weighted-combine.
        """
        # This part sorts the token indices so that tokens routed to the same expert reside consecutively.
        # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive.
        # Since this is an "artificial" index creation (final outcome being
        # `idxs`), we don't need gradients here.
        with torch.no_grad():
            # [seq_len, n_routed_experts]
            cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts))
            # Fill 1 to the selected experts
            cnts.scatter_(1, topk_ids, 1)
            tokens_per_expert = cnts.sum(dim=0)
            # Token indices for each expert
            idxs = topk_ids.view(-1).argsort()
            sorted_tokens_shape = idxs.shape + x.shape[1:]

        # `idxs // top_k` maps each (token, choice) slot back to its token row,
        # duplicating each token once per selected expert.
        sorted_tokens = x[idxs // topk_ids.shape[1]]
        assert sorted_tokens.shape == sorted_tokens_shape

        # This part exchanges the information about the number of tokens sent and
        # received by each expert. We can understand this information as "side
        # band", which is not part of the actual data. Thus no gradient is
        # needed.
        with torch.no_grad():
            # Sum the tokens over local experts, then we get tokens per EP rank,
            # which is the input splits
            tokens_per_expert_group = tokens_per_expert.new_empty(
                tokens_per_expert.shape[0]
            )
            dist.all_to_all_single(
                tokens_per_expert_group, tokens_per_expert, group=self.ep_group
            )
            input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)

        # Move input to the `token_send_buf` symm mem
        token_send_buf = self.get_send_buf()
        token_send_buf[: idxs.shape[0]].copy_(sorted_tokens)
        # Note: `out=` avoids copy, but it is not differentiable
        # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]])
        token_gather_buf, output_splits = OnDeviceAllToAllV.apply(
            token_send_buf,
            input_splits,
            self.ep_group,
        )

        # We need to permute the received tokens so that tokens for the same expert are contiguous.
        # This part prepares a 1D tensor `permuted_indices` for such permutation.
        # This part doesn't need gradient.
        with torch.no_grad():
            permuted_indices, m_sizes = generate_permute_indices(
                tokens_per_expert_group,
                self.experts_per_rank,
                self.ep_size,
                token_gather_buf.shape[0],
                ALIGN_SIZE_M,
            )

        # Permute the received tokens so that tokens for the same expert are contiguous.
        contig_tokens = token_gather_buf[permuted_indices]

        # Run the first grouped GEMM
        w1 = self.get_parameter("gate_proj_weight")
        gate_proj = grouped_gemm_forward(contig_tokens, w1, m_sizes)

        # Run the second grouped GEMM
        w3 = self.get_parameter("up_proj_weight")
        up_proj = grouped_gemm_forward(contig_tokens, w3, m_sizes)

        # Apply activation (SwiGLU-style gating: act(gate) * up)
        hidden_outputs = MLP.act_fn(gate_proj) * up_proj

        # Run the third grouped GEMM
        w2 = self.get_parameter("down_proj_weight")
        hidden_outputs = grouped_gemm_forward(hidden_outputs, w2, m_sizes)

        # Prepare buffer for tokens processed by experts
        # Take necessary space from `token_gather_buf` symm mem because we are
        # going to send them out after expert processing
        processed_tokens = self.get_gather_buf()

        # Move into Symmetric Memory for the return shuffle
        processed_tokens[permuted_indices] = hidden_outputs

        # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle.
        # The input/output splits are just a reverse of the previous shuffle.
        token_return_buf, _ = OnDeviceAllToAllV.apply(
            processed_tokens,
            output_splits,
            self.ep_group,
        )
        returned_tokens = token_return_buf[: sorted_tokens_shape[0]]

        # Unsort back to (token, choice) order, then combine the top-k expert
        # outputs per token with their routing weights.
        output_tokens = torch.empty_like(returned_tokens)
        output_tokens[idxs] = returned_tokens
        final_out = (
            output_tokens.view(*topk_ids.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(returned_tokens.dtype)
        )
        return final_out
802
+
803
+
804
class Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    DeepSeek-style multi-latent attention: queries and keys/values go through
    low-rank down/up projections, and each head dim is split into a
    non-positional part (`qk_nope_head_dim`) and a RoPE-carrying part
    (`qk_rope_head_dim`). The attention itself is computed with
    `torch.nn.functional.scaled_dot_product_attention`.
    """

    def __init__(self, config: ModelArgs, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        # Full per-head query/key dim = non-positional part + RoPE part.
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim

        self.is_causal = True

        # Query projection: a single dense projection, or a low-rank
        # down-projection + RMSNorm + up-projection when q_lora_rank is set.
        if self.q_lora_rank is None:
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            self.q_a_proj = nn.Linear(
                self.hidden_size, config.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = RMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(
                config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        # KV down-projection also emits the single shared (multi-query) RoPE
        # key slice of size qk_rope_head_dim.
        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            config.kv_lora_rank + config.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = RMSNorm(config.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            config.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )
        self._init_rope()

        self.softmax_scale = self.q_head_dim ** (-0.5)
        if self.config.rope_scaling is not None:
            # YaRN-style mscale correction applied to the softmax scale.
            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = self.config.rope_scaling["factor"]
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.softmax_scale = self.softmax_scale * mscale * mscale

    def _init_rope(self):
        """Instantiate the rotary embedding matching `config.rope_scaling`.

        Raises:
            ValueError: if `rope_scaling["type"]` is not one of
                "linear", "dynamic", or "yarn".
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = RotaryEmbedding(
                self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = LinearScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = DynamicNTKScalingRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "yarn":
                # Forward only the YaRN-specific keys actually present in the
                # config; the rest fall back to YarnRotaryEmbedding defaults.
                kwargs = {
                    key: self.config.rope_scaling[key]
                    for key in [
                        "original_max_position_embeddings",
                        "beta_fast",
                        "beta_slow",
                        "mscale",
                        "mscale_all_dim",
                    ]
                    if key in self.config.rope_scaling
                }
                self.rotary_emb = YarnRotaryEmbedding(
                    self.qk_rope_head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                    **kwargs,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute attention over `hidden_states` of shape (bsz, q_len, hidden).

        Args:
            hidden_states: input activations, (batch, seq_len, hidden_size).
            attention_mask: optional 2D padding mask (batch, seq_len); expanded
                to 4D internally. When None, SDPA applies a causal mask.
            position_ids: optional positions for RoPE.

        Returns:
            The projected attention output, (batch, seq_len, hidden_size).
        """
        bsz, q_len, _ = hidden_states.size()

        if self.q_lora_rank is None:
            q = self.q_proj(hidden_states)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
        q_nope, q_pe = torch.split(
            q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        compressed_kv, k_pe = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )
        # k_pe is shared across heads (multi-query): head dim of 1, broadcast
        # when written into key_states below.
        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
        kv = (
            self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
            .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
            .transpose(1, 2)
        )

        k_nope, value_states = torch.split(
            kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )
        kv_seq_len = value_states.shape[-2]

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)

        # Reassemble full query/key heads: [nope part | rope part].
        query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
        query_states[:, :, :, self.qk_nope_head_dim :] = q_pe

        key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
        key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
        key_states[:, :, :, self.qk_nope_head_dim :] = k_pe

        if attention_mask is not None:
            # Attention mask was made 4D because the `attn_weights` above is 4D.
            # We probably can make this mask smarter if we want to pack sequences
            # together, instead of using padding. This optimization can be used in
            # inference. For training, if we want to pack sequences, data loader
            # will pass in a mask containing such info.
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,  # None, or user provided mask in 2D
                (bsz, q_len),
                hidden_states,
                0,  # past_key_values_length, 0 when training
            )
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query=query_states,
            key=key_states,
            value=value_states,
            attn_mask=attention_mask,
            # Fix: only apply attention dropout while training. SDPA applies
            # `dropout_p` unconditionally, so passing the raw config value
            # would also drop attention weights at eval/inference time.
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=attention_mask is None,
            scale=self.softmax_scale,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
        attn_output = self.o_proj(attn_output)

        return attn_output
990
+
991
+
992
class DecoderLayer(nn.Module):
    """One pre-norm transformer decoder layer: attention + (MoE or dense) FFN.

    The feed-forward sub-module is an MoE block on layers at/after
    `first_k_dense_replace` that fall on the `moe_layer_freq` stride (when
    routed experts are configured); otherwise a plain dense MLP.
    """

    def __init__(self, config: ModelArgs, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Attention(config=config, layer_idx=layer_idx)

        use_moe = (
            config.n_routed_experts is not None
            and layer_idx >= config.first_k_dense_replace
            and layer_idx % config.moe_layer_freq == 0
        )
        self.mlp = MoE(config) if use_moe else MLP(config)

        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
        """
        # Attention sub-block with residual connection (pre-norm).
        attn_out = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block with residual connection (pre-norm).
        ffn_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + ffn_out

        return hidden_states
1045
+
1046
+
1047
+ Deepseek_INPUTS_DOCSTRING = r"""
1048
+ Args:
1049
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1050
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1051
+ it.
1052
+
1053
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1054
+ [`PreTrainedTokenizer.__call__`] for details.
1055
+
1056
+ [What are input IDs?](../glossary#input-ids)
1057
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1058
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1059
+
1060
+ - 1 for tokens that are **not masked**,
1061
+ - 0 for tokens that are **masked**.
1062
+
1063
+ [What are attention masks?](../glossary#attention-mask)
1064
+
1065
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1066
+ [`PreTrainedTokenizer.__call__`] for details.
1067
+
1068
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
1069
+ `past_key_values`).
1070
+
1071
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1072
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1073
+ information on the default strategy.
1074
+
1075
+ - 1 indicates the head is **not masked**,
1076
+ - 0 indicates the head is **masked**.
1077
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1078
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1079
+ config.n_positions - 1]`.
1080
+
1081
+ [What are position IDs?](../glossary#position-ids)
1082
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1083
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1084
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1085
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1086
+
1087
+ Two formats are allowed:
1088
+ - a [`~cache_utils.Cache`] instance;
1089
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1090
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1091
+ cache format.
1092
+
1093
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1094
+ legacy cache format will be returned.
1095
+
1096
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1097
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1098
+ of shape `(batch_size, sequence_length)`.
1099
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1100
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1101
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1102
+ model's internal embedding lookup matrix.
1103
+ use_cache (`bool`, *optional*):
1104
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1105
+ `past_key_values`).
1106
+ output_attentions (`bool`, *optional*):
1107
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1108
+ tensors for more detail.
1109
+ output_hidden_states (`bool`, *optional*):
1110
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1111
+ more detail.
1112
+ return_dict (`bool`, *optional*):
1113
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1114
+ """
1115
+
1116
+
1117
class DeepseekModel(torch.nn.Module):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DecoderLayer`]

    Built for pipeline parallelism: only the parts belonging to
    `config.stage_idx` (of `config.num_stages`) are instantiated. The
    embedding exists only on stage 0 and the final norm only on the last
    stage; other stages consume/produce raw hidden states.

    Args:
        config: ModelArgs
    """

    def __init__(self, config: ModelArgs):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        # Creating model parts related to my stage
        assert (
            config.stage_idx < config.num_stages
        ), f"Stage {config.stage_idx} is not in the model"
        print(f"Creating model stage {config.stage_idx} of {config.num_stages}")

        # Token embedding lives only on the first pipeline stage.
        self.embed_tokens = (
            nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
            if config.stage_idx == 0
            else None
        )

        # ModuleDict keyed by the *global* layer id so checkpoints keep stable
        # parameter names regardless of which stage owns the layer.
        self.layers = torch.nn.ModuleDict()
        division = config.num_hidden_layers // config.num_stages
        residual = config.num_hidden_layers % config.num_stages
        # Some earlier stages may have 1 more layer than latter stages because
        # the division may have residual; this is more even than giving the
        # entire residual to the last stage.
        layers_per_stage = [
            division + 1 if stage < residual else division
            for stage in range(config.num_stages)
        ]
        assert sum(layers_per_stage) == config.num_hidden_layers
        layer_id_start = sum(layers_per_stage[: config.stage_idx])
        layer_id_end = layer_id_start + layers_per_stage[config.stage_idx]
        for layer_id in range(layer_id_start, layer_id_end):
            self.layers[str(layer_id)] = DecoderLayer(config, layer_id)

        # Final RMSNorm lives only on the last pipeline stage.
        self.norm = (
            RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
            if config.stage_idx == config.num_stages - 1
            else None
        )

        # Initialize weights and apply final processing
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Normal(0, initializer_range) init for Linear/Embedding weights;
        zero biases and zero the padding embedding row."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(
        self,
        tokens: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        """Run this stage's slice of the decoder.

        `tokens` is token ids on stage 0, and already-embedded hidden states
        on later stages (where `embed_tokens` is None).
        """
        # Embedding
        hidden_states = (
            self.embed_tokens(tokens) if self.embed_tokens is not None else tokens
        )

        # decoder layers
        for decoder_layer in self.layers.values():
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
            )

        # Final norm only on the last stage; intermediate stages pass raw
        # hidden states to the next stage.
        hidden_states = (
            self.norm(hidden_states) if self.norm is not None else hidden_states
        )
        return hidden_states
1202
+
1203
+
1204
class DeepseekForCausalLM(torch.nn.Module):
    """Causal-LM wrapper: pipeline-stage `DeepseekModel` plus an LM head that
    exists only on the last pipeline stage."""

    def __init__(self, config):
        super().__init__()
        self.model = DeepseekModel(config)
        # The unembedding projection is only materialized on the final stage;
        # earlier stages return hidden states unchanged.
        self.lm_head = (
            nn.Linear(config.hidden_size, config.vocab_size, bias=False)
            if config.stage_idx == config.num_stages - 1
            else None
        )

        # Initialize weights and apply final processing
        # self.post_init()

    def forward(
        self,
        tokens: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> Tuple:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, DeepseekForCausalLM

        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        hidden_states = self.model(
            tokens,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        # Logits on the last stage; pass-through hidden states elsewhere.
        logits = (
            self.lm_head(hidden_states) if self.lm_head is not None else hidden_states
        )
        return logits

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        **kwargs,
    ):
        # HF-generate style hook: crop `input_ids`/`attention_mask` against the
        # KV cache and synthesize `position_ids` from the padding mask.
        if past_key_values is not None:
            # Assuming isinstance(past_key_values, Cache):
            cache_length = past_key_values.get_seq_length()
            past_length = past_key_values.seen_tokens
            max_cache_length = past_key_values.get_max_length()

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if (
                attention_mask is not None
                and attention_mask.shape[1] > input_ids.shape[1]
            ):
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            # Padding positions get a dummy value of 1 (they are masked out anyway).
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # Reorder every layer's cached key/value tensors along the batch dim
        # to follow beam-search reordering.
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(
                    past_state.index_select(0, beam_idx.to(past_state.device))
                    for past_state in layer_past
                ),
            )
        return reordered_past

    # Setup Symmetric Memory for MoE token shuffle.
    # Supports inference currently.
    def setup_symm_mem(self, dtype: torch.dtype, device: torch.device):
        # Delegate to each MoE feed-forward block; dense-MLP layers need none.
        for layer in self.model.layers.values():
            if not isinstance(layer.mlp, MoE):
                continue
            layer.mlp.setup_symm_mem(dtype, device)
torchtitan/experiments/deepseek_v3/symm_mem_recipes/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .triton_on_device_all_to_all_v import OnDeviceAllToAllV
8
+
9
+ __all__ = [
10
+ "OnDeviceAllToAllV",
11
+ ]
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_barrier.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from .triton_utils import get_flat_bid, get_flat_tid
11
+
12
+
13
@triton.jit
def send_signal(addrs, sem: tl.constexpr):
    """Set each 32-bit signal slot in `addrs` from 0 to 1 on the remote peer.

    Spins on a sys-scope CAS(0 -> 1) until the slot is free (i.e. the previous
    signal has been consumed by `wait_signal`). `sem` selects the memory
    ordering of the atomic: "relaxed" or "acq_rel" (release on the send side).
    """
    if sem == "relaxed":
        tl.inline_asm_elementwise(
            """
            {
                .reg .u32   %tmp32_<1>;
                .reg .pred  %p<1>;

                send_signal:
                    atom.global.relaxed.sys.cas.b32 %tmp32_0, [$1], 0, 1;
                    setp.eq.u32 %p0, %tmp32_0, 0;
                    @!%p0 bra send_signal;
            }
            """,
            "=r, l",
            [addrs],
            dtype=tl.int32,
            is_pure=False,
            pack=1,
        )
    elif sem == "acq_rel":
        tl.inline_asm_elementwise(
            """
            {
                .reg .u32   %tmp32_<1>;
                .reg .pred  %p<1>;

                send_signal:
                    atom.global.release.sys.cas.b32 %tmp32_0, [$1], 0, 1;
                    setp.eq.u32 %p0, %tmp32_0, 0;
                    @!%p0 bra send_signal;
            }
            """,
            "=r, l",
            [addrs],
            dtype=tl.int32,
            is_pure=False,
            pack=1,
        )
    else:
        raise RuntimeError(f"Unrecognized sem: {sem}")
55
+
56
+
57
@triton.jit
def wait_signal(addrs, sem: tl.constexpr):
    """Consume each 32-bit signal slot in `addrs`, resetting it from 1 to 0.

    Spins on a sys-scope CAS(1 -> 0) until the peer's `send_signal` has set
    the slot. `sem` selects the memory ordering of the atomic: "relaxed" or
    "acq_rel" (acquire on the wait side). The CAS reset leaves the signal pad
    zero-filled after a successful round, which keeps the barrier
    CUDA-graph friendly.
    """
    if sem == "relaxed":
        tl.inline_asm_elementwise(
            """
            {
                .reg .u32   %tmp32_<1>;
                .reg .pred  %p<1>;

                wait_signal:
                    atom.global.sys.relaxed.cas.b32 %tmp32_0, [$1], 1, 0;
                    setp.eq.u32 %p0, %tmp32_0, 1;
                    @!%p0 bra wait_signal;
            }
            """,
            "=r, l",
            [addrs],
            dtype=tl.int32,
            is_pure=False,
            pack=1,
        )
    elif sem == "acq_rel":
        tl.inline_asm_elementwise(
            """
            {
                .reg .u32   %tmp32_<1>;
                .reg .pred  %p<1>;

                wait_signal:
                    atom.global.sys.acquire.cas.b32 %tmp32_0, [$1], 1, 0;
                    setp.eq.u32 %p0, %tmp32_0, 1;
                    @!%p0 bra wait_signal;
            }
            """,
            "=r, l",
            [addrs],
            dtype=tl.int32,
            is_pure=False,
            pack=1,
        )
    else:
        raise RuntimeError(f"Unrecognized sem: {sem}")
99
+
100
+
101
@triton.jit
def blockwise_barrier(
    signal_pad_ptrs,
    block_id,
    rank: tl.constexpr,
    world_size: tl.constexpr,
    sem: tl.constexpr,
):
    """
    Synchronizes blocks with matching block_id across participating devices.

    Note: the function itself is not a system level barrier/fence. It is a
    building block for expressing different synchronization patterns.

    Pattern 0: Ensures that all writes to symm_mem buffers from previous
    kernels across all devices are visible to the current kernel:

        blockwise_barrier(..., sem="relaxed")
        sync_threads()

    Pattern 1: Ensures that all writes to symm_mem buffers from the current
    block are visible to all remote blocks with matching blockIdx:

        sync_threads()
        blockwise_barrier(..., sem="acq_rel")
        sync_threads()

    Pattern 2: Ensures that symm_mem buffers read by the current kernel are safe
    for writing by subsequent kernels across all devices.

        sync_threads()
        blockwise_barrier(..., sem="relaxed")

    CUDA graph friendliness:

        This barrier operates through atomic operations on a zero-filled signal
        pad, which resets to a zero-filled state after each successful
        synchronization. This design eliminates the need for incrementing a
        flag from host.
    """
    # Default to the caller's flattened block index when no explicit id given.
    if block_id is None:
        block_id = get_flat_bid()
    flat_tid = get_flat_tid()

    # `signal_pad_ptrs` is an array of per-rank signal-pad base addresses
    # (one uint64 pointer per rank in the group).
    remote_ranks = tl.arange(0, world_size)
    signal_pad_ptrs = signal_pad_ptrs.to(tl.pointer_type(tl.uint64))
    remote_signal_pad_addrs = tl.load(signal_pad_ptrs + remote_ranks).to(
        tl.pointer_type(tl.uint32)
    )
    # Slot [block_id, rank] on each remote pad: "rank has arrived at block_id".
    send_addrs = remote_signal_pad_addrs + block_id * world_size + rank

    local_signal_pad_addr = tl.load(signal_pad_ptrs + rank).to(
        tl.pointer_type(tl.uint32)
    )
    # Slots [block_id, 0..world_size) on our own pad: arrivals from all peers.
    wait_addrs = local_signal_pad_addr + block_id * world_size + remote_ranks

    # One thread per remote rank performs the signal/wait pair.
    if flat_tid < world_size:
        send_signal(send_addrs, sem)
        wait_signal(wait_addrs, sem)
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_on_device_all_to_all_v.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ import torch.distributed._symmetric_memory as symm_mem
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ from .triton_barrier import blockwise_barrier
14
+ from .triton_utils import sync_threads
15
+
16
+
17
+ @triton.jit
18
+ def _exchange_row_offsets(
19
+ split_sizes_ptrs,
20
+ rank: tl.constexpr,
21
+ world_size: tl.constexpr,
22
+ BLOCKS_PER_REMOTE_RANK: tl.constexpr,
23
+ ):
24
+ remote_rank = tl.program_id(0) // BLOCKS_PER_REMOTE_RANK
25
+
26
+ # split_sizes_ptr for all ranks
27
+ # All these vector stacks into split_sizes_matrix
28
+ split_sizes_ptrs = split_sizes_ptrs.to(tl.pointer_type(tl.uint64))
29
+
30
+ # split_sizes_matrix[remote_rank, :]
31
+ input_split_sizes_ptr = tl.load(split_sizes_ptrs + remote_rank).to(
32
+ tl.pointer_type(tl.int64)
33
+ )
34
+
35
+ offsets_ = tl.arange(0, world_size)
36
+ input_split_sizes = tl.load(
37
+ input_split_sizes_ptr + offsets_, mask=offsets_ <= rank, other=0
38
+ )
39
+
40
+ num_rows = tl.load(input_split_sizes_ptr + rank)
41
+ input_row_offset = tl.sum(input_split_sizes) - num_rows
42
+
43
+ # split_sizes_matrix[:, rank]
44
+ output_split_sizes_ptrs = (
45
+ tl.load(split_sizes_ptrs + offsets_).to(tl.pointer_type(tl.int64)) + rank
46
+ )
47
+ output_split_sizes = tl.load(
48
+ output_split_sizes_ptrs, mask=offsets_ <= remote_rank, other=0
49
+ )
50
+ output_row_offset = tl.sum(output_split_sizes) - num_rows
51
+
52
+ return input_row_offset, output_row_offset, num_rows
53
+
54
+
55
+ @triton.jit
56
+ def on_device_all_to_all_v_kernel(
57
+ output_ptr,
58
+ output_splits_ptr,
59
+ input_ptrs,
60
+ input_splits_ptr,
61
+ signal_pad_ptrs,
62
+ dim: tl.constexpr, # Separate dim for easier vectorization
63
+ rank: tl.constexpr,
64
+ world_size: tl.constexpr,
65
+ BLOCKS_PER_REMOTE_RANK: tl.constexpr,
66
+ UNROLL_FACTOR: tl.constexpr,
67
+ BLOCK_SIZE: tl.constexpr,
68
+ ):
69
+ blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
70
+ sync_threads()
71
+
72
+ remote_rank = tl.program_id(0) // BLOCKS_PER_REMOTE_RANK
73
+ block_offset = tl.program_id(0) % BLOCKS_PER_REMOTE_RANK
74
+
75
+ input_row_offset, output_row_offset, num_rows = _exchange_row_offsets(
76
+ input_splits_ptr, rank, world_size, BLOCKS_PER_REMOTE_RANK
77
+ )
78
+
79
+ output_splits_ptr = output_splits_ptr.to(tl.pointer_type(tl.uint64))
80
+ if block_offset == 0:
81
+ # Update output_splits
82
+ tl.store(output_splits_ptr + remote_rank, num_rows)
83
+
84
+ input_ptr = (
85
+ tl.load(input_ptrs.to(tl.pointer_type(tl.uint64)) + remote_rank).to(
86
+ tl.pointer_type(tl.bfloat16)
87
+ )
88
+ + input_row_offset * dim
89
+ )
90
+ output_ptr = output_ptr + output_row_offset * dim
91
+
92
+ outer_loop_step = BLOCK_SIZE * UNROLL_FACTOR
93
+ outer_loop_iters_per_rank = tl.cdiv(
94
+ tl.cdiv(num_rows * dim, outer_loop_step), BLOCKS_PER_REMOTE_RANK
95
+ )
96
+ numel_per_rank = outer_loop_step * outer_loop_iters_per_rank
97
+ offset = numel_per_rank * block_offset
98
+ end = tl.minimum(numel_per_rank * (block_offset + 1), num_rows * dim)
99
+
100
+ unroll_region_size = (end - offset) // outer_loop_step * outer_loop_step
101
+ for i in tl.range(offset, offset + unroll_region_size, outer_loop_step):
102
+ datas = []
103
+ for j in tl.range(
104
+ i,
105
+ i + outer_loop_step,
106
+ BLOCK_SIZE,
107
+ loop_unroll_factor=UNROLL_FACTOR,
108
+ ):
109
+ offsets = j + tl.arange(0, BLOCK_SIZE)
110
+ data = tl.load(input_ptr + offsets)
111
+ tl.store(output_ptr + offsets, data)
112
+
113
+ offset += unroll_region_size
114
+ while offset < end:
115
+ offsets = offset + tl.arange(0, BLOCK_SIZE)
116
+ mask = offsets < num_rows * dim
117
+ data = tl.load(input_ptr + offsets, mask=mask)
118
+ tl.store(output_ptr + offsets, data, mask=mask)
119
+ offset += BLOCK_SIZE
120
+
121
+ sync_threads()
122
+ blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
123
+ return
124
+
125
+
126
def _on_device_all_to_all_v(
    output: torch.Tensor,
    output_splits: torch.Tensor,
    input: torch.Tensor,
    input_splits: torch.Tensor,
    group: dist.ProcessGroup = dist.group.WORLD,
    BLOCKS_PER_REMOTE_RANK=8,
    UNROLL_FACTOR: int = 8,
    BLOCK_SIZE: int = 16384,
):
    """Launch the on-device all-to-all-v Triton kernel.

    Args:
        output: pre-allocated 2-D destination tensor (rows x dim); must be
            large enough to hold all rows received from the group.
        output_splits: 1-D tensor; filled with the number of rows received
            from each rank.
        input: 2-D source tensor; must be allocated from symmetric memory so
            peers can read it directly (rendezvous'd below).
        input_splits: 1-D tensor, rows destined for each rank; must also be a
            symmetric-memory tensor.
        group: process group scoping the exchange.
        BLOCKS_PER_REMOTE_RANK / UNROLL_FACTOR / BLOCK_SIZE: kernel tuning
            knobs (blocks dedicated per peer, inner-loop unrolling, elements
            processed per tile).

    Returns:
        `output`, filled in place.
    """
    assert output.dim() == 2, f"{output.shape}"
    assert input.dim() == 2, f"{input.shape}"
    assert output.shape[1] == input.shape[1]

    dim = output.shape[1]
    # Rendezvous exposes each rank's buffer and signal-pad device pointers to
    # its peers; all ranks must call this collectively.
    input_hdl = symm_mem.rendezvous(input, group=group)
    input_splits_hdl = symm_mem.rendezvous(input_splits, group=group)

    # One group of BLOCKS_PER_REMOTE_RANK program instances per peer rank.
    num_blocks = input_hdl.world_size * BLOCKS_PER_REMOTE_RANK
    kernel = on_device_all_to_all_v_kernel[(num_blocks, 1, 1)](
        output,
        output_splits,
        input_hdl.buffer_ptrs_dev,
        input_splits_hdl.buffer_ptrs_dev,
        input_hdl.signal_pad_ptrs_dev,
        dim=dim,
        rank=input_hdl.rank,
        world_size=input_hdl.world_size,
        BLOCKS_PER_REMOTE_RANK=BLOCKS_PER_REMOTE_RANK,
        UNROLL_FACTOR=UNROLL_FACTOR,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=16,
    )
    # log_triton_kernel(kernel)
    return output
161
+
162
+
163
class OnDeviceAllToAllV(torch.autograd.Function):
    """Autograd-aware all-to-all-v implemented with on-device (symmetric
    memory) kernels rather than host-initiated collectives.

    Class-level buffers are shared across calls; `max_output_len` must be set
    by the user before the first forward.
    """

    # A symmetric memory holding the grad_output during backward
    grad_output_buf = None
    # A symmetric memory for exchanging split sizes during both forward and backward
    splits_buf = None
    # Maximum output length (need to be set before use of OnDeviceAllToAllV)
    max_output_len = None

    @staticmethod
    def forward(
        ctx,
        input: torch.Tensor,
        input_splits: torch.Tensor,
        group: dist.ProcessGroup = dist.group.WORLD,
    ):
        """
        Args:
            input: input tensor with data for all ranks concatenated.
                NOTE(review): `input` is passed straight to the symm-mem
                rendezvous, so it must itself be a symmetric-memory
                allocation — confirm against callers.
            input_splits: input splits of shape (group.world_size,)
            group: process group to scope the collective.

        Returns:
            (output, output_splits). `output` has `max_output_len` rows; only
            `sum(output_splits)` of them carry received data.
        """
        # Initialize input splits buffer (one time only)
        if OnDeviceAllToAllV.splits_buf is None:
            OnDeviceAllToAllV.splits_buf = symm_mem.empty(
                *input_splits.shape,
                dtype=input_splits.dtype,
                device=input_splits.device,
            )

        if OnDeviceAllToAllV.max_output_len is None:
            raise RuntimeError(
                "Please set max output length via `OnDeviceAllToAllV.max_output_len = ...`"
            )

        # Allocate output buffer
        output = input.new_empty(OnDeviceAllToAllV.max_output_len, *input.shape[1:])
        # Allocate output splits tensor
        output_splits = torch.empty_like(input_splits)
        # Copy input splits to the (symmetric) buffer so peers can read them
        OnDeviceAllToAllV.splits_buf.copy_(input_splits)

        # Shuffle input to output
        _on_device_all_to_all_v(
            output, output_splits, input, OnDeviceAllToAllV.splits_buf, group=group
        )

        # Output splits in forward is the input splits in backward
        ctx.save_for_backward(output_splits)
        ctx.group = group
        ctx.input_shape = input.shape
        return output, output_splits

    @staticmethod
    def backward(ctx, grad_output, grad_splits):
        """
        Backward is implemented as a shuffle of the output's gradients to the input.
        Args:
            `grad_output`: output's gradients passed from the downstream.
            `grad_splits`: unused.

        Returns:
            (grad_input, None, None) matching forward's
            (input, input_splits, group) arguments.
        """

        # Initialize grad_output buffer (one time only)
        if OnDeviceAllToAllV.grad_output_buf is None:
            assert (
                OnDeviceAllToAllV.max_output_len is not None
            ), "`max_output_len` not set"
            OnDeviceAllToAllV.grad_output_buf = symm_mem.empty(
                OnDeviceAllToAllV.max_output_len,
                *grad_output.shape[1:],
                dtype=grad_output.dtype,
                device=grad_output.device,
            )

        # TODO: is there a way to tell autograd to feed grad_output directly to
        # our symm_mem buffer?
        # Stage the incoming gradients into symmetric memory so peers can pull.
        OnDeviceAllToAllV.grad_output_buf.narrow(0, 0, grad_output.shape[0]).copy_(
            grad_output
        )

        # Size info: forward's output splits are backward's input splits.
        (grad_output_splits,) = ctx.saved_tensors
        OnDeviceAllToAllV.splits_buf.copy_(grad_output_splits)
        grad_input_splits = torch.empty_like(grad_output_splits)  # unused
        grad_input = grad_output.new_empty(*ctx.input_shape)

        # Shuffle gradients back to the input
        _on_device_all_to_all_v(
            grad_input,
            grad_input_splits,
            OnDeviceAllToAllV.grad_output_buf,
            OnDeviceAllToAllV.splits_buf,
            group=ctx.group,
        )
        return grad_input, None, None
257
+
258
+
259
# Functional alias so call sites can write
# `on_device_all_to_all_v(input, input_splits, group)` instead of
# `OnDeviceAllToAllV.apply(...)`.
on_device_all_to_all_v = OnDeviceAllToAllV.apply
torchtitan/experiments/deepseek_v3/train.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # torchrun --standalone --nproc-per-node 8 run.py
8
+ import torch
9
+ import torch.distributed as dist
10
+ from checkpoint import load_weights_from_hf
11
+ from model import DeepseekForCausalLM
12
+ from model_config import deepseek_config_registry
13
+
14
+ from torch.distributed.device_mesh import DeviceMesh
15
+ from torch.distributed.fsdp import fully_shard
16
+ from torch.distributed.pipelining import PipelineStage, Schedule1F1B
17
+
18
+
19
# Use DeepSeek-V2-Lite as a proxy.
# This id is both the key into `deepseek_config_registry` and the HF repo
# that `load_weights_from_hf` pulls weights from.
model_id = "deepseek-ai/DeepSeek-V2-Lite"
21
+
22
+
23
# Run full model
def run_full_model(
    mesh: DeviceMesh,
):
    """Build, parallelize (PP + EP + FSDP/HSDP) and run a few synthetic
    forward/backward steps of the DeepSeek model.

    Args:
        mesh: 3-D device mesh with dims ("pp", "ep", "fsdp").
    """
    rank = dist.get_rank()
    device_count = torch.cuda.device_count()
    # Map ranks round-robin onto local GPUs.
    device = torch.device("cuda", rank % device_count)

    pp_mesh = mesh["pp"]
    ep_mesh = mesh["ep"]
    pp_rank = pp_mesh.get_local_rank()
    ep_rank = ep_mesh.get_local_rank()
    pp_size = pp_mesh.size()
    ep_size = ep_mesh.size()

    # Get model configs
    model_args = deepseek_config_registry[model_id]
    # [Note]: I am making the model smaller for testing / avoiding OOM. If you
    # have sufficient GPUs for model parallelism, you can remove this line.
    model_args.num_hidden_layers = 16

    # Apply model parallelism: expert parallel degree plus this rank's
    # pipeline-stage assignment.
    model_args.ep_size = ep_size
    model_args.num_stages = pp_size
    model_args.stage_idx = pp_rank
    print(model_args)

    # Instantiate model under the target device/mesh contexts.
    with device, mesh:
        model = DeepseekForCausalLM(model_args)

    # Load weights
    load_weights_from_hf(model, model_id, device)
    model.train()

    # Apply data parallelism
    fsdp_mesh = mesh["fsdp"]
    hsdp_mesh = mesh["ep", "fsdp"]
    # Using `reshard_after_forward=False` to implement Zero-2, i.e. sharding the
    # optimizer (Zero-1) and gradients (Zero-2), but not the model weights.
    # Reason: the MoE is "sparsely activated" compared to the dense model, thus
    # it would be uneconomical to re-gather the weights.
    for layer in model.model.layers.values():
        # Apply FSDP to experts
        if hasattr(layer.mlp, "experts"):
            for expert in layer.mlp.experts.values():
                fully_shard(expert, mesh=fsdp_mesh, reshard_after_forward=False)
        # Apply HSDP to other parts such as attention, layernorm, because they
        # are doing DDP on EP dimension
        fully_shard(layer, mesh=hsdp_mesh, reshard_after_forward=False)

    # Apply HSDP on root model (lm_head, embeddings, etc)
    fully_shard(model, mesh=hsdp_mesh, reshard_after_forward=False)

    # Synthetic setting
    microbatches = pp_size * 2

    # Use Symmetric Memory for MoE token shuffle.
    # TODO: we are rewriting `moe_on_device` function. `setup_symm_mem` is
    # currently supported for forward only. See `generate.py`.
    # model.setup_symm_mem(torch.bfloat16, device)

    # Example inputs (seeded per EP rank so data-parallel replicas differ).
    torch.manual_seed(ep_rank)
    bs = 4
    seqlen = 128
    x = torch.randint(model_args.vocab_size, (microbatches * bs, seqlen), device=device)
    # Random soft labels of shape (batch, seq, vocab).
    label = torch.rand(microbatches * bs, seqlen, model_args.vocab_size, device=device)

    # Create loss function
    # NOTE(review): F.cross_entropy expects the class dimension at position 1;
    # with (batch, seq, vocab) logits/targets this relies on the output layout
    # matching the loss contract — confirm shapes.
    loss_fn = torch.nn.functional.cross_entropy

    # Run forward and backward
    steps = 2
    for _ in range(steps):
        if pp_size > 1:
            # Create pipeline stage
            stage = PipelineStage(
                model,
                pp_rank,
                pp_size,
                device,
                group=pp_mesh.get_group(),
            )

            # Create pipeline schedule
            losses = []
            pp_schedule = Schedule1F1B(stage, microbatches, loss_fn=loss_fn)

            # First stage feeds inputs; last stage supplies targets and
            # collects per-microbatch losses; middle stages just step.
            # (The schedule runs backward internally.)
            if pp_rank == 0:
                y = pp_schedule.step(x)
            elif pp_rank == pp_size - 1:
                y = pp_schedule.step(target=label, losses=losses)
                loss = torch.mean(torch.stack(losses))
            else:
                pp_schedule.step()
        else:
            # Single-stage path: run the whole model and backprop manually.
            y = model(x)
            loss = loss_fn(y, label)
            loss.backward()

        if pp_rank == pp_size - 1:
            print(f"logits: {y.shape}")
            print(f"{loss=}")

        if pp_rank == 0:
            # Spot-check that gradients reached the first stage's weights.
            param = model.get_parameter("model.layers.0.self_attn.q_proj.weight")
            print(f"{torch.linalg.norm(param.grad)=}")

        model.zero_grad()

    print("Backward done")
135
+
136
+
137
if __name__ == "__main__":
    # 3-D mesh: pipeline (pp) x expert (ep) x data (fsdp) parallelism.
    # Requires exactly 8 ranks (2 * 2 * 2); launch with e.g.
    # `torchrun --standalone --nproc-per-node 8 train.py`.
    mesh = dist.init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("pp", "ep", "fsdp"))

    run_full_model(mesh)

    dist.destroy_process_group()
torchtitan/experiments/flux/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FLUX model in torchtitan
2
+
3
+ ## Overview
4
+
5
+ ## Usage
6
+ First, download the autoencoder model from HuggingFace with your own access token:
7
+ ```bash
8
+ python torchtitan/experiments/flux/scripts/download_autoencoder.py --repo_id black-forest-labs/FLUX.1-dev --ae_path ae.safetensors --hf_token <your_access_token>
9
+ ```
10
+ This step will download the autoencoder model from HuggingFace and save it to the `torchtitan/experiments/flux/assets/autoencoder/ae.safetensors` file.
11
+
12
+ Run the following command to train the model on a single GPU:
13
+ ```bash
14
+ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node=1 torchtitan/experiments/flux/train.py --job.config_file torchtitan/experiments/flux/train_configs/debug_model.toml
15
+ ```
16
+
17
+ ## TODO
18
+ - [ ] Support for multiple GPUs is coming soon (FSDP, etc.)
19
+ - [ ] Implement test cases in CI for FLUX model. Adding more unit tests for FLUX model (eg, unit test for preprocessor, etc)
20
+ - [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc.)
21
+ - [ ] Support for distributed checkpointing and loading
22
+ - [ ] Implement init_weights() function to initialize the model weights
23
+ - [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function
torchtitan/experiments/flux/__init__.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
8
+
9
+ from torchtitan.components.lr_scheduler import build_lr_schedulers
10
+ from torchtitan.components.optimizer import build_optimizers
11
+ from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
12
+ from torchtitan.experiments.flux.loss import build_mse_loss
13
+ from torchtitan.experiments.flux.model.autoencoder import AutoEncoderParams
14
+ from torchtitan.experiments.flux.parallelize_flux import parallelize_flux
15
+ from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
16
+
17
+ from .model.model import FluxModel, FluxModelArgs
18
+
19
__all__ = [
    "FluxModelArgs",
    "FluxModel",
    "flux_configs",
    "parallelize_flux",
]


# Canonical Flux model configurations, keyed by `--model.flavor`.
# All flavors share the same autoencoder configuration.
flux_configs = {
    # Full-size dev model (guidance-distilled: guidance_embed=True).
    "flux-dev": FluxModelArgs(
        in_channels=64,
        out_channels=64,
        vec_in_dim=768,
        context_in_dim=512,
        hidden_size=3072,
        mlp_ratio=4.0,
        num_heads=24,
        depth=19,
        depth_single_blocks=38,
        axes_dim=(16, 56, 56),
        theta=10_000,
        qkv_bias=True,
        guidance_embed=True,
        autoencoder_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=(1, 2, 4, 4),
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
    # Schnell model: no guidance embedding.
    # NOTE(review): context_in_dim differs from flux-dev (4096 vs 512); this
    # tracks the T5 encoder width used per flavor — verify when swapping encoders.
    "flux-schnell": FluxModelArgs(
        in_channels=64,
        out_channels=64,
        vec_in_dim=768,
        context_in_dim=4096,
        hidden_size=3072,
        mlp_ratio=4.0,
        num_heads=24,
        depth=19,
        depth_single_blocks=38,
        axes_dim=(16, 56, 56),
        theta=10_000,
        qkv_bias=True,
        guidance_embed=False,
        autoencoder_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=(1, 2, 4, 4),
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
    # Tiny configuration for CI / local debugging.
    "flux-debug": FluxModelArgs(
        in_channels=64,
        out_channels=64,
        vec_in_dim=768,
        context_in_dim=512,
        hidden_size=512,
        mlp_ratio=4.0,
        num_heads=4,
        depth=2,
        depth_single_blocks=2,
        axes_dim=(16, 56, 56),
        theta=10_000,
        qkv_bias=True,
        guidance_embed=True,
        autoencoder_params=AutoEncoderParams(
            resolution=256,
            in_channels=3,
            ch=128,
            out_ch=3,
            ch_mult=(1, 2, 4, 4),
            num_res_blocks=2,
            z_channels=16,
            scale_factor=0.3611,
            shift_factor=0.1159,
        ),
    ),
}


# Register Flux with torchtitan's train-spec registry so `--model.name flux`
# resolves to these components (no pipelining/tokenizer; MSE loss).
register_train_spec(
    TrainSpec(
        name="flux",
        cls=FluxModel,
        config=flux_configs,
        parallelize_fn=parallelize_flux,
        pipelining_fn=None,
        build_optimizers_fn=build_optimizers,
        build_lr_schedulers_fn=build_lr_schedulers,
        build_dataloader_fn=build_flux_dataloader,
        build_tokenizer_fn=None,
        build_loss_fn=build_mse_loss,
    )
)
torchtitan/experiments/flux/flux_argparser.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+
9
+ import torch
10
+
11
+
12
def _torch_dtype(value: str) -> torch.dtype:
    """argparse converter mapping 'bfloat16' or 'torch.bfloat16' to a torch.dtype."""
    name = value.removeprefix("torch.")
    dtype = getattr(torch, name, None)
    if not isinstance(dtype, torch.dtype):
        raise argparse.ArgumentTypeError(f"{value!r} is not a valid torch dtype")
    return dtype


def extend_parser(parser: argparse.ArgumentParser) -> None:
    """Register Flux-specific command-line arguments on *parser*.

    Adds `--training.guidance` and the `--encoder.*` options consumed by the
    Flux trainer and dataloader.
    """
    parser.add_argument(
        "--training.guidance",
        type=float,
        default=3.5,
        help="guidance value used for guidance distillation",
    )
    parser.add_argument(
        "--encoder.t5_encoder",
        type=str,
        default="google/t5-v1_1-small",
        help="T5 encoder to use, HuggingFace model name.",
    )
    parser.add_argument(
        "--encoder.clip_encoder",
        type=str,
        default="openai/clip-vit-large-patch14",
        help="Clip encoder to use, HuggingFace model name.",
    )
    parser.add_argument(
        "--encoder.encoder_dtype",
        # BUGFIX: was `type=torch.dtype`, which is not callable on a string —
        # argparse would raise "cannot create 'torch.dtype' instances" whenever
        # the flag was actually passed. Use a proper string->dtype converter.
        type=_torch_dtype,
        default=torch.bfloat16,
        help="Which dtype to load for autoencoder. ",
    )
    parser.add_argument(
        "--encoder.max_t5_encoding_len",
        type=int,
        default=512,
        help="Maximum length of the T5 encoding.",
    )
torchtitan/experiments/flux/loss.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Callable, TypeAlias
8
+
9
+ import torch
10
+
11
+ from torchtitan.config_manager import JobConfig
12
+ from torchtitan.tools.logging import logger
13
+
14
+ LossFunction: TypeAlias = Callable[..., torch.Tensor]
15
+
16
+
17
def mse_loss(pred: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Mean-squared-error loss in fp32, with gradients blocked through labels."""
    prediction_fp32 = pred.float()
    target_fp32 = labels.detach().float()
    return torch.nn.functional.mse_loss(prediction_fp32, target_fp32)
20
+
21
+
22
def build_mse_loss(job_config: JobConfig):
    """Return the MSE loss function, optionally wrapped in torch.compile.

    Args:
        job_config: job configuration; only `training.compile` is consulted.

    Returns:
        A `LossFunction` callable (``mse_loss`` or its compiled wrapper).
    """
    loss_fn = mse_loss
    if job_config.training.compile:
        logger.info("Compiling the loss function with torch.compile")
        loss_fn = torch.compile(loss_fn)
    return loss_fn
torchtitan/experiments/flux/parallelize_flux.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This file applies the PT-D parallelisms (except pipeline parallelism) and various
8
+ # training techniques (e.g. activation checkpointing and compile) to the Llama model.
9
+
10
+
11
+ import torch.nn as nn
12
+
13
+ from torch.distributed.device_mesh import DeviceMesh
14
+
15
+ from torchtitan.config_manager import JobConfig
16
+ from torchtitan.distributed import ParallelDims
17
+
18
+
19
def parallelize_flux(
    model: nn.Module,
    world_mesh: DeviceMesh,
    parallel_dims: ParallelDims,
    job_config: JobConfig,
):
    """Apply parallelisms and training techniques to the Flux model.

    Currently a placeholder: no parallelism is applied and the model is
    returned unchanged. `world_mesh`, `parallel_dims` and `job_config` are
    accepted for interface compatibility with the train-spec registry.
    """
    # TODO: Add model parallel strategy here
    return model
torchtitan/experiments/flux/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ transformers
2
+ einops
torchtitan/experiments/flux/scripts/download_autoencoder.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Optional
8
+
9
+ from requests.exceptions import HTTPError
10
+
11
+
12
def hf_download(
    repo_id: str, file_path: str, local_dir: str, hf_token: Optional[str] = None
) -> None:
    """Download a single file from a HuggingFace Hub repository.

    Args:
        repo_id: repository to download from (e.g. "black-forest-labs/FLUX.1-dev").
        file_path: file path relative to the repository root.
        local_dir: local directory the file is saved under.
        hf_token: optional HF access token for gated/private repos.

    On a 401 response the error is reported (best-effort) and swallowed so the
    caller gets an actionable message; any other HTTP error propagates.
    """
    from huggingface_hub import hf_hub_download

    try:
        hf_hub_download(
            repo_id=repo_id,
            filename=file_path,
            local_dir=local_dir,
            local_dir_use_symlinks=False,
            token=hf_token,
        )
    except HTTPError as e:
        # `response` can be None on some transport failures — guard before use.
        if e.response is not None and e.response.status_code == 401:
            print(
                "You need to pass a valid `--hf_token=...` to download private checkpoints."
            )
        else:
            # Bare `raise` (not `raise e`) preserves the original traceback.
            raise
32
+
33
+
34
if __name__ == "__main__":
    import argparse

    # BUGFIX: the description said "Download tokenizer", but this script
    # downloads the Flux autoencoder.
    parser = argparse.ArgumentParser(description="Download autoencoder from HuggingFace.")
    parser.add_argument(
        "--repo_id",
        type=str,
        default="black-forest-labs/FLUX.1-dev",
        help="Repository ID to download from. default to Flux-dev model",
    )
    parser.add_argument(
        "--ae_path",
        type=str,
        default="ae.safetensors",
        help="the autoencoder path relative to repo_id",
    )
    parser.add_argument(
        "--hf_token", type=str, default=None, help="HuggingFace API token"
    )
    parser.add_argument(
        "--local_dir",
        type=str,
        default="torchtitan/experiments/flux/assets/autoencoder/",
        help="local directory to save the autoencoder",
    )

    args = parser.parse_args()
    hf_download(args.repo_id, args.ae_path, args.local_dir, args.hf_token)
torchtitan/experiments/flux/tests/test_flux_dataloader.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+ from torchtitan.config_manager import JobConfig
10
+ from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
11
+ from torchtitan.tools.profiling import (
12
+ maybe_enable_memory_snapshot,
13
+ maybe_enable_profiling,
14
+ )
15
+
16
+
17
class TestFluxDataLoader:
    """Smoke test for the Flux dataloader.

    NOTE(review): requires network access (cc12m dataset, HF tokenizers); it
    is an integration test rather than a unit test.
    """

    def test_flux_dataloader(self):
        dataset_name = "cc12m"
        batch_size = 32
        world_size = 4
        rank = 0

        num_steps = 10

        # Register Flux-specific CLI flags before parsing the job config.
        path = "torchtitan.experiments.flux.flux_argparser"
        sys.argv.append(f"--experimental.custom_args_module={path}")
        config = JobConfig()
        config.maybe_add_custom_args()
        config.parse_args(
            [
                # Profiling options
                # "--profiling.enable_profiling",
                # "--profiling.profile_freq",
                # "5",
                # "--profiling.enable_memory_snapshot",
                # "--profiling.save_memory_snapshot_folder",
                # "memory_snapshot_flux",
                "--training.dataset",
                dataset_name,
                "--training.batch_size",
                str(batch_size),
                "--encoder.t5_encoder",
                "google/t5-v1_1-small",
                "--encoder.clip_encoder",
                "openai/clip-vit-large-patch14",
                "--encoder.max_t5_encoding_len",
                "512",
            ]
        )

        # Optionally profile the iteration loop (both managers are no-ops
        # unless enabled via the commented flags above).
        with maybe_enable_profiling(
            config, global_step=0
        ) as torch_profiler, maybe_enable_memory_snapshot(
            config, global_step=0
        ) as memory_profiler:
            dl = self._build_dataloader(
                config,
                world_size,
                rank,
            )
            dl = iter(dl)

            for i in range(0, num_steps):
                input_data, labels = next(dl)
                print(f"Step {i} image size: {labels.shape}")
                if torch_profiler:
                    torch_profiler.step()
                if memory_profiler:
                    memory_profiler.step()

                print(len(input_data["clip_tokens"]))
                for k, v in input_data.items():
                    print(f"Step {i} {k} value: {type(v), v.shape}")

                assert len(input_data) == 2  # (clip_encodings, t5_encodings)
                assert labels.shape == (batch_size, 3, 256, 256)
                # assert input_data["clip_tokens"].shape[0] == batch_size
                # assert input_data["t5_tokens"].shape == (batch_size, 512, 512)

            if torch_profiler:
                torch_profiler.step()
            if memory_profiler:
                memory_profiler.step(exit_ctx=True)

    def test_preprocess(self):
        # TODO
        pass

    def _build_dataloader(
        self,
        job_config,
        world_size,
        rank,
    ):
        # Thin wrapper mapping the test's parallel coordinates onto the real
        # dataloader builder (no tokenizer, finite iteration).
        return build_flux_dataloader(
            dp_world_size=world_size,
            dp_rank=rank,
            job_config=job_config,
            tokenizer=None,
            infinite=False,
        )
torchtitan/experiments/flux/train.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ from typing import Optional
9
+
10
+ import torch
11
+
12
+ from torchtitan.config_manager import JobConfig
13
+ from torchtitan.distributed import utils as dist_utils
14
+ from torchtitan.experiments.flux.model.autoencoder import load_ae
15
+ from torchtitan.experiments.flux.model.hf_embedder import FluxEmbedder
16
+ from torchtitan.experiments.flux.model.model import FluxModel
17
+ from torchtitan.experiments.flux.utils import (
18
+ create_position_encoding_for_latents,
19
+ pack_latents,
20
+ preprocess_flux_data,
21
+ unpack_latents,
22
+ )
23
+ from torchtitan.tools.logging import init_logger, logger
24
+ from torchtitan.train import Trainer
25
+
26
+
27
class FluxTrainer(Trainer):
    """Trainer specialization for the Flux flow-matching text-to-image model.

    On top of the base `Trainer`, it owns three frozen preprocessing modules
    (image autoencoder, CLIP and T5 text encoders) and replaces `train_step`
    with the rectified-flow objective: predict the velocity `noise - latent`
    at a randomly sampled interpolation timestep.
    """

    def __init__(self, job_config: JobConfig):
        super().__init__(job_config)

        self.preprocess_fn = preprocess_flux_data
        # self.dtype = job_config.encoder.dtype
        # TODO: plumb the dtype through job config instead of hard-coding bf16.
        self._dtype = torch.bfloat16
        self._seed = job_config.training.seed
        self._guidance = job_config.training.guidance

        # load components
        model_config = self.train_spec.config[job_config.model.flavor]
        # Autoencoder is loaded on CPU; preprocessing moves tensors as needed.
        self.autoencoder = load_ae(
            job_config.encoder.auto_encoder_path,
            model_config.autoencoder_params,
            device="cpu",
            dtype=self._dtype,
        )
        self.clip_encoder = FluxEmbedder(version=job_config.encoder.clip_encoder).to(
            dtype=self._dtype
        )
        self.t5_encoder = FluxEmbedder(version=job_config.encoder.t5_encoder).to(
            dtype=self._dtype
        )

    def _predict_noise(
        self,
        model: FluxModel,
        latents: torch.Tensor,
        clip_encodings: torch.Tensor,
        t5_encodings: torch.Tensor,
        timesteps: torch.Tensor,
        guidance: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Use Flux's flow-matching model to predict the noise in image latents.
        Args:
            model (FluxFlowModel): The Flux flow model.
            latents (Tensor): Image encodings from the Flux autoencoder.
                Shape: [bsz, 16, latent height, latent width]
            clip_encodings (Tensor): CLIP text encodings.
                Shape: [bsz, 768]
            t5_encodings (Tensor): T5 text encodings.
                Shape: [bsz, sequence length, 256 or 512]
            timesteps (Tensor): The amount of noise (0 to 1).
                Shape: [bsz]
            guidance (Optional[Tensor]): The guidance value (1.5 to 4) if guidance-enabled model.
                Shape: [bsz]
                Default: None
        Returns:
            Tensor: The noise prediction.
            Shape: [bsz, 16, latent height, latent width]
        """
        bsz, _, latent_height, latent_width = latents.shape

        POSITION_DIM = 3  # constant for Flux flow model
        # Position-encoding / patching prep carries no gradient.
        with torch.no_grad():
            # Create positional encodings
            latent_pos_enc = create_position_encoding_for_latents(
                bsz, latent_height, latent_width, POSITION_DIM
            )
            text_pos_enc = torch.zeros(bsz, t5_encodings.shape[1], POSITION_DIM)

            # Convert latent into a sequence of patches
            latents = pack_latents(latents)

        # Predict noise (gradients flow through the model call).
        latent_noise_pred = model(
            img=latents,
            img_ids=latent_pos_enc.to(latents),
            txt=t5_encodings.to(latents),
            txt_ids=text_pos_enc.to(latents),
            y=clip_encodings.to(latents),
            timesteps=timesteps.to(latents),
            guidance=guidance.to(latents) if guidance is not None else None,
        )

        # Convert sequence of patches to latent shape
        latent_noise_pred = unpack_latents(
            latent_noise_pred, latent_height, latent_width
        )

        return latent_noise_pred

    def train_step(self, input_dict: dict[str, torch.Tensor], labels: torch.Tensor):
        """Run one rectified-flow optimization step on a batch.

        Args:
            input_dict: raw batch (text tokens etc.); the target image is
                injected under "image" and replaced by encoder outputs.
            labels: raw images; encoded to latents by the autoencoder below.
        """
        # generate t5 and clip encodings (and image latents) on device
        input_dict["image"] = labels
        input_dict = self.preprocess_fn(
            device=self.device,
            dtype=self._dtype,
            autoencoder=self.autoencoder,
            clip_encoder=self.clip_encoder,
            t5_encoder=self.t5_encoder,
            batch=input_dict,
            offload=True,
        )
        # After preprocessing, the regression target lives in latent space.
        labels = input_dict["img_encodings"]

        self.optimizers.zero_grad()

        # Keep these variables local to shorten the code as these are
        # the major variables that are used in the training loop.
        model_parts = self.model_parts
        world_mesh = self.world_mesh
        parallel_dims = self.parallel_dims

        clip_encodings = input_dict["clip_encodings"]
        t5_encodings = input_dict["t5_encodings"]

        bsz = labels.shape[0]

        with torch.no_grad():
            # Rectified-flow sample: interpolate latents and noise at a random
            # timestep; the regression target is the velocity (noise - data).
            noise = torch.randn_like(labels)
            timesteps = torch.rand((bsz,)).to(labels)
            sigmas = timesteps.view(-1, 1, 1, 1)
            noisy_latents = (1 - sigmas) * labels + sigmas * noise
            guidance = torch.full((bsz,), self._guidance).to(labels)

        target = noise - labels

        assert len(model_parts) == 1
        # TODO(jianiw): model_parts will be wrapped by FSDP, which will calculate
        model_parts[0] = model_parts[0].to(dtype=self._dtype)

        pred = self._predict_noise(
            model_parts[0],
            noisy_latents,
            clip_encodings,
            t5_encodings,
            timesteps,
            guidance,
        )
        loss = self.loss_fn(pred, target)
        # Free the large intermediates before backward to reduce peak memory.
        del (pred, noise, target)
        loss.backward()

        dist_utils.clip_grad_norm_(
            [p for m in model_parts for p in m.parameters()],
            self.job_config.training.max_norm,
            foreach=True,
            pp_mesh=self.world_mesh["pp"] if parallel_dims.pp_enabled else None,
        )
        self.checkpointer.maybe_wait_for_staging()
        self.optimizers.step()
        self.lr_schedulers.step()

        # log metrics
        if not self.metrics_processor.should_log(self.step):
            return

        if (
            parallel_dims.dp_replicate_enabled
            or parallel_dims.dp_shard_enabled
            or parallel_dims.cp_enabled
        ):
            # Reduce the loss across data/context-parallel ranks for logging.
            loss = loss.detach()
            global_avg_loss, global_max_loss = (
                dist_utils.dist_mean(loss, world_mesh["dp_cp"]),
                dist_utils.dist_max(loss, world_mesh["dp_cp"]),
            )
        else:
            global_avg_loss = global_max_loss = loss.item()

        self.metrics_processor.log(self.step, global_avg_loss, global_max_loss)
196
+
197
+
198
if __name__ == "__main__":
    init_logger()
    config = JobConfig()
    config.maybe_add_custom_args()
    config.parse_args()
    trainer: Optional[FluxTrainer] = None

    try:
        trainer = FluxTrainer(config)
        if config.checkpoint.create_seed_checkpoint:
            # BUGFIX: was `assert int(os.environ["WORLD_SIZE"])`, which only
            # checked that the value was non-zero. The intent (per the
            # message) is to require exactly one device.
            assert (
                int(os.environ["WORLD_SIZE"]) == 1
            ), "Must create seed checkpoint using a single device, to disable sharding."
            assert (
                config.checkpoint.enable_checkpoint
            ), "Must enable checkpointing when creating a seed checkpoint."
            trainer.checkpointer.save(curr_step=0, force=True)
            logger.info("Created seed checkpoint")
        else:
            trainer.train()
    finally:
        # Always release trainer resources, then tear down the process group.
        if trainer:
            trainer.close()

    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()
        logger.info("Process group destroyed.")
torchtitan/experiments/flux/train_configs/debug_model.toml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ [job]
3
+ dump_folder = "./outputs"
4
+ description = "Flux debug model"
5
+ print_args = false
6
+ use_for_integration_test = true
7
+
8
+ [profiling]
9
+ enable_profiling = false
10
+ save_traces_folder = "profile_trace"
11
+ profile_freq = 10
12
+ enable_memory_snapshot = false
13
+ save_memory_snapshot_folder = "memory_snapshot"
14
+
15
+ [metrics]
16
+ log_freq = 1
17
+ disable_color_printing = false
18
+ enable_tensorboard = false
19
+ save_tb_folder = "tb"
20
+ enable_wandb = false
21
+
22
+ [model]
23
+ name = "flux"
24
+ flavor = "flux-debug"
25
+ norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
26
+ # test tokenizer.model, for debug purpose only
27
+ # tokenizer_path = "./tests/assets/test_tiktoken.model"
28
+ # converters = "float8"
29
+
30
+
31
+ [optimizer]
32
+ name = "AdamW"
33
+ lr = 8e-4
34
+ eps = 1e-8
35
+
36
+ [lr_scheduler]
37
+ warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
38
+ decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps
39
+ decay_type = "linear"
40
+ lr_min = 0.0
41
+
42
+ [training]
43
+ batch_size = 32
44
+ seq_len = 512
45
+ max_norm = 1.0 # grad norm clipping
46
+ steps = 10
47
+ compile = false
48
+ dataset = "cc12m"
49
+ guidance = 3.5
50
+ seed = 0
51
+
52
+ [encoder]
53
+ t5_encoder="google/t5-v1_1-small"
54
+ clip_encoder="openai/clip-vit-large-patch14"
55
+ max_t5_encoding_len=512
56
+ auto_encoder_path="torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image
57
+
58
+ [parallelism]
59
+ data_parallel_replicate_degree = 1
60
+ data_parallel_shard_degree = 1
61
+ fsdp_reshard_after_forward = "default" # default / never / always
62
+ tensor_parallel_degree = 1
63
+ enable_async_tensor_parallel = false
64
+ pipeline_parallel_degree = 1
65
+ context_parallel_degree = 1
66
+
67
+ [experimental]
68
+ custom_args_module = "torchtitan.experiments.flux.flux_argparser"
torchtitan/experiments/flux/utils.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Optional
8
+
9
+ import torch
10
+
11
+ from torch import Tensor
12
+
13
+ from torchtitan.experiments.flux.model.autoencoder import AutoEncoder
14
+ from torchtitan.experiments.flux.model.hf_embedder import FluxEmbedder
15
+
16
+
17
+ def preprocess_flux_data(
18
+ # arguments from the recipe
19
+ device: torch.device,
20
+ dtype: torch.dtype,
21
+ *,
22
+ # arguments from the config
23
+ autoencoder: Optional[AutoEncoder],
24
+ clip_encoder: FluxEmbedder,
25
+ t5_encoder: FluxEmbedder,
26
+ batch: dict[str, Tensor],
27
+ offload: bool = False,
28
+ ) -> dict[str, Tensor]:
29
+ """
30
+ Take a batch of inputs and encoder as input and return a batch of preprocessed data.
31
+
32
+ Args:
33
+ device (torch.device): device to do preprocessing on
34
+ dtype (torch.dtype): data type to do preprocessing in
35
+ autoencoer(AutoEncoder): autoencoder to use for preprocessing
36
+ clip_encoder
37
+ t5_encoder
38
+ batch (dict[str, Tensor]): batch of data to preprocess
39
+
40
+ Returns:
41
+ dict[str, Tensor]: batch of preprocessed data
42
+ """
43
+
44
+ # The input of encoder should be torch.int type
45
+ if offload:
46
+ clip_encoder.to(device)
47
+ t5_encoder.to(device)
48
+ if autoencoder is not None:
49
+ autoencoder.to(device)
50
+
51
+ clip_tokens = batch["clip_tokens"].squeeze().to(device=device, dtype=torch.int)
52
+ t5_tokens = batch["t5_tokens"].squeeze().to(device=device, dtype=torch.int)
53
+
54
+ clip_text_encodings = clip_encoder(clip_tokens)
55
+ t5_text_encodings = t5_encoder(t5_tokens)
56
+
57
+ if autoencoder is not None:
58
+ images = batch["image"].to(device=device, dtype=dtype)
59
+ img_encodings = autoencoder.encode(images)
60
+ batch["img_encodings"] = img_encodings.to(device=device, dtype=dtype)
61
+
62
+ batch["clip_encodings"] = clip_text_encodings.to(dtype)
63
+ batch["t5_encodings"] = t5_text_encodings.to(dtype)
64
+
65
+ # offload encoders to cpu after preprocessing
66
+ if offload:
67
+ clip_encoder.to("cpu")
68
+ t5_encoder.to("cpu")
69
+ if autoencoder is not None:
70
+ autoencoder.to("cpu")
71
+
72
+ return batch
73
+
74
+
75
+ def generate_noise_latent(
76
+ bsz: int,
77
+ height: int,
78
+ width: int,
79
+ device: str | torch.device,
80
+ dtype: torch.dtype,
81
+ seed: int,
82
+ ) -> Tensor:
83
+ """Generate noise latents for the Flux flow model.
84
+
85
+ Args:
86
+ bsz (int): batch_size.
87
+ height (int): The height of the image.
88
+ width (int): The width of the image.
89
+ device (str | torch.device): The device to use.
90
+ dtype (torch.dtype): The dtype to use.
91
+ seed (int): The seed to use for randomize.
92
+
93
+ Returns:
94
+ Tensor: The noise latents.
95
+ Shape: [num_samples, LATENT_CHANNELS, height // IMG_LATENT_SIZE_RATIO, width // IMG_LATENT_SIZE_RATIO]
96
+
97
+ """
98
+ LATENT_CHANNELS, IMAGE_LATENT_SIZE_RATIO = 16, 8
99
+ return torch.randn(
100
+ bsz,
101
+ LATENT_CHANNELS,
102
+ height // IMAGE_LATENT_SIZE_RATIO,
103
+ width // IMAGE_LATENT_SIZE_RATIO,
104
+ dtype=dtype,
105
+ generator=torch.Generator().manual_seed(seed),
106
+ ).to(device)
107
+
108
+
109
+ def create_position_encoding_for_latents(
110
+ bsz: int, latent_height: int, latent_width: int, position_dim: int = 3
111
+ ) -> Tensor:
112
+ """
113
+ Create the packed latents' position encodings for the Flux flow model.
114
+
115
+ Args:
116
+ bsz (int): The batch size.
117
+ latent_height (int): The height of the latent.
118
+ latent_width (int): The width of the latent.
119
+
120
+ Returns:
121
+ Tensor: The position encodings.
122
+ Shape: [bsz, (latent_height // PATCH_HEIGHT) * (latent_width // PATCH_WIDTH), POSITION_DIM)
123
+ """
124
+ PATCH_HEIGHT, PATCH_WIDTH = 2, 2
125
+
126
+ height = latent_height // PATCH_HEIGHT
127
+ width = latent_width // PATCH_WIDTH
128
+
129
+ position_encoding = torch.zeros(height, width, position_dim)
130
+
131
+ row_indices = torch.arange(height)
132
+ position_encoding[:, :, 1] = row_indices.unsqueeze(1)
133
+
134
+ col_indices = torch.arange(width)
135
+ position_encoding[:, :, 2] = col_indices.unsqueeze(0)
136
+
137
+ # Flatten and repeat for the full batch
138
+ # [height, width, 3] -> [bsz, height * width, 3]
139
+ position_encoding = position_encoding.view(1, height * width, position_dim)
140
+ position_encoding = position_encoding.repeat(bsz, 1, 1)
141
+
142
+ return position_encoding
143
+
144
+
145
+ def pack_latents(x: Tensor) -> Tensor:
146
+ """
147
+ Rearrange latents from an image-like format into a sequence of patches.
148
+ Equivalent to `einops.rearrange("b c (h ph) (w pw) -> b (h w) (c ph pw)")`.
149
+
150
+ Args:
151
+ x (Tensor): The unpacked latents.
152
+ Shape: [bsz, ch, latent height, latent width]
153
+
154
+ Returns:
155
+ Tensor: The packed latents.
156
+ Shape: (bsz, (latent_height // ph) * (latent_width // pw), ch * ph * pw)
157
+ """
158
+ PATCH_HEIGHT, PATCH_WIDTH = 2, 2
159
+
160
+ b, c, latent_height, latent_width = x.shape
161
+ h = latent_height // PATCH_HEIGHT
162
+ w = latent_width // PATCH_WIDTH
163
+
164
+ # [b, c, h*ph, w*ph] -> [b, c, h, w, ph, pw]
165
+ x = x.unfold(2, PATCH_HEIGHT, PATCH_HEIGHT).unfold(3, PATCH_WIDTH, PATCH_WIDTH)
166
+
167
+ # [b, c, h, w, ph, PW] -> [b, h, w, c, ph, PW]
168
+ x = x.permute(0, 2, 3, 1, 4, 5)
169
+
170
+ # [b, h, w, c, ph, PW] -> [b, h*w, c*ph*PW]
171
+ return x.reshape(b, h * w, c * PATCH_HEIGHT * PATCH_WIDTH)
172
+
173
+
174
+ def unpack_latents(x: Tensor, latent_height: int, latent_width: int) -> Tensor:
175
+ """
176
+ Rearrange latents from a sequence of patches into an image-like format.
177
+ Equivalent to `einops.rearrange("b (h w) (c ph pw) -> b c (h ph) (w pw)")`.
178
+
179
+ Args:
180
+ x (Tensor): The packed latents.
181
+ Shape: (bsz, (latent_height // ph) * (latent_width // pw), ch * ph * pw)
182
+ latent_height (int): The height of the unpacked latents.
183
+ latent_width (int): The width of the unpacked latents.
184
+
185
+ Returns:
186
+ Tensor: The unpacked latents.
187
+ Shape: [bsz, ch, latent height, latent width]
188
+ """
189
+ PATCH_HEIGHT, PATCH_WIDTH = 2, 2
190
+
191
+ b, _, c_ph_pw = x.shape
192
+ h = latent_height // PATCH_HEIGHT
193
+ w = latent_width // PATCH_WIDTH
194
+ c = c_ph_pw // (PATCH_HEIGHT * PATCH_WIDTH)
195
+
196
+ # [b, h*w, c*ph*pw] -> [b, h, w, c, ph, pw]
197
+ x = x.reshape(b, h, w, c, PATCH_HEIGHT, PATCH_WIDTH)
198
+
199
+ # [b, h, w, c, ph, pw] -> [b, c, h, ph, w, pw]
200
+ x = x.permute(0, 3, 1, 4, 2, 5)
201
+
202
+ # [b, c, h, ph, w, pw] -> [b, c, h*ph, w*pw]
203
+ return x.reshape(b, c, h * PATCH_HEIGHT, w * PATCH_WIDTH)
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mg_grouped_gemm import grouped_gemm_forward
8
+ from .tma_autotuning import ALIGN_SIZE_M
9
+
10
+ __all__ = [
11
+ "grouped_gemm_forward",
12
+ "ALIGN_SIZE_M",
13
+ ]
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/fast_debug_ao.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import logging
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ from reference_utils import (
14
+ analyze_tensor_differences,
15
+ compute_reference_backward,
16
+ compute_reference_forward,
17
+ )
18
+
19
+ # Configure logging
20
+ logging.basicConfig(
21
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
22
+ )
23
+
24
+ # Import grouped GEMM implementations
25
+ try:
26
+ from mg_grouped_gemm import grouped_gemm_backward, grouped_gemm_forward
27
+
28
+ except ImportError:
29
+ logging.error(
30
+ "Error importing grouped GEMM modules. Make sure the implementation files are in the correct path."
31
+ )
32
+ raise
33
+
34
+
35
+ def test_forward_pass():
36
+ """
37
+ A simple test for the M*G grouped GEMM forward pass with detailed error handling.
38
+
39
+ In M*G grouping:
40
+ - M dimension is partitioned into G groups (M_total = sum(M_sizes))
41
+ - N dimension is the same for all groups
42
+ """
43
+ try:
44
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
45
+
46
+ # Test parameters for DeepSeek-like models
47
+ G = 1 # Number of groups
48
+ M_sizes = [
49
+ 2048,
50
+ ] # 2048, 2048, 2048] # Group sizes (will be adjusted)
51
+ M_total = sum(M_sizes) # Total M dimension
52
+ N = 4096 # Output dimension (same for all groups)
53
+ K = 7168 # Hidden dimension
54
+
55
+ # Create group sizes tensor
56
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
57
+
58
+ # Create input and weight tensors - using float16 for higher precision
59
+ x = torch.randn(M_total, K, dtype=torch.float16, device=device)
60
+ w = torch.randn(N, K, dtype=torch.float16, device=device)
61
+
62
+ # Log the setup
63
+ logging.info(f"Test setup - G: {G}, M_total: {M_total}, N: {N}, K: {K}")
64
+ logging.info(f"Group sizes: {m_sizes}")
65
+ logging.info(f"Input x shape: {x.shape}")
66
+ logging.info(f"Weight w shape: {w.shape}")
67
+
68
+ # Run forward pass
69
+ logging.info("Running forward pass with grouped GEMM")
70
+ result = grouped_gemm_forward(x, w, m_sizes)
71
+ logging.info(f"Forward result shape: {result.shape}")
72
+
73
+ # Compute reference result
74
+ logging.info("Computing reference result with PyTorch")
75
+ reference_result = compute_reference_forward(x, w, m_sizes)
76
+
77
+ # Compare results
78
+ logging.info("Comparing with PyTorch reference")
79
+ forward_close = analyze_tensor_differences(
80
+ result, reference_result, "Forward output"
81
+ )
82
+
83
+ return forward_close
84
+
85
+ except Exception as e:
86
+ logging.error(f"Test failed with error: {e}")
87
+ import traceback
88
+
89
+ logging.error(traceback.format_exc())
90
+ return False
91
+
92
+
93
+ def test_backward_pass():
94
+ """
95
+ A simple test for the M*G grouped GEMM backward pass with detailed error handling.
96
+
97
+ In M*G grouping:
98
+ - M dimension is partitioned into G groups (M_total = sum(M_sizes))
99
+ - N dimension is the same for all groups
100
+ """
101
+ try:
102
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
103
+
104
+ # Test parameters for DeepSeek-like models
105
+ G = 4 # Number of groups
106
+ M_sizes = [2048, 2048, 2048, 2048] # Group sizes (will be adjusted)
107
+ M_total = sum(M_sizes) # Total M dimension
108
+ N = 4096 # Output dimension (same for all groups)
109
+ K = 7168 # Hidden dimension
110
+
111
+ # Create group sizes tensor
112
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
113
+
114
+ # Create input and weight tensors - using float16 for higher precision
115
+ x = torch.randn(
116
+ M_total, K, dtype=torch.float16, device=device, requires_grad=True
117
+ )
118
+ w = torch.randn(N, K, dtype=torch.float16, device=device, requires_grad=True)
119
+
120
+ # Log the setup
121
+ logging.info(f"Test setup - G: {G}, M_total: {M_total}, N: {N}, K: {K}")
122
+ logging.info(f"Group sizes: {m_sizes}")
123
+ logging.info(f"Input x shape: {x.shape}")
124
+ logging.info(f"Weight w shape: {w.shape}")
125
+
126
+ # Step 1: Run forward pass
127
+ logging.info("Running forward pass")
128
+ result = grouped_gemm_forward(x, w, m_sizes)
129
+ logging.info(f"Forward result shape: {result.shape}")
130
+
131
+ # Create a gradient for backpropagation
132
+ grad_output = torch.randn_like(result)
133
+ logging.info(f"Created gradient with shape: {grad_output.shape}")
134
+
135
+ # Step 2: Run backward pass directly
136
+ logging.info("Running backward pass directly")
137
+ grad_x, grad_w = grouped_gemm_backward(grad_output, x, w, m_sizes)
138
+
139
+ # Verify gradient shapes
140
+ logging.info(
141
+ f"Gradient shapes - grad_x: {grad_x.shape}, grad_w: {grad_w.shape}"
142
+ )
143
+
144
+ # Step 3: Verify gradient computation using PyTorch's autograd
145
+ logging.info("Running PyTorch reference implementation")
146
+
147
+ # Compute reference gradients
148
+ x_ref_grad, w_ref_grad = compute_reference_backward(x, w, m_sizes, grad_output)
149
+
150
+ # Compare gradients
151
+ logging.info("Comparing gradients with PyTorch reference")
152
+ grad_x_close = analyze_tensor_differences(grad_x, x_ref_grad, "grad_x")
153
+ grad_w_close = analyze_tensor_differences(grad_w, w_ref_grad, "grad_w")
154
+
155
+ # Log overall result
156
+ if grad_x_close and grad_w_close:
157
+ logging.info("✓ SUCCESS: Gradients match the PyTorch reference")
158
+ else:
159
+ logging.error("✗ FAILURE: Gradient mismatch detected")
160
+
161
+ return grad_x_close and grad_w_close
162
+
163
+ except Exception as e:
164
+ logging.error(f"Test failed with error: {e}")
165
+ import traceback
166
+
167
+ logging.error(traceback.format_exc())
168
+ return False
169
+
170
+
171
+ def test_multiple_deepseek_configs():
172
+ """
173
+ Test multiple DeepSeek model configurations with both forward and backward pass verification.
174
+ """
175
+ # DeepSeek configurations: (G, M, K, N)
176
+ configs = [
177
+ (4, 8192, 7168, 4096), # Config 1
178
+ (4, 8192, 2048, 7168), # Config 2
179
+ (8, 4096, 7168, 4096), # Config 3
180
+ (8, 4096, 2048, 7168), # Config 4
181
+ ]
182
+
183
+ results = []
184
+
185
+ for config_idx, (G, M, K, N) in enumerate(configs):
186
+ logging.info(f"\n\n===== Testing DeepSeek Config {config_idx+1} =====")
187
+ logging.info(f"G={G}, M={M}, K={K}, N={N}")
188
+
189
+ try:
190
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
191
+
192
+ # Create even group sizes
193
+ base_size = M // G
194
+ remainder = M % G
195
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
196
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
197
+
198
+ # Create input and weight tensors using float16 for higher precision
199
+ x = torch.randn(
200
+ M, K, dtype=torch.float16, device=device, requires_grad=True
201
+ )
202
+ w = torch.randn(
203
+ N, K, dtype=torch.float16, device=device, requires_grad=True
204
+ )
205
+
206
+ logging.info(f"Input x shape: {x.shape}, Weight w shape: {w.shape}")
207
+
208
+ # Run forward pass
209
+ result = grouped_gemm_forward(x, w, m_sizes)
210
+ logging.info(f"Forward result shape: {result.shape}")
211
+
212
+ # ===== FORWARD PASS VERIFICATION =====
213
+ # Compute reference forward result
214
+ reference_result = compute_reference_forward(x, w, m_sizes)
215
+
216
+ # Compare forward results
217
+ forward_close = analyze_tensor_differences(
218
+ result, reference_result, "Forward output"
219
+ )
220
+
221
+ # ===== BACKWARD PASS VERIFICATION =====
222
+ # Create gradient for backpropagation
223
+ grad_output = torch.randn_like(result)
224
+
225
+ # Run backward pass
226
+ grad_x, grad_w = grouped_gemm_backward(grad_output, x, w, m_sizes)
227
+
228
+ # Compute reference gradients
229
+ x_ref_grad, w_ref_grad = compute_reference_backward(
230
+ x, w, m_sizes, grad_output
231
+ )
232
+
233
+ # Compare backward results
234
+ grad_x_close = analyze_tensor_differences(grad_x, x_ref_grad, "grad_x")
235
+ grad_w_close = analyze_tensor_differences(grad_w, w_ref_grad, "grad_w")
236
+
237
+ # Overall config result
238
+ backward_close = grad_x_close and grad_w_close
239
+ config_success = forward_close and backward_close
240
+ results.append(
241
+ (config_idx + 1, config_success, forward_close, backward_close)
242
+ )
243
+
244
+ # Log overall config result
245
+ if config_success:
246
+ logging.info(f"✓ SUCCESS: Config {config_idx+1} passed all tests!")
247
+ else:
248
+ logging.error(
249
+ f"✗ FAILURE: Config {config_idx+1} failed one or more tests"
250
+ )
251
+
252
+ except Exception as e:
253
+ logging.error(f"Config {config_idx+1} test failed with error: {e}")
254
+ import traceback
255
+
256
+ logging.error(traceback.format_exc())
257
+ results.append((config_idx + 1, False, False, False))
258
+
259
+ # Summary
260
+ logging.info("\n===== Test Results Summary =====")
261
+ for config_idx, overall_success, forward_success, backward_success in results:
262
+ overall_status = "✓ PASSED" if overall_success else "✗ FAILED"
263
+ forward_status = "✓ PASSED" if forward_success else "✗ FAILED"
264
+ backward_status = "✓ PASSED" if backward_success else "✗ FAILED"
265
+
266
+ logging.info(f"Config {config_idx}: {overall_status}")
267
+ logging.info(f" - Forward pass: {forward_status}")
268
+ logging.info(f" - Backward pass: {backward_status}")
269
+
270
+ return all(overall_success for _, overall_success, _, _ in results)
271
+
272
+
273
+ if __name__ == "__main__":
274
+ logging.info(
275
+ "Running verification for both forward and backward pass of M*G grouped GEMM"
276
+ )
277
+
278
+ # Run basic forward pass test
279
+ logging.info("\n===== Running basic forward pass test =====")
280
+ success_forward = test_forward_pass()
281
+ logging.info(f"Basic forward test {'succeeded' if success_forward else 'failed'}")
282
+
283
+ # Run basic backward pass test
284
+ logging.info("\n===== Running basic backward pass test =====")
285
+ success_backward = test_backward_pass()
286
+ logging.info(f"Basic backward test {'succeeded' if success_backward else 'failed'}")
287
+
288
+ # Run multiple DeepSeek configs with forward and backward verification
289
+ logging.info("\n===== Running tests for all DeepSeek configs =====")
290
+ success_configs = test_multiple_deepseek_configs()
291
+ logging.info(
292
+ f"DeepSeek configs tests {'all succeeded' if success_configs else 'had failures'}"
293
+ )
294
+
295
+ # Overall result
296
+ overall_success = success_forward and success_backward and success_configs
297
+ logging.info(
298
+ f"\nOverall test result: {'SUCCESS' if overall_success else 'FAILURE'}"
299
+ )
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/tma_autotuning.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # credit - TMAHelper class, AutoTuning are derived from FBGemm:
8
+ # https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gemm/triton_gemm
9
+
10
+ # pyre-unsafe
11
+ import functools
12
+
13
+ import os
14
+ import sys
15
+ from typing import Any, Dict, Optional, Tuple
16
+
17
+ import torch
18
+
19
+ import triton
20
+ import triton.language as tl
21
+ from triton import Config as TConfig
22
+
23
+ from triton.runtime import driver # @manual
24
+
25
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
26
+
27
+
28
+ # ===== Supporting utils, CUDA and TMA =====
29
+
30
+
31
+ class CudaUtils:
32
+ @staticmethod
33
+ def is_cuda() -> bool:
34
+ """Check if Triton is running on CUDA backend."""
35
+ return driver.active.get_current_target().backend == "cuda"
36
+
37
+ @staticmethod
38
+ def verify_tma() -> bool:
39
+ """Check if TMA is supported on the current device."""
40
+ return (
41
+ CudaUtils.is_cuda()
42
+ and torch.cuda.is_available()
43
+ and torch.cuda.get_device_capability()[0] >= 9
44
+ )
45
+
46
+ @staticmethod
47
+ def get_num_sms() -> int:
48
+ """Get the number of streaming multiprocessors on the current device."""
49
+ if not CudaUtils.is_cuda():
50
+ raise RuntimeError("Triton is not running on CUDA backend")
51
+ if not torch.cuda.is_available():
52
+ raise RuntimeError("CUDA is not available")
53
+ return torch.cuda.get_device_properties("cuda").multi_processor_count
54
+
55
+
56
+ class TmaDescriptorHelper:
57
+ """Helper class for managing TMA descriptors in Triton kernels."""
58
+
59
+ class KernelParamWrapper:
60
+ """Wrapper to implement the TmaDescKernelParam interface."""
61
+
62
+ def __init__(self, desc: torch.Tensor):
63
+ self.desc = desc
64
+
65
+ def tma_desc_cpu_ptr(self) -> int:
66
+ """Return the CPU pointer to the TMA descriptor."""
67
+ return self.desc.data_ptr()
68
+
69
+ def __init__(self, tma_size: int = 128):
70
+ """Initialize the TMA descriptor helper.
71
+
72
+ Args:
73
+ tma_size: Size of the TMA descriptor in bytes
74
+ """
75
+ if not CudaUtils.verify_tma():
76
+ raise RuntimeError(
77
+ "TMA not supported on this device (requires Hopper or newer)"
78
+ )
79
+ if "nv_tma_desc_type" not in dir(tl):
80
+ raise RuntimeError(
81
+ "TMA grid constant descriptors not supported in your Triton version"
82
+ )
83
+
84
+ self.tma_size = tma_size
85
+ self.fill_1d_tma_descriptor_inner = driver.active.utils.fill_1d_tma_descriptor
86
+ self.fill_2d_tma_descriptor_inner = driver.active.utils.fill_2d_tma_descriptor
87
+ self.descriptors: Dict[str, torch.Tensor] = {}
88
+
89
+ def init_tma_descriptor(self, name: str) -> None:
90
+ """Initialize a TMA descriptor with the given name.
91
+
92
+ Call this method outside of the lambda function for grid size.
93
+ """
94
+ self.descriptors[name] = torch.empty(
95
+ self.tma_size, device="cpu", dtype=torch.int8
96
+ )
97
+
98
+ def fill_1d_tma_descriptor(
99
+ self, name: str, ptr: int, dim: int, block_dim: int, element_size: int
100
+ ) -> None:
101
+ """Fill a 1D TMA descriptor.
102
+
103
+ Call this method inside the lambda function for grid size.
104
+ """
105
+ if name not in self.descriptors:
106
+ raise ValueError(f"TMA descriptor '{name}' not initialized")
107
+
108
+ desc_x = self.descriptors[name]
109
+ if desc_x.data_ptr() % 64 != 0:
110
+ raise ValueError("TMA descriptor must be 64-byte aligned")
111
+ self.fill_1d_tma_descriptor_inner(
112
+ ptr, dim, block_dim, element_size, desc_x.data_ptr()
113
+ )
114
+
115
+ def fill_2d_tma_descriptor(
116
+ self,
117
+ name: str,
118
+ ptr: int,
119
+ dim1: int,
120
+ dim0: int,
121
+ block_dim1: int,
122
+ block_dim0: int,
123
+ element_size: int,
124
+ ) -> None:
125
+ """Fill a 2D TMA descriptor.
126
+
127
+ Call this method inside the lambda function for grid size.
128
+ """
129
+ if name not in self.descriptors:
130
+ raise ValueError(f"TMA descriptor '{name}' not initialized")
131
+
132
+ desc_x = self.descriptors[name]
133
+ if desc_x.data_ptr() % 64 != 0:
134
+ raise ValueError("TMA descriptor must be 64-byte aligned")
135
+ self.fill_2d_tma_descriptor_inner(
136
+ ptr, dim1, dim0, block_dim1, block_dim0, element_size, desc_x.data_ptr()
137
+ )
138
+
139
+ def get_tma_descriptor_kernel_param(self, name: str) -> KernelParamWrapper:
140
+ """Get the TMA descriptor kernel parameter for the given name."""
141
+ if name not in self.descriptors or self.descriptors[name] is None:
142
+ raise ValueError(f"TMA descriptor '{name}' not initialized")
143
+ return self.KernelParamWrapper(self.descriptors[name])
144
+
145
+
146
+ # ====== Autotuning utilities ======
147
+ ALIGN_SIZE_M = 128
148
+
149
+ _NV_CONFIGS = [
150
+ triton.Config(
151
+ {
152
+ "BLOCK_SIZE_M": block_size_m,
153
+ "BLOCK_SIZE_N": block_size_n,
154
+ "BLOCK_SIZE_K": block_size_k,
155
+ },
156
+ num_stages=num_stages,
157
+ num_warps=num_warps,
158
+ num_ctas=num_ctas,
159
+ )
160
+ for block_size_m in [ALIGN_SIZE_M, ]
161
+ for block_size_n in [64, 128, 256]
162
+ for block_size_k in [64, 128, 256]
163
+ for num_stages in [3, 4]
164
+ for num_warps in [4, 8]
165
+ for num_ctas in [1]
166
+ ]
167
+
168
+
169
+ def early_config_prune(configs, named_args, dtsize=None, dtype=None, **kwargs):
170
+ device = torch.cuda.current_device()
171
+ # Check for all possible pointer parameter names
172
+ if "grad_input_ptr" in named_args:
173
+ ptr_name = "grad_input_ptr"
174
+ elif "c_ptr" in named_args:
175
+ ptr_name = "c_ptr"
176
+ elif "grad_weight_ptr" in named_args:
177
+ ptr_name = "grad_weight_ptr"
178
+ else:
179
+ raise KeyError("No recognized pointer parameter found in kernel arguments")
180
+
181
+ if dtsize is None:
182
+ dtsize = named_args[ptr_name].element_size()
183
+ if dtype is None:
184
+ dtype = named_args[ptr_name].dtype
185
+
186
+ pruned_configs = []
187
+ for config in configs:
188
+ kw = config.kwargs
189
+ BLOCK_M, BLOCK_N, BLOCK_K, num_stages = (
190
+ kw["BLOCK_SIZE_M"],
191
+ kw["BLOCK_SIZE_N"],
192
+ kw["BLOCK_SIZE_K"],
193
+ config.num_stages,
194
+ )
195
+ G, M, N, K = (
196
+ named_args["G"],
197
+ named_args["M_BUCKET"],
198
+ named_args["N"],
199
+ named_args["K"],
200
+ )
201
+
202
+ # 1. make sure we have enough smem
203
+ max_shared_memory = driver.active.utils.get_device_properties(device)[
204
+ "max_shared_mem"
205
+ ]
206
+
207
+ required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize
208
+ if required_shared_memory > max_shared_memory:
209
+ continue
210
+
211
+ M_PER_GROUP = M // G
212
+ MIN_M_TILES = 64
213
+ # 2. make sure we don't load M tiles that are too big
214
+ if BLOCK_M > MIN_M_TILES and BLOCK_M > (M_PER_GROUP * 2):
215
+ continue
216
+ # 3. make sure we don't load N tiles that are too small
217
+ if BLOCK_M < 128 and BLOCK_M < (M_PER_GROUP // 2):
218
+ continue
219
+
220
+ num_sm = driver.active.utils.get_device_properties(device)[
221
+ "multiprocessor_count"
222
+ ]
223
+ N_TILES = N // BLOCK_N
224
+ MIN_N_TILES = 64
225
+ # 4. make sure we don't load N tiles that are too big
226
+ if BLOCK_N > MIN_N_TILES and M * N_TILES < num_sm:
227
+ continue
228
+ # 5. make sure we don't load N tiles that are too small
229
+ if BLOCK_N < 128 and M * N_TILES > 2 * num_sm:
230
+ continue
231
+ # 6. make sure K can be evenly divided
232
+ if K % BLOCK_K != 0:
233
+ continue
234
+
235
+ pruned_configs.append(config)
236
+
237
+ return pruned_configs
238
+
239
+
240
+ # ======== End Autotuning utilities ========
torchtitan/experiments/llama4/README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **The Llama 4 folder is still under development.**
2
+
3
+ #### Available features
4
+ - Llama 4 model definition (text-only), including the MoE architecture with token-choice routing
5
+ - Basic FSDP, TP, PP, CP support
6
+ - DCP checkpoint conversion scripts
7
+
8
+ #### Download Llama 4 tokenizer
9
+ ```bash
10
+ # Llama 4 tokenizer.model
11
+ python scripts/download_tokenizer.py --repo_id meta-llama/Llama-4-Scout-17B-16E --tokenizer_path "" --hf_token=...
12
+ ```
13
+
14
+ #### To be added
15
+ - Modeling
16
+ - iRoPE implementation
17
+ - load balance loss for token-choice MoE
18
+ - alternative expert-choice MoE
19
+ - multimodal support
20
+ - Kernel integration
21
+ - efficient bfloat16 GroupedGEMM kernels (from PyTorch core)
22
+ - efficient float8 GroupedGEMM kernels (from torchao)
23
+ - Parallelism
24
+ - performant TP implementation and torch.compile support for MoE layers
25
+ - Context Parallel support for FlexAttention, iRoPE, and multimodal inputs
26
+ - Expert Parallel support
27
+ - Testing
28
+ - perfomance and loss converging tests
29
+ - CI integration
torchtitan/experiments/llama4/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.66 kB). View file
 
torchtitan/experiments/llama4/infra/__pycache__/parallelize_llama.cpython-312.pyc ADDED
Binary file (5.55 kB). View file
 
torchtitan/experiments/llama4/infra/expert_parallel.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ from functools import partial
9
+ from typing import Optional, Tuple
10
+
11
+ import torch.nn as nn
12
+ from torch.distributed.tensor import (
13
+ DeviceMesh,
14
+ distribute_module,
15
+ distribute_tensor,
16
+ DTensor,
17
+ Partial,
18
+ Replicate,
19
+ Shard,
20
+ )
21
+ from torch.distributed.tensor.parallel import ParallelStyle
22
+ from torch.distributed.tensor.placement_types import Placement
23
+
24
+
25
+ # implementation of Tensor Parallel on the non-shared experts in MoE
26
+ class TensorParallel(ParallelStyle):
27
+ def __init__(
28
+ self,
29
+ *,
30
+ input_layouts: Optional[Tuple[Optional[Placement]]] = None,
31
+ output_layout: Optional[Placement] = None,
32
+ use_local_output: bool = True,
33
+ ):
34
+ super().__init__()
35
+ self.input_layouts = input_layouts or (Replicate(), None)
36
+ self.output_layout = output_layout or Partial()
37
+ self.desired_input_layouts = (Replicate(), None)
38
+ self.use_local_output = use_local_output
39
+
40
+ @staticmethod
41
+ def _prepare_input_fn(
42
+ input_layouts, desired_input_layouts, mod, inputs, device_mesh
43
+ ):
44
+ # TODO: figure out dynamo support for instance method and switch this to instance method
45
+
46
+ # annotate module input placements/sharding with input_layouts
47
+ input_tensor, input_layout, desired_input_layout = (
48
+ inputs[0],
49
+ input_layouts[0],
50
+ desired_input_layouts[0],
51
+ )
52
+ if not isinstance(input_tensor, DTensor):
53
+ input_tensor = DTensor.from_local(
54
+ input_tensor, device_mesh, (input_layout,), run_check=False
55
+ )
56
+
57
+ if input_layouts != desired_input_layouts:
58
+ input_tensor = input_tensor.redistribute(
59
+ placements=(desired_input_layout,), async_op=True
60
+ )
61
+ return (input_tensor, *inputs[1:])
62
+
63
+ def _partition_fn(self, name, module, device_mesh):
64
+ module.register_parameter(
65
+ "w1", nn.Parameter(distribute_tensor(module.w1, device_mesh, [Shard(2)]))
66
+ ) # Column-wise sharding
67
+ module.register_parameter(
68
+ "w2",
69
+ nn.Parameter(distribute_tensor(module.w2, device_mesh, [Shard(1)])),
70
+ ) # Row-wise sharding
71
+ module.register_parameter(
72
+ "w3",
73
+ nn.Parameter(distribute_tensor(module.w3, device_mesh, [Shard(2)])),
74
+ ) # Column-wise sharding
75
+
76
+ @staticmethod
77
+ def _prepare_output_fn(output_layout, use_local_output, mod, outputs, device_mesh):
78
+ if outputs.placements != (output_layout,):
79
+ outputs = outputs.redistribute(placements=(output_layout,), async_op=True)
80
+ # back to local tensor
81
+ return outputs.to_local() if use_local_output else outputs
82
+
83
+ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
84
+ return distribute_module(
85
+ module,
86
+ device_mesh,
87
+ self._partition_fn,
88
+ partial(
89
+ self._prepare_input_fn, self.input_layouts, self.desired_input_layouts
90
+ ),
91
+ partial(self._prepare_output_fn, self.output_layout, self.use_local_output),
92
+ )
93
+
94
+
95
+ # NOTE: This is to achieve replicate computation on the gate module in the MoE router.
96
+ # It does nothing other than (1) setting the module parameters as DTensors on the given mesh
97
+ # and (2) inserting hooks to module boundary to change torch.Tensor to DTensor and back.
98
+ # TODO: The reason we need this wrapping is to ensure all parameters are on the same 1D/2D mesh,
99
+ # which is assumed by (1) gradient norm clipping, and (2) optimizer fused implementation.
100
+ class NoParallel(ParallelStyle):
101
+ def __init__(
102
+ self,
103
+ *,
104
+ input_layout: Optional[Placement] = None,
105
+ output_layout: Optional[Placement] = None,
106
+ use_local_output: bool = True,
107
+ ):
108
+ super().__init__()
109
+ self.input_layout = input_layout or Replicate()
110
+ self.output_layout = output_layout or Replicate()
111
+ self.desired_input_layout = Replicate()
112
+ self.use_local_output = use_local_output
113
+
114
+ @staticmethod
115
+ def _prepare_input_fn(input_layout, desired_input_layout, mod, inputs, device_mesh):
116
+ # annotate module input placements/sharding with input_layouts
117
+ input_tensor = inputs[0]
118
+ if not isinstance(input_tensor, DTensor):
119
+ input_tensor = DTensor.from_local(
120
+ input_tensor, device_mesh, (input_layout,), run_check=False
121
+ )
122
+
123
+ if input_layout != desired_input_layout:
124
+ input_tensor = input_tensor.redistribute(
125
+ placements=(desired_input_layout,), async_op=True
126
+ )
127
+ return (input_tensor, *inputs[1:])
128
+
129
+ @staticmethod
130
+ def _prepare_output_fn(output_layout, use_local_output, mod, outputs, device_mesh):
131
+ if outputs.placements != (output_layout,):
132
+ outputs = outputs.redistribute(placements=(output_layout,), async_op=True)
133
+ # back to local tensor
134
+ return outputs.to_local() if use_local_output else outputs
135
+
136
+ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
137
+ return distribute_module(
138
+ module,
139
+ device_mesh,
140
+ None,
141
+ partial(
142
+ self._prepare_input_fn, self.input_layout, self.desired_input_layout
143
+ ),
144
+ partial(self._prepare_output_fn, self.output_layout, self.use_local_output),
145
+ )
torchtitan/experiments/llama4/infra/parallelize_llama.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.distributed.device_mesh import DeviceMesh
11
+
12
+ from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
13
+ from torchtitan.distributed import ParallelDims
14
+
15
+ from torchtitan.models.llama3.parallelize_llama import (
16
+ apply_ac,
17
+ apply_compile,
18
+ apply_ddp,
19
+ apply_fsdp,
20
+ apply_tp,
21
+ )
22
+ from torchtitan.tools.logging import logger
23
+
24
+
25
+ def parallelize_llama(
26
+ model: nn.Module,
27
+ world_mesh: DeviceMesh,
28
+ parallel_dims: ParallelDims,
29
+ job_config: JobConfig,
30
+ ):
31
+ """
32
+ Apply tensor parallelism, activation checkpointing, torch.compile, and data
33
+ parallelism to the model.
34
+
35
+ NOTE: The passed-in model preferably should be on meta device. Otherwise,
36
+ the model must fit on GPU or CPU memory.
37
+ """
38
+
39
+ if parallel_dims.tp_enabled:
40
+ if (
41
+ job_config.parallelism.enable_async_tensor_parallel
42
+ and not job_config.training.compile
43
+ ):
44
+ raise RuntimeError("Async TP requires --training.compile")
45
+
46
+ enable_float8_linear = "float8" in job_config.model.converters
47
+ float8_is_rowwise = job_config.float8.recipe_name in (
48
+ "rowwise",
49
+ "rowwise_with_gw_hp",
50
+ )
51
+
52
+ # For now, float8 all-gather with TP is only supported for tensorwise
53
+ # float8 scaling recipes. For rowwise recipes, we use regular TP and
54
+ # all-gather happens in high precision.
55
+ enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
56
+
57
+ apply_tp(
58
+ model,
59
+ world_mesh["tp"],
60
+ loss_parallel=parallel_dims.loss_parallel_enabled,
61
+ enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
62
+ enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
63
+ )
64
+
65
+ apply_moe_tp(model, world_mesh["tp"])
66
+
67
+ if job_config.activation_checkpoint.mode != "none":
68
+ if (
69
+ job_config.activation_checkpoint.mode == "selective"
70
+ and job_config.model.use_flex_attn
71
+ ):
72
+ raise ValueError(
73
+ "FlexAttention is not compatible with selective AC yet. "
74
+ "See https://github.com/pytorch/pytorch/issues/147879"
75
+ )
76
+ apply_ac(model, job_config.activation_checkpoint)
77
+
78
+ # turn on per-TransformerBlock compile after AC wrapping and before FSDP
79
+ if job_config.training.compile:
80
+ apply_compile(model)
81
+
82
+ # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
83
+ torch._dynamo.config.capture_scalar_outputs = True
84
+
85
+ if (
86
+ parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
87
+ ): # apply FSDP or HSDP, potentially with Context Parallel
88
+ if parallel_dims.dp_replicate_enabled:
89
+ dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
90
+ else:
91
+ dp_mesh_dim_names = ("dp_shard_cp",)
92
+
93
+ apply_fsdp(
94
+ model,
95
+ world_mesh[tuple(dp_mesh_dim_names)],
96
+ param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
97
+ reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
98
+ pp_enabled=parallel_dims.pp_enabled,
99
+ cpu_offload=job_config.training.enable_cpu_offload,
100
+ reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward,
101
+ )
102
+
103
+ if parallel_dims.dp_replicate_enabled:
104
+ logger.info("Applied HSDP to the model")
105
+ else:
106
+ logger.info("Applied FSDP to the model")
107
+
108
+ if parallel_dims.cp_enabled:
109
+ logger.info("Applied Context Parallel to the model")
110
+
111
+ if job_config.training.enable_cpu_offload:
112
+ logger.info("Applied CPU Offloading to the model")
113
+ elif parallel_dims.dp_replicate_enabled:
114
+ if world_mesh.ndim > 1:
115
+ raise RuntimeError("DDP has not supported > 1D parallelism")
116
+ apply_ddp(
117
+ model,
118
+ world_mesh,
119
+ enable_compile=job_config.training.compile,
120
+ enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
121
+ )
122
+
123
+ return model
124
+
125
+
126
+ def apply_moe_tp(
127
+ model: nn.Module,
128
+ tp_mesh: DeviceMesh,
129
+ ):
130
+ from torch.distributed.tensor import Partial, Replicate, Shard
131
+ from torch.distributed.tensor.parallel import (
132
+ parallelize_module,
133
+ PrepareModuleInputOutput,
134
+ )
135
+
136
+ from .expert_parallel import NoParallel, TensorParallel
137
+
138
+ for _, transformer_block in model.layers.items():
139
+ moe_layer_plan = {
140
+ # input / output sharding on the seqlen dim
141
+ # all-gather for input, reduce-scatter for output
142
+ "moe": PrepareModuleInputOutput(
143
+ input_layouts=(Shard(1),),
144
+ desired_input_layouts=(Replicate(),),
145
+ use_local_input=True,
146
+ output_layouts=(Partial(),),
147
+ desired_output_layouts=(Shard(1),),
148
+ ),
149
+ # replicate computation for the router
150
+ "moe.router.gate": NoParallel(),
151
+ # input Replicate, output Partial
152
+ "moe.experts": TensorParallel(),
153
+ "moe.shared_expert": TensorParallel(),
154
+ }
155
+ parallelize_module(
156
+ module=transformer_block,
157
+ device_mesh=tp_mesh,
158
+ parallelize_plan=moe_layer_plan,
159
+ )
torchtitan/experiments/llama4/model/__pycache__/args.cpython-312.pyc ADDED
Binary file (4.1 kB). View file
 
torchtitan/experiments/llama4/model/__pycache__/model.cpython-312.pyc ADDED
Binary file (23.2 kB). View file
 
torchtitan/experiments/llama4/model/__pycache__/moe.cpython-312.pyc ADDED
Binary file (10.5 kB). View file