zaydzuhri commited on
Commit
7c7a501
·
verified ·
1 Parent(s): 801b04c

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LICENSE +21 -0
  2. README.md +471 -0
  3. config.json +34 -0
  4. configs/delta_net_1B.json +29 -0
  5. configs/delta_net_340M.json +27 -0
  6. configs/gla_340M.json +24 -0
  7. configs/gla_7B.json +25 -0
  8. configs/gsa_340M.json +29 -0
  9. configs/mtp_transformer_340M.json +19 -0
  10. configs/top_transformer_1B.json +24 -0
  11. configs/top_transformer_340M.json +20 -0
  12. configs/transformer_120M.json +18 -0
  13. configs/transformer_7B.json +21 -0
  14. fla/utils.py +223 -0
  15. generation_config.json +6 -0
  16. logs/none_ro0qpaac/attempt_0/0/stderr.log +0 -0
  17. logs/none_ro0qpaac/attempt_0/1/stderr.log +0 -0
  18. logs/none_ro0qpaac/attempt_0/3/stderr.log +0 -0
  19. logs/none_ro0qpaac/attempt_0/5/stderr.log +0 -0
  20. logs/none_ro0qpaac/attempt_0/6/stderr.log +0 -0
  21. logs/none_ro0qpaac/attempt_0/7/stderr.log +0 -0
  22. pyproject.toml +43 -0
  23. setup.py +51 -0
  24. special_tokens_map.json +23 -0
  25. tokenizer.json +0 -0
  26. tokenizer_config.json +44 -0
  27. torchtitan/components/__pycache__/checkpoint.cpython-312.pyc +0 -0
  28. torchtitan/components/__pycache__/dataloader.cpython-312.pyc +0 -0
  29. torchtitan/components/__pycache__/float8.cpython-312.pyc +0 -0
  30. torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc +0 -0
  31. torchtitan/components/__pycache__/metrics.cpython-312.pyc +0 -0
  32. torchtitan/components/__pycache__/optimizer.cpython-312.pyc +0 -0
  33. torchtitan/components/__pycache__/tokenizer.cpython-312.pyc +0 -0
  34. torchtitan/distributed/utils.py +311 -0
  35. torchtitan/experiments/deepseek_v3/README.md +40 -0
  36. torchtitan/experiments/deepseek_v3/indices.py +195 -0
  37. torchtitan/experiments/deepseek_v3/symm_mem_recipes/__init__.py +11 -0
  38. torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_on_device_all_to_all_v.py +260 -0
  39. torchtitan/experiments/deepseek_v3/train.py +142 -0
  40. torchtitan/experiments/flux/README.md +23 -0
  41. torchtitan/experiments/flux/dataset/flux_dataset.py +267 -0
  42. torchtitan/experiments/flux/dataset/tokenizer.py +64 -0
  43. torchtitan/experiments/flux/model/hf_embedder.py +40 -0
  44. torchtitan/experiments/flux/model/layers.py +286 -0
  45. torchtitan/experiments/flux/tests/test_flux_dataloader.py +103 -0
  46. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/fast_debug_ao.py +299 -0
  47. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/mg_grouped_gemm.py +1304 -0
  48. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/reference_utils.py +126 -0
  49. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/tma_autotuning.py +240 -0
  50. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py +82 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023-2025 Songlin Yang, Yu Zhang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ # 🔥 Flame: Flash Linear Attention Made Easy
4
+
5
+ </div>
6
+
7
+ Welcome to 🔥 `flame`, a minimal and efficient framework built on `torchtitan` for training Flash Linear Attention (FLA) models (and more broadly, arbitrary autoregressive language models) with blazing efficiency.
8
+
9
+ **Feature Highlights:**
10
+
11
+ - 🚀 Minimal, easy-to-use, extensible training framework
12
+ - 🤗 Seamless integration with `fla` and `transformers`
13
+ - 🔄 Zero-cost data preprocessing: online tokenization, dataset shuffling, and multiple datasets support
14
+ - 🔮 4D parallelism (coming soon)
15
+
16
+ ## Setup
17
+
18
+ To get started, clone the `flame` repository and install the required dependencies:
19
+
20
+ ```bash
21
+ git clone https://github.com/fla-org/flame.git
22
+ cd flame
23
+ pip install .
24
+ ```
25
+
26
+ `flame` manages minimal dependencies, only including `fla` and `torchtitan` as submodules.
27
+ After installation, initialize and update the submodules:
28
+ ```sh
29
+ git submodule update --init --recursive
30
+ ```
31
+
32
+ ## Dataset Preparation
33
+ To download the dataset to your local disk, create a new Python file with the following content and execute it:
34
+
35
+ ```py
36
+ from datasets import load_dataset
37
+
38
+ # load fineweb-edu with parallel processing
39
+ dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="default", num_proc=64, cache_dir="/your/cache/path")
40
+
41
+ # or load a subset with roughly 100B tokens, suitable for small- or medium-sized experiments
42
+ dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-100BT", num_proc=64, cache_dir="/your/cache/path")
43
+ ```
44
+
45
+ ## Training Recipes
46
+
47
+ Here's an example of training a 340M FLA Transformer model with a LLaMA-like architecture from scratch on a 100BT subset of the Fineweb-edu corpus in streaming mode.
48
+
49
+ > [!WARNING]
50
+ > If the dataset is not downloaded beforehand, the streaming mode will attempt to fetch it from a remote server and download it on-the-fly, which can be highly unstable during training due to network issues.
51
+ > For stable training, ensure the dataset is downloaded locally (see [**Dataset Preparation**](#dataset-preparation)). Otherwise, we assume you are only testing the new corpus.
52
+
53
+ ```sh
54
+ bash train.sh \
55
+ --job.config_file flame/models/fla.toml \
56
+ --job.dump_folder exp/transformer-340M-4K-10B/batch1.seqlen65536.context4096.warmup1024.update1.steps20480.lr3e-4.cosine \
57
+ --model.config configs/transformer_340M.json \
58
+ --model.tokenizer_path fla-hub/transformer-1.3B-100B \
59
+ --optimizer.name AdamW \
60
+ --optimizer.eps 1e-15 \
61
+ --optimizer.lr 3e-4 \
62
+ --lr_scheduler.warmup_steps 1024 \
63
+ --lr_scheduler.lr_min 0.1 \
64
+ --lr_scheduler.decay_type cosine \
65
+ --training.batch_size 1 \
66
+ --training.seq_len 65536 \
67
+ --training.context_len 4096 \
68
+ --training.varlen \
69
+ --training.gradient_accumulation_steps 1 \
70
+ --training.steps 20480 \
71
+ --training.max_norm 1.0 \
72
+ --training.skip_nan_inf \
73
+ --training.dataset HuggingFaceFW/fineweb-edu \
74
+ --training.dataset_name sample-100BT \
75
+ --training.dataset_split train \
76
+ --training.streaming \
77
+ --training.num_workers 32 \
78
+ --training.prefetch_factor 2 \
79
+ --training.seed 42 \
80
+ --training.compile \
81
+ --checkpoint.interval 2048 \
82
+ --checkpoint.load_step -1 \
83
+ --checkpoint.keep_latest_k 2 \
84
+ --metrics.log_freq 1
85
+ ```
86
+
87
+ You can specify the number of GPUs by setting the environment variable `NGPU`, which defaults to 8.
88
+ **For single-GPU debugging, set `NGPU=1`.**
89
+
90
+ We provide several [config files](https://github.com/fla-org/flame/tree/main/configs) for different models.
91
+ By default, the learning rate is set to 3e-4 with a cosine scheduler. Other schedulers, such as WSD (wsd), are also supported.
92
+
93
+ **Key parameters:**
94
+ - `--lr_scheduler.decay_ratio`: The proportion of the steps allocated to the decay phase. The learning rate will remain stable after the warmup period and only start decaying during the last `decay_ratio` portion of the total training steps, which is known as the Warmup-Stable-Decay (WSD) schedule.
95
+ - `--lr_scheduler.warmup_steps`: The number of steps for the learning rate warmup phase.
96
+ - `--training.steps`: Total number of training steps.
97
+ - `--training.batch_size`: Batch size per device, must be 1 if `--training.varlen` is set.
98
+ - `--training.seq_len`: The length of each sequence in the batch, which is concatenated from multiple samples.
99
+ - `--training.context_len`: The max allowed length of a sample. For non-varlen mode, this is equivalent to `seq_len`.
100
+ - `--training.varlen`: Whether to conduct variable-length sequence training.
101
+ - `--training.gradient_accumulation_steps`: Number of gradient accumulation steps.
102
+
103
+ > [!WARNING]
104
+ > The total number of tokens processed per batch, referred to as `global_batch_size`, is calculated as batch_size × gradient_accumulation_steps × num_gpus.
105
+ > Each step processes `global_batch_size * seq_len` tokens.
106
+ > Monitor the value of `global_batch_size`, `warmup_steps`, and `steps` carefully when modifying any of the hyperparameters!
107
+
108
+ For a detailed explanation of all parameters, run:
109
+
110
+ ```sh
111
+ bash train.sh -h
112
+ ```
113
+
114
+ <details>
115
+ <summary>Usage</summary>
116
+
117
+ ```py
118
+ options:
119
+ -h, --help show this help message and exit
120
+ --job.config_file JOB.CONFIG_FILE
121
+ Job config file
122
+ --job.dump_folder JOB.DUMP_FOLDER
123
+ Folder to dump job outputs
124
+ --job.description JOB.DESCRIPTION
125
+ Description of the job
126
+ --job.use_for_integration_test
127
+ Add this config to the integration test suite
128
+ --job.print_args Print the args to terminal
129
+ --model.config MODEL.CONFIG
130
+ Path to the model config
131
+ --model.norm_type MODEL.NORM_TYPE
132
+ Type of layer normalization to use [layernorm,
133
+ np_layernorm, rmsnorm, fused_rmsnorm]
134
+ --model.tokenizer_path MODEL.TOKENIZER_PATH
135
+ Tokenizer path
136
+ --profiling.enable_profiling
137
+ Whether to enable pytorch profiler
138
+ --profiling.save_traces_folder PROFILING.SAVE_TRACES_FOLDER
139
+ Trace files location
140
+ --profiling.profile_freq PROFILING.PROFILE_FREQ
141
+ How often to collect profiler traces, in iterations
142
+ --profiling.enable_memory_snapshot
143
+ Whether to dump memory snapshot
144
+ --profiling.save_memory_snapshot_folder PROFILING.SAVE_MEMORY_SNAPSHOT_FOLDER
145
+ Memory snapshot files location
146
+ --optimizer.name OPTIMIZER.NAME
147
+ Optimizer to use
148
+ --optimizer.eps OPTIMIZER.EPS
149
+ Epsilon value for the optimizer.
150
+ --optimizer.fused Whether the fused implementation(CUDA only) is used.
151
+ --optimizer.scheduler {wsd,cosine,linear}
152
+ Scheduler to use. Currently supported: wsd, cosine,
153
+ and linear.
154
+ --optimizer.lr OPTIMIZER.LR
155
+ Learning rate to use
156
+ --optimizer.min_lr_ratio OPTIMIZER.MIN_LR_RATIO
157
+ Min lr ratio for lr scheduler
158
+ --optimizer.early_step_in_backward
159
+ Whether to apply optimizer in the backward. Caution,
160
+ optimizer_in_backward is not compatible with gradients
161
+ clipping, users should not call
162
+ register_post_accumulate_grad_hook after the optimizer
163
+ is built.
164
+ --training.batch_size TRAINING.BATCH_SIZE
165
+ Batch size
166
+ --training.seq_len TRAINING.SEQ_LEN
167
+ Sequence length
168
+ --training.context_len TRAINING.CONTEXT_LEN
169
+ Max length allowed for each sequence
170
+ --training.varlen Whether to take sequences of variable length as input
171
+ --training.warmup_steps TRAINING.WARMUP_STEPS
172
+ Steps for lr scheduler warmup, normally 1/5 of
173
+ --training.steps
174
+ --training.gradient_accumulation_steps TRAINING.GRADIENT_ACCUMULATION_STEPS
175
+ Number of steps to accumulate gradients before
176
+ updating parameters
177
+ --training.steps TRAINING.STEPS
178
+ How many train steps to run
179
+ --training.max_norm TRAINING.MAX_NORM
180
+ Max norm for gradient clipping
181
+ --training.skip_nan_inf
182
+ Skip batch updates when NaN or INF gradients are
183
+ encountered during training
184
+ --training.dataset TRAINING.DATASET
185
+ Dataset to use, with comma separated values
186
+ --training.dataset_name TRAINING.DATASET_NAME
187
+ The name of the dataset config, with comma separated
188
+ values if provided
189
+ --training.dataset_split TRAINING.DATASET_SPLIT
190
+ Dataset split to use, with comma separated values if
191
+ provided
192
+ --training.data_dir TRAINING.DATA_DIR
193
+ Data dirs to use, with comma separated values if
194
+ provided
195
+ --training.data_files TRAINING.DATA_FILES
196
+ Data files to use, with comma separated values if
197
+ provided
198
+ --training.data_probs TRAINING.DATA_PROBS
199
+ Data sampling probabilities, with comma separated
200
+ values if provided
201
+ --training.streaming Whether to load dataset in streaming mode, used for
202
+ huge dataset
203
+ --training.num_workers TRAINING.NUM_WORKERS
204
+ Number of subprocesses to use for data loading. 0
205
+ means that the data will be loaded in the main
206
+ process.
207
+ --training.prefetch_factor TRAINING.PREFETCH_FACTOR
208
+ Number of batches loaded in advance by each worker. 2
209
+ means there will be a total of 2 * num_workers batches
210
+ prefetched across all workers.
211
+ --training.data_parallel_replicate_degree TRAINING.DATA_PARALLEL_REPLICATE_DEGREE
212
+ The `data_parallel_replicate_degree` argument
213
+ specifies the degree of data parallelism for weight
214
+ replication. When this value is greater than 1,
215
+ weights will be replicated across
216
+ `data_parallel_replicate_degree` ranks. If
217
+ `data_parallel_shard_degree` is also greater than 1,
218
+ the parallelism method used is HSDP (Hybrid Sharded
219
+ Data Parallelism). Otherwise, the parallelism method
220
+ used is DDP (Distributed Data Parallelism). 1 means
221
+ disabled.
222
+ --training.data_parallel_shard_degree TRAINING.DATA_PARALLEL_SHARD_DEGREE
223
+ The `data_parallel_shard_degree` argument specifies
224
+ the degree of data parallelism for weight sharding.
225
+ When this value is greater than 1, weights will be
226
+ sharded across `data_parallel_shard_degree` ranks. If
227
+ `data_parallel_replicate_degree` is also greater than
228
+ 1, the parallelism method used is HSDP (Hybrid Sharded
229
+ Data Parallelism). Otherwise, the parallelism method
230
+ used is FSDP (Fully Sharded Data Parallelism). -1
231
+ means leftover ranks will be used (After
232
+ DP_REPLICATE/SP/PP). Note that only
233
+ `data_parallel_shard_degree` can be negative. 1 means
234
+ disabled.
235
+ --training.enable_cpu_offload
236
+ Whether to apply CPU offloading of parameters,
237
+ gradients, and optimizer states in FSDP
238
+ --training.tensor_parallel_degree TRAINING.TENSOR_PARALLEL_DEGREE
239
+ Tensor Parallelism degree. 1 means disabled.
240
+ --training.disable_loss_parallel
241
+ Whether to apply loss parallel when sequence parallel
242
+ is enabled
243
+ --training.mixed_precision_param {bfloat16,float32}
244
+ torch dtype to use for parameters when applying mixed
245
+ precision via FSDP. This feature only takes effect
246
+ when data_parallel_shard_degree > 1
247
+ --training.mixed_precision_reduce {float32}
248
+ torch dtype to use for reductions when applying mixed
249
+ precision via FSDP. This feature only takes effect
250
+ when data_parallel_shard_degree > 1
251
+ --training.compile Whether to compile the model
252
+ --training.gc_freq TRAINING.GC_FREQ
253
+ Python garbage control scheduling interval, in steps
254
+ --training.seed TRAINING.SEED
255
+ Choose the base RNG seed used for training
256
+ --training.deterministic
257
+ Use deterministic algorithms wherever possible, may be
258
+ slower
259
+ --metrics.log_freq METRICS.LOG_FREQ
260
+ How often to log metrics to TensorBoard, in iterations
261
+ --metrics.enable_tensorboard
262
+ Whether to log metrics to TensorBoard
263
+ --metrics.disable_color_printing
264
+ Whether to disable color printing in logs
265
+ --metrics.save_tb_folder METRICS.SAVE_TB_FOLDER
266
+ Folder to dump TensorBoard states
267
+ --metrics.rank_0_only
268
+ Whether to save TensorBoard metrics only for rank 0 or
269
+ for all ranks. When pipeline_parallel_degree is > 1,
270
+ this option uses the 0th rank of the last stage
271
+ pipeline group, which is the only stage that computes
272
+ loss metrics.
273
+ --metrics.enable_wandb
274
+ Whether to log metrics to Weights & Biases
275
+ --experimental.enable_async_tensor_parallel
276
+ Whether to apply async tensor parallel (currently only
277
+ effective when compile is enabled)
278
+ --experimental.pipeline_parallel_degree EXPERIMENTAL.PIPELINE_PARALLEL_DEGREE
279
+ Pipeline Parallelism degree, or number of ranks. 1
280
+ means disabled. If using looped schedules, this still
281
+ specifies the number of physical ranks, not the number
282
+ of stages. Stages per rank are inferred from split
283
+ points degree, and schedule.
284
+ --experimental.pipeline_parallel_split_points EXPERIMENTAL.PIPELINE_PARALLEL_SPLIT_POINTS [EXPERIMENTAL.PIPELINE_PARALLEL_SPLIT_POINTS ...]
285
+ Specify comma-separated names of modules to use as the
286
+ beginning of a split point. e.g. "layers.0,layers.2"
287
+ will cause the model to be split into 3 stages, the
288
+ first containing all the layers up to layers.0, the
289
+ second containing layers.0 and up to layers.2, the
290
+ third containing layers.2 and all the remaining
291
+ layers. Note: fully-automated splitting may be enabled
292
+ in the future, but currently the split points must be
293
+ specified manually.
294
+ --experimental.pipeline_parallel_schedule EXPERIMENTAL.PIPELINE_PARALLEL_SCHEDULE
295
+ Specify the Pipeline Parallel schedule to use. The
296
+ supported schedules are: https://github.com/pytorch/py
297
+ torch/blob/de4c2a3b4e89d96334dc678d1c3f2ae51a6630a0/to
298
+ rch/distributed/pipelining/schedules.py#L2161. The
299
+ schedule must be compatible with the split points and
300
+ stages_per_rank. Looped schedules (e.g.
301
+ Interleaved1F1B) require specifying
302
+ pipeline_parallel_degree = number of ranks, and
303
+ split_points = number of stages - 1
304
+ --experimental.pipeline_parallel_schedule_csv EXPERIMENTAL.PIPELINE_PARALLEL_SCHEDULE_CSV
305
+ Specify the path to the pipeline parallel schedule csv
306
+ file to use. The pipeline_parallel_schedule argument
307
+ must be either PipelineScheduleSingle,
308
+ PipelineScheduleMulti, or _PipelineScheduleRuntime.
309
+ --experimental.pipeline_parallel_microbatches EXPERIMENTAL.PIPELINE_PARALLEL_MICROBATCHES
310
+ How many microbatches to split the global training
311
+ batch into when using pipeline parallelism. The global
312
+ training batch size must be evenly divisible by the
313
+ number of microbatches. The default value will be the
314
+ number of pipeline stages, if unspecified.
315
+ --experimental.enable_compiled_autograd
316
+ Enable CompiledAutograd to compile the backward.
317
+ --experimental.context_parallel_degree EXPERIMENTAL.CONTEXT_PARALLEL_DEGREE
318
+ Context parallelism degree. 1 means disabled.
319
+ --experimental.context_parallel_rotate_method EXPERIMENTAL.CONTEXT_PARALLEL_ROTATE_METHOD
320
+ The collective to use in context parallel SDPA for kv
321
+ shards exchange. 'allgather' means to all-gather all
322
+ kv shards on ranks after the first sub-SDPA
323
+ computation, 'alltoall' means to all-to-all shuffle
324
+ the kv shards. The default value is 'allgather'.
325
+ --checkpoint.enable_checkpoint
326
+ Whether to enable checkpoint
327
+ --checkpoint.folder CHECKPOINT.FOLDER
328
+ The folder to store the checkpoints. When
329
+ enable_checkpoint is set to true, checkpoints will be
330
+ in {--job.dump_folder}/{--checkpoint.folder}.
331
+ --checkpoint.interval_type CHECKPOINT.INTERVAL_TYPE
332
+ Checkpointing interval unit of measurement ['step',
333
+ 'seconds']
334
+ --checkpoint.interval CHECKPOINT.INTERVAL
335
+ Checkpointing interval, in steps or seconds depending
336
+ on --checkpoint.interval_type
337
+ --checkpoint.model_weights_only
338
+ When model_weights_only=True, only model weights will
339
+ be saved at the end of training. With this,
340
+ checkpoints can be loaded using `torch.load(...,
341
+ weights_only=True)` after conversion. When
342
+ model_weights_only=False, the full checkpoint will be
343
+ saved. A full checkpoint includes model, optimizer and
344
+ train_state, which can be used to resume training. The
345
+ default value is false.
346
+ --checkpoint.export_dtype {float16,bfloat16,float32}
347
+ Converts to the specified precision when training
348
+ completes and model_weights_only=true. Currently
349
+ supports float32, float16, and bfloat16. The default
350
+ value is float32.
351
+ --checkpoint.create_seed_checkpoint
352
+ Initializes the full model without applying
353
+ parallelisms, and then saves it as a seed checkpoint.
354
+ Note: requires user to call train.py without
355
+ specifying any parallelisms, e.g. NGPU=1. Could be
356
+ implemented as a separate script, but this way shares
357
+ more code.
358
+ --checkpoint.async_mode CHECKPOINT.ASYNC_MODE
359
+ Which async checkpoint mode to use. Currently there
360
+ are 3 different modes. 1. "disabled": synchronized
361
+ checkpointing will be used. 2. "async":
362
+ torch.distributed.checkpoint.async_save will be used.
363
+ 3. "async_with_pinned_mem": this option utilizes a
364
+ dedicated pinned memory space and creates a separate
365
+ process for faster GPU->CPU transfer performance and
366
+ eliminating GIL contention. The cost is increased CPU
367
+ memory usage. If insufficient CPU memory is available,
368
+ performance may degrade due to memory paging. For most
369
+ users, "async" should suffice as the performance
370
+ overhead is typically small (on the order of tens of
371
+ seconds) compared to checkpointing frequency. This
372
+ mode can be employed to pursue near-zero checkpointing
373
+ times (e.g., < 1 second) given appropriate hardware
374
+ support such as ample CPU memory and fast PCIe.
375
+ "disabled" is the default mode.
376
+ --checkpoint.keep_latest_k CHECKPOINT.KEEP_LATEST_K
377
+ Keeps only the latest k checkpoints, purging older
378
+ ones. If 0, keep all checkpoints. 0 is the default
379
+ value.
380
+ --checkpoint.load_step CHECKPOINT.LOAD_STEP
381
+ Load the checkpoint at the specified step. If -1, load
382
+ the latest checkpoint.
383
+ --float8.enable_float8_linear
384
+ If true, swaps `torch.nn.Linear` with `Float8Linear`.
385
+ This feature requires you to install 'torchao' which
386
+ can be found here: https://github.com/pytorch/ao
387
+ --float8.enable_fsdp_float8_all_gather
388
+ Whether enable float8 all-gather in FSDP
389
+ --float8.precompute_float8_dynamic_scale_for_fsdp
390
+ Whether precompute float8 scales dynamically for FSDP
391
+ --float8.scaling_type_input {dynamic,delayed}
392
+ float8 scaling for input, dynamic (default) or delayed
393
+ --float8.scaling_type_weight FLOAT8.SCALING_TYPE_WEIGHT
394
+ float8 scaling for weight, dynamic (default) or delayed
395
+ --float8.scaling_type_grad_output FLOAT8.SCALING_TYPE_GRAD_OUTPUT
396
+ float8 scaling for grad_output, dynamic (default) or delayed
397
+ --comm.init_timeout_seconds COMM.INIT_TIMEOUT_SECONDS
398
+ Timeout for communication operations, during
399
+ initialization and first train step.
400
+ --comm.train_timeout_seconds COMM.TRAIN_TIMEOUT_SECONDS
401
+ Timeout for communication operations after the first
402
+ train step -- usually a tighter bound than during
403
+ initialization.
404
+ --comm.trace_buf_size COMM.TRACE_BUF_SIZE
405
+ Flight recorder ring buffer size, >0 means recording
406
+ by default, 0 means disabled
407
+ --memory_estimation.enabled
408
+ Whether to estimate memory usage for FSDP
409
+ --memory_estimation.disable_fake_mode
410
+ Whether to estimate memory under FakeTensorMode
411
+ ```
412
+ </details>
413
+
414
+ ### Training with `torch.compile`
415
+
416
+ Starting from `torch 2.0`, `torch.compile` has been introduced as a new feature to seamlessly accelerate training processes.
417
+ In `flame`, one can simply enable `torch.compile` by adding `--training.compile` flag to your training script.
418
+
419
+ However, `fla` has integrated numerous fused kernels for acceleration, which may potentially conflict with `torch.compile`.
420
+ We are actively working on resolving these issues to make compilation transparent to users.
421
+ In the meantime, please ensure you are using the latest dependencies.
422
+
423
+ Specifically, **we recommend using `torch>=2.6` and `triton>=3.0`**.
424
+
425
+ ### Training with multiple datasets
426
+
427
+ If you wish to train a model with all-round capabilities (e.g., code, math, and multilingual ability), it's necessary to train on multiple datasets.
428
+ `flame` allows training with multiple datasets easily.
429
+ For example, you can specify the following arguments to train on 6 datasets with different proportions:
430
+
431
+ ```sh
432
+ --training.dataset HuggingFaceFW/fineweb-edu,opencsg/Fineweb-Edu-Chinese-V2.1,OpenCoder-LLM/opc-fineweb-code-corpus,math-ai/AutoMathText,EleutherAI/proof-pile-2,OpenCoder-LLM/opc-fineweb-math-corpus \
433
+ --training.data_probs 0.6,0.15,0.15,0.014,0.058,0.028 \
434
+ ```
435
+
436
+ ### ~Finalizing training~
437
+
438
+ > [!NOTE]
439
+ > We have done this conversion automatically in the training script since our latest updates.
440
+
441
+ Once training is complete, you may want to convert the distributed checkpoints (DCPs) into the 🤗 format for broader use.
442
+ To facilitate this, we provide a straightforward conversion script:
443
+
444
+ ```sh
445
+ python -m flame.utils.convert_dcp_to_hf --path <path_to_model> --step <step> --config <path_to_config> --tokenizer <path_to_tokenizer>
446
+ ```
447
+ After this, your model will be in the 🤗 format, ready to be shared or deployed.
448
+ You can then easily publish your model using the `huggingface_hub` for wider accessibility.
449
+
450
+ ### Continual training
451
+
452
+ If you wish to build upon a strong pre-trained model (in 🤗 format) and continue training, we also offer a script to convert the 🤗 format model back into DCP format.
453
+ This allows you to seamlessly resume training with `flame`.
454
+ ```sh
455
+ python -m flame.utils.convert_hf_to_dcp --model <path_to_hf> --checkpoint <path_to_dcp/checkpoint/step-0>
456
+ ```
457
+ Here, `<path_to_dcp>` is the directory where your distributed checkpoints will be stored.
458
+ The checkpoint is intentionally saved at `<step-0>` within the checkpoint folder to ensure it is loadable by `flame` during the initial training step, similar to how a seed checkpoint is handled.
459
+
460
+ Once the conversion is complete, you can proceed with training using `flame` as usual, continuing from where the pretrained model left off.
461
+
462
+ ## Multi-node training
463
+
464
+ If you have access to multi-node GPUs, consider leveraging them for optimal performance.
465
+ This process is straightforward and well-documented in the PyTorch [docs](https://pytorch.org/docs/stable/elastic/run.html).
466
+
467
+ To set up multi-node training:
468
+ * Set the environment variables `MASTER_ADDR=<ip>` and `MASTER_PORT=<port>` before running the training script across all nodes.
469
+ * If you're using a job scheduler like Slurm, it will handle these variables for you.
470
+
471
+ `torchtitan` provides a [Slurm script](https://github.com/pytorch/torchtitan/blob/main/multinode_trainer.slurm) for multi-node training, which you can use as a reference or starting point.
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MTPTransformerForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "bos_token_id": 1,
7
+ "elementwise_affine": true,
8
+ "eos_token_id": 2,
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "fuse_swiglu": true,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.006,
16
+ "intermediate_size": null,
17
+ "max_position_embeddings": 8192,
18
+ "model_type": "mtp_transformer",
19
+ "n_future_tokens": 3,
20
+ "norm_eps": 1e-06,
21
+ "num_heads": 16,
22
+ "num_hidden_layers": 24,
23
+ "num_kv_heads": null,
24
+ "qk_norm": false,
25
+ "qkv_bias": false,
26
+ "rope_theta": 10000.0,
27
+ "tie_word_embeddings": false,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.51.3",
30
+ "use_cache": true,
31
+ "use_custom_backward": false,
32
+ "vocab_size": 32000,
33
+ "window_size": null
34
+ }
configs/delta_net_1B.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "conv_size": 4,
6
+ "eos_token_id": 2,
7
+ "expand_k": 1,
8
+ "expand_v": 1,
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "hidden_act": "swish",
12
+ "hidden_ratio": 4,
13
+ "hidden_size": 2048,
14
+ "initializer_range": 0.006,
15
+ "intermediate_size": null,
16
+ "model_type": "delta_net",
17
+ "norm_eps": 1e-06,
18
+ "num_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "pad_token_id": 2,
21
+ "qk_activation": "silu",
22
+ "qk_norm": "l2",
23
+ "tie_word_embeddings": false,
24
+ "use_beta": true,
25
+ "use_cache": true,
26
+ "use_gate": false,
27
+ "use_output_norm": true,
28
+ "use_short_conv": true
29
+ }
configs/delta_net_340M.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "conv_size": 4,
5
+ "eos_token_id": 2,
6
+ "expand_k": 1,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "hidden_act": "swish",
10
+ "hidden_ratio": 4,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.006,
13
+ "intermediate_size": null,
14
+ "model_type": "delta_net",
15
+ "norm_eps": 1e-06,
16
+ "norm_first": false,
17
+ "num_heads": 8,
18
+ "num_hidden_layers": 24,
19
+ "qk_activation": "silu",
20
+ "qk_norm": "l2",
21
+ "tie_word_embeddings": false,
22
+ "use_beta": true,
23
+ "use_cache": true,
24
+ "use_gate": false,
25
+ "use_output_norm": true,
26
+ "use_short_conv": true
27
+ }
configs/gla_340M.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn_mode": "chunk",
3
+ "bos_token_id": 1,
4
+ "clamp_min": null,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.006,
14
+ "intermediate_size": null,
15
+ "model_type": "gla",
16
+ "num_heads": 4,
17
+ "num_hidden_layers": 24,
18
+ "norm_eps": 1e-06,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "vocab_size": 32000
24
+ }
configs/gla_7B.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attn": null,
3
+ "attn_mode": "chunk",
4
+ "bos_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "expand_k": 0.5,
7
+ "expand_v": 1,
8
+ "fuse_cross_entropy": true,
9
+ "fuse_norm": true,
10
+ "hidden_act": "swish",
11
+ "hidden_ratio": 4,
12
+ "hidden_size": 4096,
13
+ "initializer_range": 0.006,
14
+ "intermediate_size": 11008,
15
+ "model_type": "gla",
16
+ "norm_eps": 1e-06,
17
+ "num_heads": 16,
18
+ "num_hidden_layers": 32,
19
+ "tie_word_embeddings": false,
20
+ "use_cache": true,
21
+ "use_gk": true,
22
+ "use_gv": false,
23
+ "use_output_gate": true,
24
+ "use_short_conv": false
25
+ }
configs/gsa_340M.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "conv_size": 4,
4
+ "eos_token_id": 2,
5
+ "expand_k": 1,
6
+ "expand_v": 1,
7
+ "elementwise_affine": false,
8
+ "feature_map": "swish",
9
+ "fuse_cross_entropy": true,
10
+ "fuse_norm": true,
11
+ "gate_logit_normalizer": 4,
12
+ "hidden_act": "swish",
13
+ "hidden_ratio": 4,
14
+ "hidden_size": 1024,
15
+ "initializer_range": 0.006,
16
+ "intermediate_size": null,
17
+ "model_type": "gsa",
18
+ "num_heads": 4,
19
+ "num_hidden_layers": 24,
20
+ "num_slots": 64,
21
+ "norm_eps": 1e-06,
22
+ "share_conv_kernel": true,
23
+ "tie_word_embeddings": false,
24
+ "use_cache": true,
25
+ "use_norm": true,
26
+ "use_output_gate": true,
27
+ "use_rope": false,
28
+ "use_short_conv": false
29
+ }
configs/mtp_transformer_340M.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.006,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "mtp_transformer",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000,
18
+ "n_future_tokens": 3
19
+ }
configs/top_transformer_1B.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "elementwise_affine": true,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "fuse_swiglu": true,
8
+ "hidden_act": "swish",
9
+ "hidden_ratio": 4,
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.006,
12
+ "intermediate_size": null,
13
+ "max_position_embeddings": 8192,
14
+ "model_type": "top_transformer",
15
+ "norm_eps": 1e-06,
16
+ "num_heads": 32,
17
+ "num_hidden_layers": 32,
18
+ "num_kv_heads": null,
19
+ "pad_token_id": 2,
20
+ "rope_theta": 10000.0,
21
+ "tie_word_embeddings": false,
22
+ "use_top_loss": true,
23
+ "top_window_size": 4096
24
+ }
configs/top_transformer_340M.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 1024,
9
+ "initializer_range": 0.006,
10
+ "max_position_embeddings": 8192,
11
+ "model_type": "top_transformer",
12
+ "num_heads": 16,
13
+ "num_hidden_layers": 24,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": false,
16
+ "use_cache": true,
17
+ "vocab_size": 32000,
18
+ "use_top_loss": true,
19
+ "top_window_size": 4096
20
+ }
configs/transformer_120M.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": false,
7
+ "hidden_act": "swish",
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "max_position_embeddings": 4096,
11
+ "model_type": "transformer",
12
+ "num_heads": 12,
13
+ "num_hidden_layers": 14,
14
+ "norm_eps": 1e-06,
15
+ "tie_word_embeddings": true,
16
+ "use_cache": true,
17
+ "vocab_size": 32000
18
+ }
configs/transformer_7B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "fuse_cross_entropy": true,
6
+ "fuse_norm": true,
7
+ "hidden_act": "swish",
8
+ "hidden_ratio": 4,
9
+ "hidden_size": 4096,
10
+ "initializer_range": 0.006,
11
+ "intermediate_size": 14336,
12
+ "model_type": "transformer",
13
+ "norm_eps": 1e-06,
14
+ "num_heads": 32,
15
+ "num_hidden_layers": 30,
16
+ "num_kv_heads": 8,
17
+ "rope_theta": 10000.0,
18
+ "tie_word_embeddings": false,
19
+ "use_cache": true,
20
+ "window_size": null
21
+ }
fla/utils.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import contextlib
4
+ import functools
5
+ import os
6
+ from enum import Enum
7
+ from functools import lru_cache
8
+ from typing import Any, Callable, Dict, Literal, Optional, Tuple
9
+
10
+ import torch
11
+ import triton
12
+ from packaging import version
13
+
14
+
15
def tensor_cache(
    fn: Callable[..., torch.Tensor]
) -> Callable[..., torch.Tensor]:
    """
    Decorator that memoizes the single most recent call of ``fn``.

    The wrapper remembers the last positional/keyword arguments together with
    the produced result. Arguments are compared by *identity* (``is``), which
    is cheap and safe for tensors; a repeated call with the very same objects
    returns the cached value without re-executing ``fn``.

    Args:
        fn (Callable[..., torch.Tensor]):
            Function taking tensor inputs and returning tensor outputs.

    Returns:
        Callable[..., torch.Tensor]:
            ``fn`` wrapped with a one-entry cache.
    """
    cached_args: Optional[Tuple] = None
    cached_kwargs: Optional[Dict] = None
    cached_out: Any = None

    @functools.wraps(fn)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        nonlocal cached_args, cached_kwargs, cached_out

        hit = (
            cached_args is not None
            and cached_kwargs is not None
            and len(args) == len(cached_args)
            and len(kwargs) == len(cached_kwargs)
            and all(x is y for x, y in zip(args, cached_args))
            and all(k in cached_kwargs and v is cached_kwargs[k] for k, v in kwargs.items())
        )
        if hit:
            return cached_out

        cached_out = fn(*args, **kwargs)
        cached_args, cached_kwargs = args, kwargs
        return cached_out

    return wrapper
52
+
53
+
54
def input_guard(
    fn: Callable[..., torch.Tensor]
) -> Callable[..., torch.Tensor]:
    """
    Decorator that makes every tensor argument contiguous and runs ``fn``
    under the device context of the first tensor argument found.
    """

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        def as_contiguous(v):
            return v.contiguous() if isinstance(v, torch.Tensor) else v

        fixed_args = tuple(as_contiguous(a) for a in args)
        fixed_kwargs = {k: as_contiguous(v) for k, v in kwargs.items()}

        # Pick the first tensor (positional first, then keyword) to decide
        # which device context to enter; fall back to a no-op context.
        anchor = next(
            (a for a in args if isinstance(a, torch.Tensor)),
            next((v for v in kwargs.values() if isinstance(v, torch.Tensor)), None),
        )
        if anchor is not None:
            ctx = custom_device_ctx(anchor.device.index)
        else:
            ctx = contextlib.nullcontext()

        with ctx:
            return fn(*fixed_args, **fixed_kwargs)

    return wrapper
86
+
87
+
88
# Public alias of `input_guard`, kept so callers using the older name still work.
contiguous = input_guard
89
+
90
+
91
def require_version(version, hint):
    """
    Decorator performing a runtime dependency-version check, using the exact
    pip specifier syntax, before invoking the wrapped function. Tensor
    arguments are additionally made contiguous.
    """
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(ctx, *args, **kwargs):
            # Imported lazily so `transformers` stays an optional dependency.
            from transformers.utils.versions import require_version as _require_version
            _require_version(version, hint)

            def as_contiguous(v):
                return v.contiguous() if isinstance(v, torch.Tensor) else v

            return fn(
                ctx,
                *(as_contiguous(a) for a in args),
                **{k: as_contiguous(v) for k, v in kwargs.items()},
            )
        return wrapper
    return decorator
105
+
106
+
107
def checkpoint(fn):
    """
    Decorator that runs ``fn`` under activation checkpointing
    (``torch.utils.checkpoint.checkpoint``): intermediate activations are
    recomputed during the backward pass instead of being stored.

    Extra keyword arguments passed to the wrapped function (e.g.
    ``use_reentrant``) are forwarded to ``torch.utils.checkpoint.checkpoint``.
    """
    # functools.wraps preserves the wrapped function's name/docstring, which
    # the original implementation lost.
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        return torch.utils.checkpoint.checkpoint(fn, *args, **kwargs)
    return wrapper
111
+
112
+
113
@lru_cache(maxsize=None)
def check_pytorch_version(version_s: str = '2.4') -> bool:
    """Return True iff the installed PyTorch is at least ``version_s``; memoized."""
    required = version.parse(version_s)
    return version.parse(torch.__version__) >= required
116
+
117
+
118
+ def _cpu_device_warning():
119
+ import warnings
120
+ warnings.warn(('Triton is not supported on current platform, roll back to CPU.'), stacklevel=1)
121
+
122
+
123
@lru_cache(maxsize=None)
def get_multiprocessor_count(tensor_idx: int = 0) -> int:
    # Return the multiprocessor (SM) count of the current CUDA device, or -1
    # (with a CPU-fallback warning) when the query fails.
    # NOTE(review): `tensor_idx` is not forwarded to the query below; it only
    # differentiates lru_cache entries — presumably safe only on homogeneous
    # multi-GPU hosts, as the comment below suggests. Verify before reuse.
    try:
        # Only works if Homogeneous hardware
        # TEMPORARY FIX since old version introduce graph break
        return torch.cuda.get_device_properties().multi_processor_count
    except BaseException:
        _cpu_device_warning()
        return -1
132
+
133
+
134
@lru_cache(maxsize=None)
def get_available_device() -> str:
    # Return the active triton backend name (e.g. 'cuda', 'hip', 'xpu'),
    # or 'cpu' with a warning when triton has no usable driver.
    try:
        return triton.runtime.driver.active.get_current_target().backend
    except BaseException:
        _cpu_device_warning()
        return 'cpu'
141
+
142
+
143
@lru_cache(maxsize=None)
def _check_platform() -> Literal['nvidia', 'amd', 'intel', 'musa']:
    """Map the active triton backend name onto a GPU vendor identifier."""
    backend_to_vendor = {
        'cuda': 'nvidia',
        'hip': 'amd',
        'xpu': 'intel',
    }
    device = get_available_device()
    # Unknown backends (e.g. 'musa' or the 'cpu' fallback) pass through unchanged.
    return backend_to_vendor.get(device, device)
154
+
155
+
156
# For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'.
# However, the torch backend is 'cuda' for both Nvidia and AMD GPUs.
# Therefore, we need to check the triton backend to determine the actual GPU vendor.
device = get_available_device() if get_available_device() != 'hip' else 'cuda'
# Torch submodule matching the device string (e.g. `torch.cuda`, `torch.xpu`).
device_torch_lib = getattr(torch, device)
device_platform = _check_platform()

# Vendor flags resolved once at import time.
is_amd = (device_platform == 'amd')
is_intel = (device_platform == 'intel')
is_nvidia = (device_platform == 'nvidia')
# The device-name queries below are short-circuit guarded by the vendor flags,
# so they only run when the matching accelerator stack is present.
is_intel_alchemist = (is_intel and 'Intel(R) Arc(TM) A' in torch.xpu.get_device_name(0))
is_nvidia_hopper = (is_nvidia and ('NVIDIA H' in torch.cuda.get_device_name(0) or torch.cuda.get_device_capability()[0] >= 9))
# CUDA-graph usage is opt-in via the FLA_USE_CUDA_GRAPH environment variable.
use_cuda_graph = (is_nvidia and os.environ.get('FLA_USE_CUDA_GRAPH', '0') == '1')

# Nvidia Ampere or newer, haven't check AMD and intel yet.
is_tf32_supported = (is_nvidia and torch.cuda.get_device_capability(0)[0] >= 8)
# `tl.gather` only exists in newer triton releases.
is_gather_supported = hasattr(triton.language, 'gather')
173
+
174
+
175
def get_all_max_shared_mem():
    # Return the maximum shared memory (in bytes) for every visible device,
    # or [-1] with a CPU-fallback warning when the triton driver query fails.
    try:
        return [
            triton.runtime.driver.active.utils.get_device_properties(i)['max_shared_mem']
            for i in range(device_torch_lib.device_count())
        ]
    except BaseException:
        _cpu_device_warning()
        return [-1]
184
+
185
+
186
class Backend(Enum):
    """Shared-memory capacity (bytes) per GPU architecture."""

    ADA = 101376  # RTX 4090
    AMPERE = 166912  # A100
    HOPPER = 232448  # H100
    DEFAULT = 102400  # Default

    @classmethod
    def get_shared_memory(cls, arch: str) -> int:
        """Return the shared-memory budget for ``arch`` (case-insensitive), falling back to DEFAULT."""
        member = cls.__members__.get(arch.upper())
        return (member if member is not None else cls.DEFAULT).value
198
+
199
+
200
@lru_cache(maxsize=None)
def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool:
    """Whether device ``tensor_idx`` offers at least the shared-memory budget of ``arch``."""
    try:
        available = get_all_max_shared_mem()[tensor_idx]
        return available >= Backend.get_shared_memory(arch)
    except Exception:
        # Any failure (bad index, driver error) is treated as "not enough memory".
        return False
208
+
209
+
210
# Select the autocast helpers and the device-context factory based on the
# installed PyTorch version.
if check_pytorch_version('2.4'):
    # torch>=2.4 exposes device-generic `torch.amp.custom_fwd/bwd`; bind the
    # detected device type. The 'cpu' fallback still targets 'cuda' here.
    device = 'cuda' if device == 'cpu' else device
    autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=device)
    autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=device)

    def custom_device_ctx(index: int):
        # Device context from the detected torch backend (cuda/xpu/...).
        return device_torch_lib.device(index)
else:
    # Older torch only has per-backend amp helpers; CUDA is required.
    assert device == 'cuda', 'Only cuda device is supported for PyTorch version < 2.4.0.'
    autocast_custom_fwd = device_torch_lib.amp.custom_fwd
    autocast_custom_bwd = device_torch_lib.amp.custom_bwd

    def custom_device_ctx(index: int):
        return torch.cuda.device(index)
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.51.3"
6
+ }
logs/none_ro0qpaac/attempt_0/0/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/none_ro0qpaac/attempt_0/1/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/none_ro0qpaac/attempt_0/3/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/none_ro0qpaac/attempt_0/5/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/none_ro0qpaac/attempt_0/6/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/none_ro0qpaac/attempt_0/7/stderr.log ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "flame"
3
+ dynamic = ["version"]
4
+ description = "A minimal training framework for scaling FLA models"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Songlin Yang", email = "yangsl66@mit.edu" },
8
+ { name = "Yu Zhang", email = "yzhang.cs@outlook.com" },
9
+ ]
10
+ license = { file = "LICENSE" }
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
16
+ ]
17
+ requires-python = ">=3.10"
18
+ dependencies = [
19
+ 'torch==2.6',
20
+ 'torchdata',
21
+ 'transformers==4.51.3',
22
+ 'triton>=3.0',
23
+ 'datasets>=3.3.0',
24
+ 'einops',
25
+ 'ninja',
26
+ 'wandb',
27
+ 'tiktoken',
28
+ 'tensorboard',
29
+ 'python-dotenv'
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = ["pytest"]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/fla-org/flame"
37
+
38
+ [build-system]
39
+ requires = ["setuptools>=45", "wheel", "ninja", "torch"]
40
+
41
+ [tool.isort]
42
+ line_length = 127
43
+ multi_line_output = 3
setup.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import ast
4
+ import os
5
+ import re
6
+ from pathlib import Path
7
+
8
+ from setuptools import find_packages, setup
9
+
10
# Read the long description from the README; an explicit encoding avoids
# locale-dependent decoding errors on some platforms.
with open('README.md', encoding='utf-8') as f:
    long_description = f.read()
12
+
13
+
14
def get_package_version():
    """
    Extract ``__version__`` from ``flame/__init__.py`` without importing the
    package (importing could pull in heavy dependencies at build time).

    Raises:
        RuntimeError: if no ``__version__`` assignment is found.
    """
    init_path = Path(os.path.dirname(os.path.abspath(__file__))) / 'flame' / '__init__.py'
    with open(init_path, encoding='utf-8') as f:
        version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
    if version_match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError(f"Unable to find __version__ in {init_path}")
    # literal_eval strips the quotes around the version string safely.
    return ast.literal_eval(version_match.group(1))
18
+
19
+
20
# Install metadata; kept in sync with pyproject.toml, which declares the same
# dependencies and classifiers.
setup(
    name='flame',
    version=get_package_version(),
    description='A minimal training framework for scaling FLA models',
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='Songlin Yang, Yu Zhang',
    author_email='yangsl66@mit.edu, yzhang.cs@outlook.com',
    url='https://github.com/fla-org/flame',
    packages=find_packages(),
    license='MIT',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Topic :: Scientific/Engineering :: Artificial Intelligence'
    ],
    python_requires='>=3.10',
    install_requires=[
        'torch==2.6',
        'torchdata',
        'transformers==4.51.3',
        'triton>=3.0',
        'datasets>=3.3.0',
        'einops',
        'ninja',
        'wandb',
        'tiktoken',
        'tensorboard',
        'python-dotenv'
    ],
)
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "extra_special_tokens": {},
36
+ "legacy": true,
37
+ "model_max_length": 1000000000000000019884624838656,
38
+ "pad_token": null,
39
+ "sp_model_kwargs": {},
40
+ "spaces_between_special_tokens": false,
41
+ "tokenizer_class": "LlamaTokenizer",
42
+ "unk_token": "<unk>",
43
+ "use_default_system_prompt": false
44
+ }
torchtitan/components/__pycache__/checkpoint.cpython-312.pyc ADDED
Binary file (33.1 kB). View file
 
torchtitan/components/__pycache__/dataloader.cpython-312.pyc ADDED
Binary file (3.79 kB). View file
 
torchtitan/components/__pycache__/float8.cpython-312.pyc ADDED
Binary file (6.2 kB). View file
 
torchtitan/components/__pycache__/lr_scheduler.cpython-312.pyc ADDED
Binary file (7.71 kB). View file
 
torchtitan/components/__pycache__/metrics.cpython-312.pyc ADDED
Binary file (19.6 kB). View file
 
torchtitan/components/__pycache__/optimizer.cpython-312.pyc ADDED
Binary file (14.5 kB). View file
 
torchtitan/components/__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (1.09 kB). View file
 
torchtitan/distributed/utils.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import contextlib
8
+ import math
9
+ import os
10
+ from collections.abc import Generator, Iterable
11
+ from datetime import timedelta
12
+
13
+ import torch
14
+ import torch.distributed._functional_collectives as funcol
15
+ import torch.distributed.distributed_c10d as c10d
16
+ from torch import distributed as dist
17
+ from torch.distributed.device_mesh import DeviceMesh
18
+ from torch.distributed.tensor import DTensor
19
+
20
+ from torchtitan.components.ft import ft_clip_grad_norm_util, ft_dist_reduce
21
+ from torchtitan.tools.logging import logger
22
+ from torchtitan.tools.utils import device_module, device_type
23
+
24
+
25
def _dist_reduce(x: torch.Tensor, reduceOp: str, mesh: DeviceMesh) -> float:
    # All-reduce a single-element tensor over `mesh` with `reduceOp` and
    # return the resulting Python scalar.
    # Remove FT replicate dimension if it exists.
    x, reduceOp, mesh = ft_dist_reduce(x, reduceOp, mesh)

    if isinstance(x, DTensor):
        # functional collectives do not support DTensor inputs
        x = x.full_tensor()
    assert x.numel() == 1  # required by `.item()`
    return funcol.all_reduce(x, reduceOp=reduceOp, group=mesh).item()
34
+
35
+
36
def dist_max(x: torch.Tensor, mesh: DeviceMesh) -> float:
    """Global maximum of the single-element tensor ``x`` across ``mesh``."""
    return _dist_reduce(x, reduceOp=c10d.ReduceOp.MAX.name, mesh=mesh)
38
+
39
+
40
def dist_mean(x: torch.Tensor, mesh: DeviceMesh) -> float:
    """Global mean of the single-element tensor ``x`` across ``mesh``."""
    return _dist_reduce(x, reduceOp=c10d.ReduceOp.AVG.name, mesh=mesh)
42
+
43
+
44
def set_determinism(
    world_mesh: DeviceMesh | None,
    device: torch.device,
    seed: int | None = None,
    deterministic: bool = False,
) -> None:
    """
    Set the same DTensor manual seed for all ranks within the same DTensor SPMD group, but different
    seeds across PP groups (if applicable).

    Currently, does not set seeds for the CUDA RNG since TorchTitan always uses DTensor for SPMD parallelisms,
    and DTensor manages its own RNG tracker, but we could extend to support both if needed.

    Set Determinism flags for increased reproducibility with loss of performance.
    """
    if deterministic:
        logger.info("Deterministic algorithm enabled (expect perf degradation).")
        torch.use_deterministic_algorithms(True)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # env var for deterministic CuBLAS
        # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    if not world_mesh:
        if seed is not None:
            torch.manual_seed(seed)
            os.environ["PYTHONHASHSEED"] = str(seed % 2**32)
            logger.debug(f"Single-process job using seed: {seed}")
        return

    # to ensure we can control which ranks have same or different seeds, all ranks agree on a starting seed.
    # if user provides one, we use this. Otherwise rank 0 rolls the dice and everyone else uses that.
    if seed is None:
        # Extract the seed for torch's main generator on rank 0 and standardizes on using that to build
        # seeds for unique SPMD groups
        seed_tensor = torch.get_rng_state()[:8].to(device)
        torch.distributed.broadcast(seed_tensor, src=0)
        # Reinterpret the 8 broadcast bytes as one uint64 seed value.
        seed = seed_tensor.to("cpu").view(torch.uint64).item()

    # For PP + SPMD cases, we want to separate the world into the SPMD mesh and the PP mesh,
    # and choose a unique seed for each rank on the PP mesh.
    if c10d.get_world_size() > 1 and "pp" in world_mesh.mesh_dim_names:
        pp_mesh = world_mesh["pp"]
        seed += pp_mesh.get_local_rank()
        seed %= 2**64

        logger.debug(
            f"PP rank {pp_mesh.get_local_rank()}, Global rank {c10d.get_rank()} using seed: {seed}"
        )
        spmd_mesh_dims = list(
            filter(lambda name: name != "pp", world_mesh.mesh_dim_names)
        )
        spmd_mesh = world_mesh[spmd_mesh_dims] if len(spmd_mesh_dims) else None
    else:
        spmd_mesh = world_mesh
        logger.debug(f"Global Rank {c10d.get_rank()} using seed: {seed}")

    # The native RNGs and python RNG may not be important, except for the 1-D PP case, but we seed them for consistency.
    torch.manual_seed(seed)
    # PYTHONHASHSEED can be a decimal number in the range [0, 2**32 - 1]
    os.environ["PYTHONHASHSEED"] = str(seed % 2**32)

    # As long as we are not in the 1-D (PP-only) case, we will have a seed to use for all ranks of the SPMD mesh.
    # IF PP is also used, this seed is unique per PP rank.
    if spmd_mesh and spmd_mesh.get_coordinate() is not None:
        torch.distributed.tensor._random.manual_seed(seed, spmd_mesh)
111
+
112
+
113
def create_context_parallel_ctx(
    cp_mesh: DeviceMesh,
    cp_buffers: list[torch.Tensor],
    cp_seq_dims: list[int],
    cp_no_restore_buffers: set[torch.Tensor],
    cp_rotate_method: str,
):
    """
    Build a context-parallel context manager sharding ``cp_buffers`` along
    their sequence dimensions over ``cp_mesh``.

    Args:
        cp_mesh: device mesh for the context-parallel dimension.
        cp_buffers: tensors to shard along their sequence dimension.
        cp_seq_dims: per-buffer sequence-dimension indices (parallel to ``cp_buffers``).
        cp_no_restore_buffers: buffers whose original content need not be restored on exit.
        cp_rotate_method: rotation strategy forwarded to ``set_rotate_method``.

    Raises:
        ImportError: if this PyTorch build lacks the experimental Context Parallel API.
    """
    try:
        from torch.distributed.tensor.experimental import context_parallel
        from torch.distributed.tensor.experimental._attention import set_rotate_method
    except ImportError as e:
        # Fail loudly: the original code only printed a message and then fell
        # through to `set_rotate_method`, producing a confusing NameError.
        raise ImportError(
            f"PyTorch version {torch.__version__} does not include the experimental "
            "Context Parallel API. Please update to a newer version."
        ) from e

    set_rotate_method(cp_rotate_method)
    return context_parallel(
        cp_mesh,
        buffers=cp_buffers,
        buffer_seq_dims=cp_seq_dims,
        no_restore_buffers=cp_no_restore_buffers,
    )
136
+
137
+
138
def get_train_context(
    enable_loss_parallel: bool, enable_compiled_autograd: bool
) -> Generator[None, None, None]:
    # Build a reusable training context manager that (optionally) combines
    # loss parallelism, compiled autograd, and a context-parallel region.
    @contextlib.contextmanager
    def context(cp_context: Generator[None, None, None] | None = None):
        with contextlib.ExitStack() as stack:
            if enable_loss_parallel:
                stack.enter_context(torch.distributed.tensor.parallel.loss_parallel())

            if enable_compiled_autograd:
                stack.enter_context(
                    torch._dynamo.utils.maybe_enable_compiled_autograd(True)
                )

            if cp_context is not None:
                from torch.nn.attention import sdpa_kernel, SDPBackend

                # Restrict SDPA to this fixed set of backends while the
                # context-parallel region is active.
                stack.enter_context(
                    sdpa_kernel(
                        [
                            SDPBackend.FLASH_ATTENTION,
                            SDPBackend.EFFICIENT_ATTENTION,
                            SDPBackend.CUDNN_ATTENTION,
                        ]
                    )
                )
                stack.enter_context(cp_context)

            yield

    return context
169
+
170
+
171
def init_distributed(job_config):
    """
    Initialize the default process group, configuring NCCL flight-recorder and
    error-handling environment variables from ``job_config`` beforehand.
    """
    def _warn_overwrite_env(env, val):
        # Overwrite an env var, warning if the user had set it themselves.
        if env in os.environ:
            logger.warning(
                f"ENV[{env}] = {os.environ[env]} will be overridden to {val} based on job config"
            )
        os.environ[env] = val

    def _get_distributed_backend(job_config):
        # Pick the backend for the current device type; append gloo for CPU
        # when CPU offload is enabled.
        backend = "nccl"
        if device_type in torch.distributed.Backend.default_device_backend_map:
            backend = torch.distributed.Backend.default_device_backend_map.get(
                device_type
            )
        if job_config.training.enable_cpu_offload:
            backend = f"{device_type}:{backend},cpu:gloo"
        return backend

    TRACE_BUFFER_SIZE = "TORCH_NCCL_TRACE_BUFFER_SIZE"
    TRACE_FILE = "TORCH_NCCL_DEBUG_INFO_TEMP_FILE"
    DUMP_ON_TIMEOUT = "TORCH_NCCL_DUMP_ON_TIMEOUT"
    ASYNC_ERROR_HANDLING = "TORCH_NCCL_ASYNC_ERROR_HANDLING"
    SKIP_CLEANUP = "3"

    # FlightRecorder is incompatible with =1 mode where watchdog aborts work, must use =3 (skipcleanup)
    # to get flight recorder dumps. See https://github.com/pytorch/pytorch/issues/121055
    # This could be done only when flight recorder is enabled, but its nice to be consistent to avoid subtle
    # behavior differences
    _warn_overwrite_env(ASYNC_ERROR_HANDLING, SKIP_CLEANUP)

    # enable torch nccl flight recorder in the mode that would dump files if timeout is detected
    _warn_overwrite_env(TRACE_BUFFER_SIZE, str(job_config.comm.trace_buf_size))
    if job_config.comm.trace_buf_size > 0:
        # dump on timeout by default if trace buffer is enabled
        _warn_overwrite_env(DUMP_ON_TIMEOUT, "1")
        dump_dir = f"{job_config.job.dump_folder}/comm_trace"
        os.makedirs(dump_dir, exist_ok=True)
        _warn_overwrite_env(TRACE_FILE, f"{dump_dir}/rank_")

    # to mitigate the memory issue that collectives using
    # async_op=True hold memory longer than they should
    # such as those in tensor parallelism
    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

    torch.distributed.init_process_group(
        backend=_get_distributed_backend(job_config),
        timeout=timedelta(seconds=job_config.comm.init_timeout_seconds),
    )
219
+
220
+
221
def set_pg_timeouts(timeout: timedelta, world_mesh: DeviceMesh) -> None:
    """
    Sets the timeout for all PGs in the provided mesh, and the default (world) group.

    Note: synchronizes via a barrier, before changing the timeouts. This is important, because
    otherwise you may face a race where the slow rank has not reached the timeout reduction point
    yet due to slow operations permitted under the old timeout value, but other faster ranks may
    start issuing collectives under the new shorter timeout and then immediately timeout.
    """
    logger.info(
        f"Synchronizing and adjusting timeout for all ProcessGroups to {timeout}"
    )
    # Ensure that all the ranks have reached the point of setting the new timeout-
    # otherwise, some ranks may issue collectives with the new/shorter timeout and
    # those may time out, before other ranks have finished with initialization done
    # under the old/slow timeout.
    torch.distributed.barrier(device_ids=[device_module.current_device()])
    device_module.synchronize()

    groups = [world_mesh.get_group(mesh_dim) for mesh_dim in range(world_mesh.ndim)]

    # None represents the 'default' PG, not part of the mesh
    groups.append(None)
    for group in groups:
        # NOTE(review): uses a private c10d API (_set_pg_timeout); may break
        # across torch versions — confirm on upgrade.
        torch.distributed.distributed_c10d._set_pg_timeout(timeout, group)
246
+
247
+
248
+ @torch.no_grad()
249
+ def clip_grad_norm_(
250
+ parameters: torch.Tensor | Iterable[torch.Tensor],
251
+ max_norm: float,
252
+ norm_type: float = 2.0,
253
+ error_if_nonfinite: bool = False,
254
+ foreach: bool | None = None,
255
+ pp_mesh: DeviceMesh | None = None,
256
+ ) -> torch.Tensor:
257
+ """
258
+ Clip the gradient norm of an iterable of parameters.
259
+
260
+ Gradient norm clipping requires computing the gradient norm over the entire model.
261
+ `torch.nn.utils.clip_grad_norm_` only computes gradient norm along DP/FSDP/TP dimensions.
262
+ We need to manually reduce the gradient norm across PP stages.
263
+ See https://github.com/pytorch/torchtitan/issues/596 for details.
264
+
265
+ Args:
266
+ parameters: an iterable of Tensors or a single Tensor that will have gradients normalized
267
+ max_norm (float): max norm of the gradients
268
+ norm_type (float): type of the used p-norm. Can be ``'inf'`` for
269
+ infinity norm.
270
+ error_if_nonfinite (bool): if True, an error is thrown if the total
271
+ norm of the gradients from :attr:`parameters` is ``nan``,
272
+ ``inf``, or ``-inf``. Default: False (will switch to True in the future)
273
+ foreach (bool): use the faster foreach-based implementation.
274
+ If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently
275
+ fall back to the slow implementation for other device types.
276
+ Default: ``None``
277
+ pp_mesh: pipeline parallel device mesh. If not None, will reduce gradient norm across PP stages.
278
+
279
+ Returns:
280
+ Total norm of the parameter gradients (viewed as a single vector).
281
+
282
+ """
283
+ grads = [p.grad for p in parameters if p.grad is not None]
284
+ total_norm = torch.nn.utils.get_total_norm(
285
+ grads, norm_type, error_if_nonfinite, foreach
286
+ )
287
+
288
+ # If total_norm is a DTensor, the placements must be `torch.distributed._tensor.ops.math_ops._NormPartial`.
289
+ # We can simply reduce the DTensor to get the total norm in this tensor's process group
290
+ # and then convert it to a local tensor.
291
+ # NOTE: It has two purposes:
292
+ # 1. to make sure the total norm is computed correctly when PP is used (see below)
293
+ # 2. to return a reduced total_norm tensor whose .item() would return the correct value
294
+ if isinstance(total_norm, DTensor):
295
+ # Will reach here if any non-PP parallelism is used.
296
+ # If only using PP, total_norm will be a local tensor.
297
+
298
+ # Remove FT replicate dimension if it exists.
299
+ total_norm = ft_clip_grad_norm_util(total_norm)
300
+ total_norm = total_norm.full_tensor()
301
+
302
+ if pp_mesh is not None:
303
+ if math.isinf(norm_type):
304
+ dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=pp_mesh.get_group())
305
+ else:
306
+ total_norm **= norm_type
307
+ dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=pp_mesh.get_group())
308
+ total_norm **= 1.0 / norm_type
309
+
310
+ torch.nn.utils.clip_grads_with_norm_(parameters, max_norm, total_norm, foreach)
311
+ return total_norm
torchtitan/experiments/deepseek_v3/README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Running DeepSeek in Titan (experimental)
2
+
3
+ This folder contains a DeepSeek model supporting v2 and v3 as well as kernels
4
+ and scripts needed to run it.
5
+
6
+ ## Inference
7
+
8
+ ### Prerequisites:
9
+
10
+ You will need to download a DeepSeek model's weights if you want to run a
11
+ pre-trained checkpoint. We provided a script to download the weights from
12
+ HuggingFace Model Hub:
13
+ ```bash
14
+ python download.py [vX]
15
+ ```
16
+ where `vX` can be v2 or v3, both are supported. You may be required to create a
17
+ HuggingFace account and log in first.
18
+
19
+ ### Running inference:
20
+
21
+ The inference script is in `generate.py`. You can run it with the following
22
+ command:
23
+ ```bash
24
+ torchrun --standalone --nproc-per-node 4 generate.py
25
+ ```
26
+ This will run inference on the `DeepSeek-V2-Lite-Chat` model using 4 GPUs by
27
+ default.
28
+
29
+ Alternatively, you can run inference by using `bash inference.sh`, optionally
30
+ followed by your prompt.
31
+
32
+ ## Training
33
+
34
+ The training script is in `train.py`. You can run it with the following command:
35
+ ```bash
36
+ torchrun --standalone --nproc-per-node 8 train.py
37
+ ```
38
+
39
+ This will run training on the `DeepSeek-V2-Lite-Chat` model using 8 GPUs by
40
+ default, with pipeline parallel, expert parallel, and data parallel enabled.
torchtitan/experiments/deepseek_v3/indices.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import triton
9
+ import triton.language as tl
10
+
11
+
12
+ __all__ = ["generate_permute_indices"]
13
+
14
+
15
+ @triton.jit
16
+ def fill_indices_kernel(
17
+ tokens_per_expert_group_ptr, # *Pointer* to first input vector.
18
+ start_index_values_ptr, # *Pointer* to second input vector.
19
+ write_offsets_ptr, # *Pointer* to third input vector.
20
+ output_ptr, # *Pointer* to output vector.
21
+ experts_per_rank, # Number of experts per rank.
22
+ num_ranks, # Number of expert ranks.
23
+ ):
24
+ # There are multiple 'programs' processing different data. We identify which program
25
+ # we are here:
26
+ pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0.
27
+ # The total number of programs in the launch grid.
28
+ num_programs = tl.num_programs(axis=0)
29
+ # We map the programs (blocks) to the experts.
30
+ for expert_id in tl.range(pid, experts_per_rank, step=num_programs):
31
+ # Read this expert's write offset.
32
+ write_offset = tl.load(write_offsets_ptr + expert_id)
33
+ # Loop over the ranks.
34
+ for r in tl.range(num_ranks):
35
+ # Slot in the tokens_per_expert_group array.
36
+ i = r * experts_per_rank + expert_id
37
+ start_index = tl.load(start_index_values_ptr + i)
38
+ length = tl.load(tokens_per_expert_group_ptr + i)
39
+ # Write the indices.
40
+ for l in tl.range(length):
41
+ val = start_index + l
42
+ tl.store(output_ptr + write_offset + l, val)
43
+ write_offset += length
44
+
45
+
46
+ def fill_indices(
47
+ tokens_per_expert_group: torch.Tensor,
48
+ start_index_values: torch.Tensor,
49
+ write_offsets: torch.Tensor,
50
+ experts_per_rank: int,
51
+ num_ranks: int,
52
+ max_len: int,
53
+ ):
54
+ # We need to preallocate the output.
55
+ permuted_indices = torch.full(
56
+ (max_len,), -1, dtype=torch.int32, device=tokens_per_expert_group.device
57
+ )
58
+ # Analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int].
59
+ # In this case, we use a 1D grid where the size is the number of blocks (TODO: bump this value).
60
+ grid = lambda meta: (1,)
61
+ # Each torch.tensor object is implicitly converted into a pointer to its first element.
62
+ fill_indices_kernel[grid](
63
+ tokens_per_expert_group,
64
+ start_index_values,
65
+ write_offsets,
66
+ permuted_indices,
67
+ experts_per_rank,
68
+ num_ranks,
69
+ )
70
+ return permuted_indices
71
+
72
+
73
+ def fill_indices_cpu(
74
+ tokens_per_expert_group: torch.Tensor,
75
+ start_index_values: torch.Tensor,
76
+ write_offsets: torch.Tensor,
77
+ experts_per_rank: int,
78
+ num_ranks: int,
79
+ max_len: int,
80
+ ):
81
+ # We need to preallocate the output.
82
+ permuted_indices = torch.full((max_len,), -1, dtype=torch.int32)
83
+ # Fill the permuted indices
84
+ # For each local expert
85
+ for e in range(experts_per_rank):
86
+ write_start = write_offsets[e]
87
+ # For each remote rank
88
+ for r in range(num_ranks):
89
+ i = r * experts_per_rank + e
90
+ start_index = start_index_values[i]
91
+ length = tokens_per_expert_group[i]
92
+ # Fill in the indices
93
+ permuted_indices[write_start : write_start + length] = torch.arange(
94
+ start_index, start_index + length
95
+ )
96
+ write_start += length
97
+ return permuted_indices
98
+
99
+
100
+ def generate_permute_indices(
101
+ tokens_per_expert_group: torch.Tensor,
102
+ experts_per_rank: int,
103
+ num_ranks: int,
104
+ max_len: int,
105
+ alignment: int,
106
+ use_cpu: bool = False,
107
+ ):
108
+ # Prepare permutation indices and the number of tokens for each expert. The
109
+ # permutation indices are the indices of the tokens for each expert. The
110
+ # number of tokens for each expert is the sum of the number of tokens for
111
+ # such experts from all ranks. This number is aligned to the provided
112
+ # alignment requirement (usually comes from group gemm).
113
+
114
+ # Args:
115
+ # tokens_per_expert_group: number of tokens for each expert from all ranks.
116
+ # experts_per_rank: number of experts per rank.
117
+ # num_ranks: number of ranks.
118
+ # max_len: maximum length of the output index vector. If greater than
119
+ # total number of tokens, the remaining indices are set to -1.
120
+ # alignment: alignment for each returned element in `m_sizes`.
121
+ # use_cpu: whether to use cpu or gpu.
122
+ # Returns:
123
+ # permuted_indices: permutation indices.
124
+ # m_sizes: number of tokens for each expert.
125
+
126
+ # `tokens_per_expert_group` is of shape (num_ranks * experts_per_rank,), for example:
127
+ # From: | rank 0 | rank 1 |
128
+ # To: | E0 | E1 | E2 | E3 | E0 | E1 | E2 | E3 |
129
+ # | 4 | 2 | 1 | 3 | 1 | 2 | 3 | 4 |
130
+
131
+ # Prefix sum to get the start index value of each expert
132
+ start_index_values = (
133
+ torch.cumsum(tokens_per_expert_group, 0) - tokens_per_expert_group
134
+ )
135
+ # Chunk sizes for each expert
136
+ chunk_size_per_expert = tokens_per_expert_group.view(num_ranks, -1).sum(0)
137
+ # Align the chunk sizes to the given alignment
138
+ m_sizes = ((chunk_size_per_expert + alignment - 1) // alignment * alignment).to(
139
+ torch.int32
140
+ )
141
+ # Perform another prefix sum to get the write offset of each expert in `permuted_indices`
142
+ write_offsets = torch.cumsum(m_sizes, 0) - m_sizes
143
+ # Select the method to fill the permuted indices
144
+ fill_fn = fill_indices_cpu if use_cpu else fill_indices
145
+ # Fill the permuted indices
146
+ permuted_indices = fill_fn(
147
+ tokens_per_expert_group,
148
+ start_index_values,
149
+ write_offsets,
150
+ experts_per_rank,
151
+ num_ranks,
152
+ max_len,
153
+ )
154
+ return permuted_indices, m_sizes
155
+
156
+
157
+ # Below is for testing only
158
+
159
+
160
+ def test():
161
+ device = torch.device("cuda", 0)
162
+ experts_per_rank = 4
163
+ num_ranks = 4
164
+ tokens_per_expert_group = torch.full(
165
+ (num_ranks * experts_per_rank,), 4, dtype=torch.int32, device=device
166
+ )
167
+ max_len = 128
168
+ alignment = 32
169
+ # Use the GPU kernel
170
+ permuted_indices_gpu, m_sizes = generate_permute_indices(
171
+ tokens_per_expert_group, experts_per_rank, num_ranks, max_len, alignment
172
+ )
173
+ # Use the CPU method
174
+ permuted_indices_cpu, _ = generate_permute_indices(
175
+ tokens_per_expert_group,
176
+ experts_per_rank,
177
+ num_ranks,
178
+ max_len,
179
+ alignment,
180
+ use_cpu=True,
181
+ )
182
+ # Check that the results are the same
183
+ assert torch.equal(permuted_indices_gpu.cpu(), permuted_indices_cpu)
184
+ assert torch.equal(
185
+ torch.remainder(m_sizes, alignment),
186
+ torch.zeros(experts_per_rank, device=device),
187
+ )
188
+ # Print the results
189
+ print(permuted_indices_gpu)
190
+ print(m_sizes)
191
+ print("Success")
192
+
193
+
194
+ if __name__ == "__main__":
195
+ test()
torchtitan/experiments/deepseek_v3/symm_mem_recipes/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .triton_on_device_all_to_all_v import OnDeviceAllToAllV
8
+
9
+ __all__ = [
10
+ "OnDeviceAllToAllV",
11
+ ]
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_on_device_all_to_all_v.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ import torch.distributed._symmetric_memory as symm_mem
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ from .triton_barrier import blockwise_barrier
14
+ from .triton_utils import sync_threads
15
+
16
+
17
+ @triton.jit
18
+ def _exchange_row_offsets(
19
+ split_sizes_ptrs,
20
+ rank: tl.constexpr,
21
+ world_size: tl.constexpr,
22
+ BLOCKS_PER_REMOTE_RANK: tl.constexpr,
23
+ ):
24
+ remote_rank = tl.program_id(0) // BLOCKS_PER_REMOTE_RANK
25
+
26
+ # split_sizes_ptr for all ranks
27
+ # All these vector stacks into split_sizes_matrix
28
+ split_sizes_ptrs = split_sizes_ptrs.to(tl.pointer_type(tl.uint64))
29
+
30
+ # split_sizes_matrix[remote_rank, :]
31
+ input_split_sizes_ptr = tl.load(split_sizes_ptrs + remote_rank).to(
32
+ tl.pointer_type(tl.int64)
33
+ )
34
+
35
+ offsets_ = tl.arange(0, world_size)
36
+ input_split_sizes = tl.load(
37
+ input_split_sizes_ptr + offsets_, mask=offsets_ <= rank, other=0
38
+ )
39
+
40
+ num_rows = tl.load(input_split_sizes_ptr + rank)
41
+ input_row_offset = tl.sum(input_split_sizes) - num_rows
42
+
43
+ # split_sizes_matrix[:, rank]
44
+ output_split_sizes_ptrs = (
45
+ tl.load(split_sizes_ptrs + offsets_).to(tl.pointer_type(tl.int64)) + rank
46
+ )
47
+ output_split_sizes = tl.load(
48
+ output_split_sizes_ptrs, mask=offsets_ <= remote_rank, other=0
49
+ )
50
+ output_row_offset = tl.sum(output_split_sizes) - num_rows
51
+
52
+ return input_row_offset, output_row_offset, num_rows
53
+
54
+
55
+ @triton.jit
56
+ def on_device_all_to_all_v_kernel(
57
+ output_ptr,
58
+ output_splits_ptr,
59
+ input_ptrs,
60
+ input_splits_ptr,
61
+ signal_pad_ptrs,
62
+ dim: tl.constexpr, # Separate dim for easier vectorization
63
+ rank: tl.constexpr,
64
+ world_size: tl.constexpr,
65
+ BLOCKS_PER_REMOTE_RANK: tl.constexpr,
66
+ UNROLL_FACTOR: tl.constexpr,
67
+ BLOCK_SIZE: tl.constexpr,
68
+ ):
69
+ blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
70
+ sync_threads()
71
+
72
+ remote_rank = tl.program_id(0) // BLOCKS_PER_REMOTE_RANK
73
+ block_offset = tl.program_id(0) % BLOCKS_PER_REMOTE_RANK
74
+
75
+ input_row_offset, output_row_offset, num_rows = _exchange_row_offsets(
76
+ input_splits_ptr, rank, world_size, BLOCKS_PER_REMOTE_RANK
77
+ )
78
+
79
+ output_splits_ptr = output_splits_ptr.to(tl.pointer_type(tl.uint64))
80
+ if block_offset == 0:
81
+ # Update output_splits
82
+ tl.store(output_splits_ptr + remote_rank, num_rows)
83
+
84
+ input_ptr = (
85
+ tl.load(input_ptrs.to(tl.pointer_type(tl.uint64)) + remote_rank).to(
86
+ tl.pointer_type(tl.bfloat16)
87
+ )
88
+ + input_row_offset * dim
89
+ )
90
+ output_ptr = output_ptr + output_row_offset * dim
91
+
92
+ outer_loop_step = BLOCK_SIZE * UNROLL_FACTOR
93
+ outer_loop_iters_per_rank = tl.cdiv(
94
+ tl.cdiv(num_rows * dim, outer_loop_step), BLOCKS_PER_REMOTE_RANK
95
+ )
96
+ numel_per_rank = outer_loop_step * outer_loop_iters_per_rank
97
+ offset = numel_per_rank * block_offset
98
+ end = tl.minimum(numel_per_rank * (block_offset + 1), num_rows * dim)
99
+
100
+ unroll_region_size = (end - offset) // outer_loop_step * outer_loop_step
101
+ for i in tl.range(offset, offset + unroll_region_size, outer_loop_step):
102
+ datas = []
103
+ for j in tl.range(
104
+ i,
105
+ i + outer_loop_step,
106
+ BLOCK_SIZE,
107
+ loop_unroll_factor=UNROLL_FACTOR,
108
+ ):
109
+ offsets = j + tl.arange(0, BLOCK_SIZE)
110
+ data = tl.load(input_ptr + offsets)
111
+ tl.store(output_ptr + offsets, data)
112
+
113
+ offset += unroll_region_size
114
+ while offset < end:
115
+ offsets = offset + tl.arange(0, BLOCK_SIZE)
116
+ mask = offsets < num_rows * dim
117
+ data = tl.load(input_ptr + offsets, mask=mask)
118
+ tl.store(output_ptr + offsets, data, mask=mask)
119
+ offset += BLOCK_SIZE
120
+
121
+ sync_threads()
122
+ blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
123
+ return
124
+
125
+
126
+ def _on_device_all_to_all_v(
127
+ output: torch.Tensor,
128
+ output_splits: torch.Tensor,
129
+ input: torch.Tensor,
130
+ input_splits: torch.Tensor,
131
+ group: dist.ProcessGroup = dist.group.WORLD,
132
+ BLOCKS_PER_REMOTE_RANK=8,
133
+ UNROLL_FACTOR: int = 8,
134
+ BLOCK_SIZE: int = 16384,
135
+ ):
136
+ assert output.dim() == 2, f"{output.shape}"
137
+ assert input.dim() == 2, f"{input.shape}"
138
+ assert output.shape[1] == input.shape[1]
139
+
140
+ dim = output.shape[1]
141
+ input_hdl = symm_mem.rendezvous(input, group=group)
142
+ input_splits_hdl = symm_mem.rendezvous(input_splits, group=group)
143
+
144
+ num_blocks = input_hdl.world_size * BLOCKS_PER_REMOTE_RANK
145
+ kernel = on_device_all_to_all_v_kernel[(num_blocks, 1, 1)](
146
+ output,
147
+ output_splits,
148
+ input_hdl.buffer_ptrs_dev,
149
+ input_splits_hdl.buffer_ptrs_dev,
150
+ input_hdl.signal_pad_ptrs_dev,
151
+ dim=dim,
152
+ rank=input_hdl.rank,
153
+ world_size=input_hdl.world_size,
154
+ BLOCKS_PER_REMOTE_RANK=BLOCKS_PER_REMOTE_RANK,
155
+ UNROLL_FACTOR=UNROLL_FACTOR,
156
+ BLOCK_SIZE=BLOCK_SIZE,
157
+ num_warps=16,
158
+ )
159
+ # log_triton_kernel(kernel)
160
+ return output
161
+
162
+
163
+ class OnDeviceAllToAllV(torch.autograd.Function):
164
+ # A symmetric memory holding the grad_output during backward
165
+ grad_output_buf = None
166
+ # A symmetric memory for exchanges split sizes during both forward and backward
167
+ splits_buf = None
168
+ # Maximum output length (need to be set before use of OnDeviceAllToAllV)
169
+ max_output_len = None
170
+
171
+ @staticmethod
172
+ def forward(
173
+ ctx,
174
+ input: torch.Tensor,
175
+ input_splits: torch.Tensor,
176
+ group: dist.ProcessGroup = dist.group.WORLD,
177
+ ):
178
+ """
179
+ Args:
180
+ input: input tensor with data for all ranks concatenated.
181
+ input_splits: input splits of shape (group.world_size,)
182
+ group: process group to scope the collective.
183
+ """
184
+ # Initialize input splits buffer (one time only)
185
+ if OnDeviceAllToAllV.splits_buf is None:
186
+ OnDeviceAllToAllV.splits_buf = symm_mem.empty(
187
+ *input_splits.shape,
188
+ dtype=input_splits.dtype,
189
+ device=input_splits.device,
190
+ )
191
+
192
+ if OnDeviceAllToAllV.max_output_len is None:
193
+ raise RuntimeError(
194
+ "Please set max output length via `OnDeviceAllToAllV.max_output_len = ...`"
195
+ )
196
+
197
+ # Allocate output buffer
198
+ output = input.new_empty(OnDeviceAllToAllV.max_output_len, *input.shape[1:])
199
+ # Allocate output splits tensor
200
+ output_splits = torch.empty_like(input_splits)
201
+ # Copy input splits to the buffer
202
+ OnDeviceAllToAllV.splits_buf.copy_(input_splits)
203
+
204
+ # Shuffle input to output
205
+ _on_device_all_to_all_v(
206
+ output, output_splits, input, OnDeviceAllToAllV.splits_buf, group=group
207
+ )
208
+
209
+ # Output splits in forward is the input splits in backward
210
+ ctx.save_for_backward(output_splits)
211
+ ctx.group = group
212
+ ctx.input_shape = input.shape
213
+ return output, output_splits
214
+
215
+ @staticmethod
216
+ def backward(ctx, grad_output, grad_splits):
217
+ """
218
+ Backward is implemented as a shuffle of the output's gradients to the input.
219
+ Args:
220
+ `grad_output`: output's gradients passed from the downstream.
221
+ `grad_splits`: unused.
222
+ """
223
+
224
+ # Initialize grad_output buffer (one time only)
225
+ if OnDeviceAllToAllV.grad_output_buf is None:
226
+ assert (
227
+ OnDeviceAllToAllV.max_output_len is not None
228
+ ), "`max_output_len` not set"
229
+ OnDeviceAllToAllV.grad_output_buf = symm_mem.empty(
230
+ OnDeviceAllToAllV.max_output_len,
231
+ *grad_output.shape[1:],
232
+ dtype=grad_output.dtype,
233
+ device=grad_output.device,
234
+ )
235
+
236
+ # TODO: is there a way to tell autograd to feed grad_output directly to
237
+ # our symm_mem buffer?
238
+ OnDeviceAllToAllV.grad_output_buf.narrow(0, 0, grad_output.shape[0]).copy_(
239
+ grad_output
240
+ )
241
+
242
+ # Size info
243
+ (grad_output_splits,) = ctx.saved_tensors
244
+ OnDeviceAllToAllV.splits_buf.copy_(grad_output_splits)
245
+ grad_input_splits = torch.empty_like(grad_output_splits) # unused
246
+ grad_input = grad_output.new_empty(*ctx.input_shape)
247
+
248
+ # Shuffle gradients back to the input
249
+ _on_device_all_to_all_v(
250
+ grad_input,
251
+ grad_input_splits,
252
+ OnDeviceAllToAllV.grad_output_buf,
253
+ OnDeviceAllToAllV.splits_buf,
254
+ group=ctx.group,
255
+ )
256
+ return grad_input, None, None
257
+
258
+
259
+ # Alias
260
+ on_device_all_to_all_v = OnDeviceAllToAllV.apply
torchtitan/experiments/deepseek_v3/train.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # torchrun --standalone --nproc-per-node 8 run.py
8
+ import torch
9
+ import torch.distributed as dist
10
+ from checkpoint import load_weights_from_hf
11
+ from model import DeepseekForCausalLM
12
+ from model_config import deepseek_config_registry
13
+
14
+ from torch.distributed.device_mesh import DeviceMesh
15
+ from torch.distributed.fsdp import fully_shard
16
+ from torch.distributed.pipelining import PipelineStage, Schedule1F1B
17
+
18
+
19
+ # Use DeepSeek-V2-Lite as a proxy
20
+ model_id = "deepseek-ai/DeepSeek-V2-Lite"
21
+
22
+
23
+ # Run full model
24
+ def run_full_model(
25
+ mesh: DeviceMesh,
26
+ ):
27
+ rank = dist.get_rank()
28
+ device_count = torch.cuda.device_count()
29
+ device = torch.device("cuda", rank % device_count)
30
+
31
+ pp_mesh = mesh["pp"]
32
+ ep_mesh = mesh["ep"]
33
+ pp_rank = pp_mesh.get_local_rank()
34
+ ep_rank = ep_mesh.get_local_rank()
35
+ pp_size = pp_mesh.size()
36
+ ep_size = ep_mesh.size()
37
+
38
+ # Get model configs
39
+ model_args = deepseek_config_registry[model_id]
40
+ # [Note]: I am making the model smaller for testing / avoiding OOM. If you
41
+ # have sufficient GPUs for model parallelism, you can remove this line.
42
+ model_args.num_hidden_layers = 16
43
+
44
+ # Apply model parallelism
45
+ model_args.ep_size = ep_size
46
+ model_args.num_stages = pp_size
47
+ model_args.stage_idx = pp_rank
48
+ print(model_args)
49
+
50
+ # Instantiate model
51
+ with device, mesh:
52
+ model = DeepseekForCausalLM(model_args)
53
+
54
+ # Load weights
55
+ load_weights_from_hf(model, model_id, device)
56
+ model.train()
57
+
58
+ # Apply data parallelism
59
+ fsdp_mesh = mesh["fsdp"]
60
+ hsdp_mesh = mesh["ep", "fsdp"]
61
+ # Using `reshard_after_forward=False` to implement Zero-2, i.e. sharding the
62
+ # optimizer (Zero-1) and gradients (Zero-2), but not the model weights.
63
+ # Reason: the MoE is "sparsely activated" compared to the dense model, thus
64
+ # it will be ineconomical re-gather the weights.
65
+ for layer in model.model.layers.values():
66
+ # Apply FSDP to experts
67
+ if hasattr(layer.mlp, "experts"):
68
+ for expert in layer.mlp.experts.values():
69
+ fully_shard(expert, mesh=fsdp_mesh, reshard_after_forward=False)
70
+ # Apply HSDP to other parts such as attention, layernorm, because they
71
+ # are doing DDP on EP dimension
72
+ fully_shard(layer, mesh=hsdp_mesh, reshard_after_forward=False)
73
+
74
+ # Apply HSDP on root model (lm_head, embeddings, etc)
75
+ fully_shard(model, mesh=hsdp_mesh, reshard_after_forward=False)
76
+
77
+ # Synthetic setting
78
+ microbatches = pp_size * 2
79
+
80
+ # Use Symmetric Memory for MoE token shuffle.
81
+ # TODO: we are rewriting `moe_on_device` function. `setup_symm_mem` is
82
+ # currently supported for forward only. See `generate.py`.
83
+ # model.setup_symm_mem(torch.bfloat16, device)
84
+
85
+ # Example inputs
86
+ torch.manual_seed(ep_rank)
87
+ bs = 4
88
+ seqlen = 128
89
+ x = torch.randint(model_args.vocab_size, (microbatches * bs, seqlen), device=device)
90
+ label = torch.rand(microbatches * bs, seqlen, model_args.vocab_size, device=device)
91
+
92
+ # Create loss function
93
+ loss_fn = torch.nn.functional.cross_entropy
94
+
95
+ # Run forward and backward
96
+ steps = 2
97
+ for _ in range(steps):
98
+ if pp_size > 1:
99
+ # Create pipeline stage
100
+ stage = PipelineStage(
101
+ model,
102
+ pp_rank,
103
+ pp_size,
104
+ device,
105
+ group=pp_mesh.get_group(),
106
+ )
107
+
108
+ # Create pipeline schedule
109
+ losses = []
110
+ pp_schedule = Schedule1F1B(stage, microbatches, loss_fn=loss_fn)
111
+
112
+ if pp_rank == 0:
113
+ y = pp_schedule.step(x)
114
+ elif pp_rank == pp_size - 1:
115
+ y = pp_schedule.step(target=label, losses=losses)
116
+ loss = torch.mean(torch.stack(losses))
117
+ else:
118
+ pp_schedule.step()
119
+ else:
120
+ y = model(x)
121
+ loss = loss_fn(y, label)
122
+ loss.backward()
123
+
124
+ if pp_rank == pp_size - 1:
125
+ print(f"logits: {y.shape}")
126
+ print(f"{loss=}")
127
+
128
+ if pp_rank == 0:
129
+ param = model.get_parameter("model.layers.0.self_attn.q_proj.weight")
130
+ print(f"{torch.linalg.norm(param.grad)=}")
131
+
132
+ model.zero_grad()
133
+
134
+ print("Backward done")
135
+
136
+
137
+ if __name__ == "__main__":
138
+ mesh = dist.init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("pp", "ep", "fsdp"))
139
+
140
+ run_full_model(mesh)
141
+
142
+ dist.destroy_process_group()
torchtitan/experiments/flux/README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FLUX model in torchtitan
2
+
3
+ ## Overview
4
+
5
+ ## Usage
6
+ First, download the autoencoder model from HuggingFace with your own access token:
7
+ ```bash
8
+ python torchtitan/experiments/flux/scripts/download_autoencoder.py --repo_id black-forest-labs/FLUX.1-dev --ae_path ae.safetensors --hf_token <your_access_token>
9
+ ```
10
+ This step will download the autoencoder model from HuggingFace and save it to the `torchtitan/experiments/flux/assets/autoencoder/ae.safetensors` file.
11
+
12
+ Run the following command to train the model on a single GPU:
13
+ ```bash
14
+ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node=1 torchtitan/experiments/flux/train.py --job.config_file torchtitan/experiments/flux/train_configs/debug_model.toml
15
+ ```
16
+
17
+ ## TODO
18
+ - [ ] Support for multiple GPUs is coming soon (FSDP, etc)
19
+ - [ ] Implement test cases in CI for FLUX model. Adding more unit tests for FLUX model (eg, unit test for preprocessor, etc)
20
+ - [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc)
21
+ - [ ] Support for distributed checkpointing and loading
22
+ - [ ] Implement init_weights() function to initialize the model weights
23
+ - [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function
torchtitan/experiments/flux/dataset/flux_dataset.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import random
9
+ from dataclasses import dataclass
10
+ from typing import Any, Callable, Optional
11
+
12
+ import numpy as np
13
+
14
+ import torch
15
+
16
+ from datasets import Dataset, load_dataset
17
+ from datasets.distributed import split_dataset_by_node
18
+ from PIL import Image
19
+
20
+ from torch.distributed.checkpoint.stateful import Stateful
21
+
22
+ from torch.utils.data import IterableDataset
23
+ from torchtitan.components.dataloader import ParallelAwareDataloader
24
+
25
+ from torchtitan.config_manager import JobConfig
26
+ from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
27
+ from torchtitan.tools.logging import logger
28
+
29
+
30
+ def _process_cc12m_image(
31
+ img: Image.Image,
32
+ output_size: int = 256,
33
+ ) -> Optional[torch.Tensor]:
34
+ """Process CC12M image to the desired size."""
35
+
36
+ width, height = img.size
37
+ # Skip low resolution images
38
+ if width < output_size or height < output_size:
39
+ return None
40
+
41
+ if width >= height:
42
+ # resize height to be equal to output_size, then crop
43
+ new_width, new_height = math.ceil(output_size / height * width), output_size
44
+ img = img.resize((new_width, new_height))
45
+ left = random.randint(0, new_width - output_size)
46
+ resized_img = img.crop((left, 0, left + output_size, output_size))
47
+ else:
48
+ # resize width to be equal to output_size, the crop
49
+ new_width, new_height = (
50
+ output_size,
51
+ math.ceil(output_size / width * height),
52
+ )
53
+ img = img.resize((new_width, new_height))
54
+ lower = random.randint(0, new_width - output_size)
55
+ resized_img = img.crop((0, lower, output_size, lower + output_size))
56
+
57
+ assert resized_img.size[0] == resized_img.size[1] == output_size
58
+
59
+ # Skip grayscale images
60
+ if resized_img.mode == "L":
61
+ return None
62
+
63
+ np_img = np.array(resized_img).transpose((2, 0, 1))
64
+ tensor_img = torch.tensor(np_img).float() / 255.0
65
+
66
+ # NOTE: The following commented code is an alternative way
67
+ # img_transform = transforms.Compose(
68
+ # [
69
+ # transforms.Resize(max(output_size, output_size)),
70
+ # transforms.CenterCrop((output_size, output_size)),
71
+ # transforms.ToTensor(),
72
+ # ]
73
+ # )
74
+ # tensor_img = img_transform(img)
75
+
76
+ return tensor_img
77
+
78
+
79
+ def _flux_data_processor(
80
+ sample: dict[str, Any],
81
+ t5_tokenizer: FluxTokenizer,
82
+ clip_tokenizer: FluxTokenizer,
83
+ output_size: int = 256,
84
+ ) -> dict[str, Any]:
85
+ """
86
+ Preprocess CC12M dataset sample image and text for Flux model.
87
+
88
+ Args:
89
+ sample: A sample from dataset
90
+ t5_encoder: T5 encoder
91
+ clip_encoder: CLIP encoder
92
+ output_size: The output image size
93
+
94
+ """
95
+ img = _process_cc12m_image(sample["jpg"], output_size=output_size)
96
+ t5_tokens = t5_tokenizer.encode(sample["txt"])
97
+ clip_tokens = clip_tokenizer.encode(sample["txt"])
98
+
99
+ return {
100
+ "image": img,
101
+ "clip_tokens": clip_tokens, # type: List[int]
102
+ "t5_tokens": t5_tokens, # type: List[int]
103
+ }
104
+
105
+
106
+ @dataclass
107
+ class TextToImageDatasetConfig:
108
+ path: str
109
+ loader: Callable
110
+ data_processor: Callable
111
+
112
+
113
+ DATASETS = {
114
+ "cc12m": TextToImageDatasetConfig(
115
+ path="pixparse/cc12m-wds",
116
+ loader=lambda path: load_dataset(path, split="train", streaming=True),
117
+ data_processor=_flux_data_processor,
118
+ ),
119
+ }
120
+
121
+
122
+ def _validate_dataset(
123
+ dataset_name: str, dataset_path: Optional[str] = None
124
+ ) -> tuple[str, Callable, Callable]:
125
+ """Validate dataset name and path."""
126
+ if dataset_name not in DATASETS:
127
+ raise ValueError(
128
+ f"Dataset {dataset_name} is not supported. "
129
+ f"Supported datasets are: {list(DATASETS.keys())}"
130
+ )
131
+
132
+ config = DATASETS[dataset_name]
133
+ path = dataset_path or config.path
134
+ logger.info(f"Preparing {dataset_name} dataset from {path}")
135
+ return path, config.loader, config.data_processor
136
+
137
+
138
+ class FluxDataset(IterableDataset, Stateful):
139
+ """Dataset for FLUX text-to-image model.
140
+
141
+ Args:
142
+ dataset_name (str): Name of the dataset.
143
+ dataset_path (str): Path to the dataset.
144
+ model_transform (Transform): Callable that applies model-specific preprocessing to the sample.
145
+ dp_rank (int): Data parallel rank.
146
+ dp_world_size (int): Data parallel world size.
147
+ infinite (bool): Whether to loop over the dataset infinitely.
148
+ """
149
+
150
+ def __init__(
151
+ self,
152
+ dataset_name: str,
153
+ dataset_path: Optional[str],
154
+ t5_tokenizer: FluxTokenizer,
155
+ clip_tokenizer: FluxTokenizer,
156
+ job_config: Optional[JobConfig] = None,
157
+ dp_rank: int = 0,
158
+ dp_world_size: int = 1,
159
+ infinite: bool = False,
160
+ ) -> None:
161
+
162
+ # Force lowercase for consistent comparison
163
+ dataset_name = dataset_name.lower()
164
+
165
+ path, dataset_loader, data_processor = _validate_dataset(
166
+ dataset_name, dataset_path
167
+ )
168
+ ds = dataset_loader(path)
169
+
170
+ self.dataset_name = dataset_name
171
+ self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
172
+
173
+ self._t5_tokenizer = t5_tokenizer
174
+ self._clip_tokenizer = clip_tokenizer
175
+ self._data_processor = data_processor
176
+ self.job_config = job_config
177
+
178
+ self.infinite = infinite
179
+
180
+ # Variables for checkpointing
181
+ self._sample_idx = 0
182
+ self._all_samples: list[dict[str, Any]] = []
183
+
184
+ def _get_data_iter(self):
185
+ if isinstance(self._data, Dataset) and self._sample_idx == len(self._data):
186
+ return iter([])
187
+
188
+ it = iter(self._data)
189
+ for _ in range(self._sample_idx):
190
+ next(it)
191
+ return it
192
+
193
+ def __iter__(self):
194
+ while True:
195
+ for sample in self._get_data_iter():
196
+ # Use the dataset-specific preprocessor
197
+ sample_dict = self._data_processor(
198
+ sample, self._t5_tokenizer, self._clip_tokenizer, output_size=256
199
+ )
200
+
201
+ # skip low quality image or image with color channel = 1
202
+ if sample_dict["image"] is None:
203
+ logger.warning(
204
+ f"Low quality image {sample['__key__']} is skipped in Flux Dataloader"
205
+ )
206
+ continue
207
+
208
+ self._all_samples.extend(sample_dict)
209
+ self._sample_idx += 1
210
+
211
+ labels = sample_dict.pop("image")
212
+ yield sample_dict, labels
213
+
214
+ if not self.infinite:
215
+ logger.warning(f"Dataset {self.dataset_name} has run out of data")
216
+ break
217
+ else:
218
+ # Reset offset for the next iteration
219
+ self._sample_idx = 0
220
+ logger.warning(f"Dataset {self.dataset_name} is being re-looped")
221
+
222
+ def load_state_dict(self, state_dict):
223
+ self._sample_idx = state_dict["sample_idx"]
224
+ self._all_samples = state_dict["all_samples"]
225
+
226
+ def state_dict(self):
227
+ return {
228
+ "all_samples": self._all_samples,
229
+ "sample_idx": self._sample_idx,
230
+ }
231
+
232
+
233
+ def build_flux_dataloader(
234
+ dp_world_size: int,
235
+ dp_rank: int,
236
+ job_config: JobConfig,
237
+ # This parameter is not used, keep it for compatibility
238
+ tokenizer: FluxTokenizer | None,
239
+ infinite: bool = True,
240
+ ) -> ParallelAwareDataloader:
241
+ """Build a data loader for HuggingFace datasets."""
242
+ dataset_name = job_config.training.dataset
243
+ dataset_path = job_config.training.dataset_path
244
+ batch_size = job_config.training.batch_size
245
+
246
+ t5_encoder_name = job_config.encoder.t5_encoder
247
+ clip_encoder_name = job_config.encoder.clip_encoder
248
+ max_t5_encoding_len = job_config.encoder.max_t5_encoding_len
249
+
250
+ ds = FluxDataset(
251
+ dataset_name=dataset_name,
252
+ dataset_path=dataset_path,
253
+ t5_tokenizer=FluxTokenizer(t5_encoder_name, max_length=max_t5_encoding_len),
254
+ clip_tokenizer=FluxTokenizer(
255
+ clip_encoder_name, max_length=77
256
+ ), # fix max_length for CLIP
257
+ dp_rank=dp_rank,
258
+ dp_world_size=dp_world_size,
259
+ infinite=infinite,
260
+ )
261
+
262
+ return ParallelAwareDataloader(
263
+ dataset=ds,
264
+ dp_rank=dp_rank,
265
+ dp_world_size=dp_world_size,
266
+ batch_size=batch_size,
267
+ )
torchtitan/experiments/flux/dataset/tokenizer.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
8
+ # This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
9
+
10
+
11
+ from typing import List
12
+
13
+ from torchtitan.components.tokenizer import Tokenizer
14
+ from transformers import CLIPTokenizer, T5Tokenizer
15
+
16
+
17
class FluxTokenizer(Tokenizer):
    """
    Tokenizing and encoding/decoding text using the T5 or CLIP tokenizer.

    The backend is chosen from the model path: paths starting with "openai"
    load a ``CLIPTokenizer``; anything else loads a ``T5Tokenizer``.

    Args:
        model_path (str): Path to the tokenizer on Hugging Face.
        max_length (int): Fixed sequence length used for truncation/padding.
    """

    def __init__(self, model_path: str = "t5-small", max_length: int = 77):
        super().__init__()
        self._n_words = 8  # TODO(jianiw): check
        self._max_length = max_length

        # Heuristic: OpenAI-hosted paths (e.g. "openai/clip-vit-large-patch14")
        # are CLIP; everything else is treated as T5.
        self.is_clip = model_path.startswith("openai")

        if self.is_clip:
            self._tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
                model_path, max_length=max_length
            )
        else:
            self._tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
                model_path, max_length=max_length
            )

    def encode(
        self,
        s: str,
    ) -> List[int]:
        """
        Encode the prompt text into tokens.

        NOTE(review): with ``return_tensors="pt"`` below this actually returns
        a 2-D torch tensor of token ids, not a ``List[int]`` — confirm callers
        before changing the annotation. Output is always truncated/padded to
        ``self._max_length``.
        """
        tokens = self._tokenizer(
            s,
            truncation=True,
            max_length=self._max_length,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",  # return pytorch tensors, default return List[int]
        )["input_ids"]
        return tokens

    def decode(self, t: List[int]) -> str:
        """
        Decode token ids back into text. This function will not be called.
        """
        return self._tokenizer.decode(t)
torchtitan/experiments/flux/model/hf_embedder.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from torch import nn, Tensor
8
+ from transformers import CLIPTextModel, T5EncoderModel
9
+
10
+
11
class FluxEmbedder(nn.Module):
    """Frozen Hugging Face text encoder (CLIP or T5) used for conditioning.

    Versions starting with "openai" load a ``CLIPTextModel`` (pooled output);
    anything else loads a ``T5EncoderModel`` (per-token hidden states).
    """

    def __init__(self, version: str, **hf_kwargs):
        super().__init__()
        self.is_clip = version.startswith("openai")
        # CLIP yields one pooled vector per prompt; T5 yields the full
        # per-token hidden-state sequence.
        self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"

        if self.is_clip:
            self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
                version, **hf_kwargs
            )
        else:
            self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
                version, **hf_kwargs
            )

        # Encoder is inference-only: put it in eval mode and freeze all params.
        self.hf_module = self.hf_module.eval().requires_grad_(False)

    def forward(self, batch_tokens: Tensor) -> Tensor:
        """
        batch_tokens: [bsz, embedding_length]

        For T5 Encoder, embedding_length is 768
        For CLIP, embedding_length is 256
        """
        outputs = self.hf_module(
            input_ids=batch_tokens.to(self.hf_module.device),
            attention_mask=None,
            output_hidden_states=False,
        )
        return outputs[self.output_key]
torchtitan/experiments/flux/model/layers.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # imported from black-forest-labs/FLUX
8
+ import math
9
+ from dataclasses import dataclass
10
+
11
+ import torch
12
+ from einops import rearrange
13
+ from torch import nn, Tensor
14
+
15
+ from torchtitan.experiments.flux.model.math import attention, rope
16
+
17
+
18
class EmbedND(nn.Module):
    """Multi-axis rotary position embedding.

    Applies ``rope`` independently to each positional axis and concatenates
    the resulting per-axis frequency tensors.
    """

    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
        super().__init__()
        self.dim = dim  # total dim; not referenced in forward, kept for callers
        self.theta = theta  # rope base frequency parameter
        self.axes_dim = axes_dim  # per-axis rotary widths

    def forward(self, ids: Tensor) -> Tensor:
        # ids[..., i] holds the positions along axis i; each axis gets its
        # own rotary table of width axes_dim[i].
        n_axes = ids.shape[-1]
        emb = torch.cat(
            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
            dim=-3,
        )

        # Insert a broadcast (head) axis after batch.
        return emb.unsqueeze(1)
33
+
34
+
35
def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
    """
    Create sinusoidal timestep embeddings.
    :param t: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param time_factor: multiplier applied to ``t`` before embedding.
    :return: an (N, D) Tensor of positional embeddings.
    """
    scaled = time_factor * t
    half = dim // 2

    # Geometric frequency ladder from 1 down to 1/max_period.
    exponents = torch.arange(start=0, end=half, dtype=torch.float32) / half
    freqs = torch.exp(-math.log(max_period) * exponents).to(scaled.device)

    angles = scaled[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)

    # Odd output dims get a single zero column of padding.
    if dim % 2:
        pad = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, pad], dim=-1)

    # Match the caller's floating dtype (e.g. fp16/bf16 inputs).
    if torch.is_floating_point(t):
        embedding = embedding.to(t)
    return embedding
59
+
60
+
61
class MLPEmbedder(nn.Module):
    """Two-layer MLP (Linear -> SiLU -> Linear) mapping ``in_dim`` to
    ``hidden_dim``; the second layer keeps the hidden width."""

    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

    def forward(self, x: Tensor) -> Tensor:
        hidden = self.in_layer(x)
        activated = self.silu(hidden)
        return self.out_layer(activated)
70
+
71
+
72
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalization with a learnable per-channel scale.

    Normalization statistics are computed in float32 for stability, then the
    result is cast back to the input dtype before scaling.
    """

    def __init__(self, dim: int):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x: Tensor):
        orig_dtype = x.dtype
        x32 = x.float()
        # 1 / RMS over the last (channel) axis; eps guards against zeros.
        inv_rms = torch.rsqrt(x32.pow(2).mean(dim=-1, keepdim=True) + 1e-6)
        return (x32 * inv_rms).to(dtype=orig_dtype) * self.scale
82
+
83
+
84
class QKNorm(torch.nn.Module):
    """RMS-normalize attention queries and keys, then cast both to v's dtype."""

    def __init__(self, dim: int):
        super().__init__()
        self.query_norm = RMSNorm(dim)  # TODO(jianiw): switch to pytorch nn.RMSNorm
        self.key_norm = RMSNorm(dim)

    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
        # v itself is untouched; it only supplies the target dtype/device.
        normed_q = self.query_norm(q).to(v)
        normed_k = self.key_norm(k).to(v)
        return normed_q, normed_k
94
+
95
+
96
class SelfAttention(nn.Module):
    """Multi-head self-attention with RMS-normalized Q/K and a fused QKV
    projection; position information arrives via ``pe`` (see ``attention``)."""

    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads

        # Single matmul producing Q, K and V concatenated along the last axis.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.norm = QKNorm(head_dim)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: Tensor, pe: Tensor) -> Tensor:
        # x: (B, L, dim); pe is the position embedding forwarded to `attention`.
        qkv = self.qkv(x)
        # Unpack the fused projection into per-head tensors: (3, B, H, L, D).
        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)
        x = attention(q, k, v, pe=pe)
        x = self.proj(x)
        return x
113
+
114
+
115
+ @dataclass
116
+ class ModulationOut:
117
+ shift: Tensor
118
+ scale: Tensor
119
+ gate: Tensor
120
+
121
+
122
+ class Modulation(nn.Module):
123
+ def __init__(self, dim: int, double: bool):
124
+ super().__init__()
125
+ self.is_double = double
126
+ self.multiplier = 6 if double else 3
127
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
128
+
129
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
130
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(
131
+ self.multiplier, dim=-1
132
+ )
133
+
134
+ return (
135
+ ModulationOut(*out[:3]),
136
+ ModulationOut(*out[3:]) if self.is_double else None,
137
+ )
138
+
139
+
140
class DoubleStreamBlock(nn.Module):
    """Dual-stream transformer block over image and text tokens.

    Each stream has its own modulation, attention projections and MLP, but the
    two streams attend jointly: their Q/K/V are concatenated along the sequence
    axis (text first) before a single ``attention`` call, then split back.
    """

    def __init__(
        self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False
    ):
        super().__init__()

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        # --- image-stream parameters ---
        self.img_mod = Modulation(hidden_size, double=True)
        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_attn = SelfAttention(
            dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
        )

        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.img_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

        # --- text-stream parameters (mirror of the image stream) ---
        self.txt_mod = Modulation(hidden_size, double=True)
        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_attn = SelfAttention(
            dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
        )

        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.txt_mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
            nn.GELU(approximate="tanh"),
            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
        )

    def forward(
        self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor
    ) -> tuple[Tensor, Tensor]:
        # Each stream gets two modulation triples: mod1 for the attention
        # branch, mod2 for the MLP branch.
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = rearrange(
            img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
        )
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = rearrange(
            txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
        )
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

        # run actual attention over the joint sequence (text tokens first)
        q = torch.cat((txt_q, img_q), dim=2)
        k = torch.cat((txt_k, img_k), dim=2)
        v = torch.cat((txt_v, img_v), dim=2)

        attn = attention(q, k, v, pe=pe)
        # Split the joint result back into text and image portions.
        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]

        # calculate the img blocks (gated residual updates)
        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
        img = img + img_mod2.gate * self.img_mlp(
            (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
        )

        # calculate the txt blocks (gated residual updates)
        txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
        txt = txt + txt_mod2.gate * self.txt_mlp(
            (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
        )
        return img, txt
219
+
220
+
221
class SingleStreamBlock(nn.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.

    A single fused linear produces both the QKV projection and the MLP input;
    a second fused linear consumes the concatenated attention output and
    activated MLP stream.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float | None = None,
    ):
        super().__init__()
        self.hidden_dim = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        # NOTE(review): self.scale is computed but not referenced in forward
        # here — presumably consumed inside `attention`; confirm.
        self.scale = qk_scale or head_dim**-0.5

        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        # qkv and mlp_in
        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
        # proj and mlp_out
        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)

        self.norm = QKNorm(head_dim)

        self.hidden_size = hidden_size
        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)

        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False)

    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
        # Single (shift, scale, gate) triple; the second slot is always None.
        mod, _ = self.modulation(vec)
        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
        # One matmul feeds both the attention (qkv) and MLP (mlp) branches.
        qkv, mlp = torch.split(
            self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1
        )

        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
        q, k = self.norm(q, k, v)

        # compute attention
        attn = attention(q, k, v, pe=pe)
        # compute activation in mlp stream, cat again and run second linear layer
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
        # Gated residual connection.
        return x + mod.gate * output
269
+
270
+
271
class LastLayer(nn.Module):
    """Final adaLN-modulated projection from hidden states to patch outputs
    of size ``patch_size * patch_size * out_channels``."""

    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(
            hidden_size, patch_size * patch_size * out_channels, bias=True
        )
        # Maps the conditioning vector to a (shift, scale) pair.
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
        # [:, None, :] broadcasts the per-sample modulation over the sequence.
        modulated = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        return self.linear(modulated)
torchtitan/experiments/flux/tests/test_flux_dataloader.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+ from torchtitan.config_manager import JobConfig
10
+ from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
11
+ from torchtitan.tools.profiling import (
12
+ maybe_enable_memory_snapshot,
13
+ maybe_enable_profiling,
14
+ )
15
+
16
+
17
class TestFluxDataLoader:
    """Smoke tests for the Flux dataloader (optionally run under profilers)."""

    def test_flux_dataloader(self):
        # Fixed test configuration.
        dataset_name = "cc12m"
        batch_size = 32
        world_size = 4
        rank = 0

        num_steps = 10

        # Register flux-specific CLI args before building the job config.
        path = "torchtitan.experiments.flux.flux_argparser"
        sys.argv.append(f"--experimental.custom_args_module={path}")
        config = JobConfig()
        config.maybe_add_custom_args()
        config.parse_args(
            [
                # Profiling options
                # "--profiling.enable_profiling",
                # "--profiling.profile_freq",
                # "5",
                # "--profiling.enable_memory_snapshot",
                # "--profiling.save_memory_snapshot_folder",
                # "memory_snapshot_flux",
                "--training.dataset",
                dataset_name,
                "--training.batch_size",
                str(batch_size),
                "--encoder.t5_encoder",
                "google/t5-v1_1-small",
                "--encoder.clip_encoder",
                "openai/clip-vit-large-patch14",
                "--encoder.max_t5_encoding_len",
                "512",
            ]
        )

        # Both profilers are no-ops unless enabled via the flags above.
        with maybe_enable_profiling(
            config, global_step=0
        ) as torch_profiler, maybe_enable_memory_snapshot(
            config, global_step=0
        ) as memory_profiler:
            dl = self._build_dataloader(
                config,
                world_size,
                rank,
            )
            dl = iter(dl)

            for i in range(0, num_steps):
                input_data, labels = next(dl)
                print(f"Step {i} image size: {labels.shape}")
                if torch_profiler:
                    torch_profiler.step()
                if memory_profiler:
                    memory_profiler.step()

                print(len(input_data["clip_tokens"]))
                for k, v in input_data.items():
                    print(f"Step {i} {k} value: {type(v), v.shape}")

                assert len(input_data) == 2  # (clip_encodings, t5_encodings)
                assert labels.shape == (batch_size, 3, 256, 256)
                # assert input_data["clip_tokens"].shape[0] == batch_size
                # assert input_data["t5_tokens"].shape == (batch_size, 512, 512)

            # Final profiler ticks after the sampling loop.
            if torch_profiler:
                torch_profiler.step()
            if memory_profiler:
                memory_profiler.step(exit_ctx=True)

    def test_preprocess(self):
        # TODO
        pass

    def _build_dataloader(
        self,
        job_config,
        world_size,
        rank,
    ):
        # The tokenizer arg is unused by build_flux_dataloader; pass None.
        return build_flux_dataloader(
            dp_world_size=world_size,
            dp_rank=rank,
            job_config=job_config,
            tokenizer=None,
            infinite=False,
        )
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/fast_debug_ao.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import logging
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ from reference_utils import (
14
+ analyze_tensor_differences,
15
+ compute_reference_backward,
16
+ compute_reference_forward,
17
+ )
18
+
19
+ # Configure logging
20
+ logging.basicConfig(
21
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
22
+ )
23
+
24
+ # Import grouped GEMM implementations
25
+ try:
26
+ from mg_grouped_gemm import grouped_gemm_backward, grouped_gemm_forward
27
+
28
+ except ImportError:
29
+ logging.error(
30
+ "Error importing grouped GEMM modules. Make sure the implementation files are in the correct path."
31
+ )
32
+ raise
33
+
34
+
35
def test_forward_pass():
    """
    A simple test for the M*G grouped GEMM forward pass with detailed error handling.

    In M*G grouping:
    - M dimension is partitioned into G groups (M_total = sum(M_sizes))
    - N dimension is the same for all groups

    Returns True when the Triton result matches the PyTorch reference.
    """
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Test parameters for DeepSeek-like models
        G = 1  # Number of groups
        M_sizes = [
            2048,
        ]  # 2048, 2048, 2048] # Group sizes (will be adjusted)
        M_total = sum(M_sizes)  # Total M dimension
        N = 4096  # Output dimension (same for all groups)
        K = 7168  # Hidden dimension

        # Create group sizes tensor
        m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)

        # Create input and weight tensors - using float16 for higher precision
        x = torch.randn(M_total, K, dtype=torch.float16, device=device)
        w = torch.randn(N, K, dtype=torch.float16, device=device)

        # Log the setup
        logging.info(f"Test setup - G: {G}, M_total: {M_total}, N: {N}, K: {K}")
        logging.info(f"Group sizes: {m_sizes}")
        logging.info(f"Input x shape: {x.shape}")
        logging.info(f"Weight w shape: {w.shape}")

        # Run forward pass
        logging.info("Running forward pass with grouped GEMM")
        result = grouped_gemm_forward(x, w, m_sizes)
        logging.info(f"Forward result shape: {result.shape}")

        # Compute reference result
        logging.info("Computing reference result with PyTorch")
        reference_result = compute_reference_forward(x, w, m_sizes)

        # Compare results
        logging.info("Comparing with PyTorch reference")
        forward_close = analyze_tensor_differences(
            result, reference_result, "Forward output"
        )

        return forward_close

    except Exception as e:
        # Broad catch is deliberate: this is a standalone harness and any
        # failure should be logged with a traceback rather than crash the run.
        logging.error(f"Test failed with error: {e}")
        import traceback

        logging.error(traceback.format_exc())
        return False
91
+
92
+
93
def test_backward_pass():
    """
    A simple test for the M*G grouped GEMM backward pass with detailed error handling.

    In M*G grouping:
    - M dimension is partitioned into G groups (M_total = sum(M_sizes))
    - N dimension is the same for all groups

    Returns True when both grad_x and grad_w match the PyTorch reference.
    """
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Test parameters for DeepSeek-like models
        G = 4  # Number of groups
        M_sizes = [2048, 2048, 2048, 2048]  # Group sizes (will be adjusted)
        M_total = sum(M_sizes)  # Total M dimension
        N = 4096  # Output dimension (same for all groups)
        K = 7168  # Hidden dimension

        # Create group sizes tensor
        m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)

        # Create input and weight tensors - using float16 for higher precision
        x = torch.randn(
            M_total, K, dtype=torch.float16, device=device, requires_grad=True
        )
        w = torch.randn(N, K, dtype=torch.float16, device=device, requires_grad=True)

        # Log the setup
        logging.info(f"Test setup - G: {G}, M_total: {M_total}, N: {N}, K: {K}")
        logging.info(f"Group sizes: {m_sizes}")
        logging.info(f"Input x shape: {x.shape}")
        logging.info(f"Weight w shape: {w.shape}")

        # Step 1: Run forward pass
        logging.info("Running forward pass")
        result = grouped_gemm_forward(x, w, m_sizes)
        logging.info(f"Forward result shape: {result.shape}")

        # Create a gradient for backpropagation
        grad_output = torch.randn_like(result)
        logging.info(f"Created gradient with shape: {grad_output.shape}")

        # Step 2: Run backward pass directly (not via autograd)
        logging.info("Running backward pass directly")
        grad_x, grad_w = grouped_gemm_backward(grad_output, x, w, m_sizes)

        # Verify gradient shapes
        logging.info(
            f"Gradient shapes - grad_x: {grad_x.shape}, grad_w: {grad_w.shape}"
        )

        # Step 3: Verify gradient computation using PyTorch's autograd
        logging.info("Running PyTorch reference implementation")

        # Compute reference gradients
        x_ref_grad, w_ref_grad = compute_reference_backward(x, w, m_sizes, grad_output)

        # Compare gradients
        logging.info("Comparing gradients with PyTorch reference")
        grad_x_close = analyze_tensor_differences(grad_x, x_ref_grad, "grad_x")
        grad_w_close = analyze_tensor_differences(grad_w, w_ref_grad, "grad_w")

        # Log overall result
        if grad_x_close and grad_w_close:
            logging.info("✓ SUCCESS: Gradients match the PyTorch reference")
        else:
            logging.error("✗ FAILURE: Gradient mismatch detected")

        return grad_x_close and grad_w_close

    except Exception as e:
        # Broad catch is deliberate: log with traceback, report failure.
        logging.error(f"Test failed with error: {e}")
        import traceback

        logging.error(traceback.format_exc())
        return False
169
+
170
+
171
def test_multiple_deepseek_configs():
    """
    Test multiple DeepSeek model configurations with both forward and backward pass verification.

    Returns True only if every configuration passes both its forward and
    backward comparisons against the PyTorch reference.
    """
    # DeepSeek configurations: (G, M, K, N)
    configs = [
        (4, 8192, 7168, 4096),  # Config 1
        (4, 8192, 2048, 7168),  # Config 2
        (8, 4096, 7168, 4096),  # Config 3
        (8, 4096, 2048, 7168),  # Config 4
    ]

    # Each entry: (config number, overall ok, forward ok, backward ok).
    results = []

    for config_idx, (G, M, K, N) in enumerate(configs):
        logging.info(f"\n\n===== Testing DeepSeek Config {config_idx+1} =====")
        logging.info(f"G={G}, M={M}, K={K}, N={N}")

        try:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

            # Create even group sizes; spread any remainder over the first groups.
            base_size = M // G
            remainder = M % G
            M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
            m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)

            # Create input and weight tensors using float16 for higher precision
            x = torch.randn(
                M, K, dtype=torch.float16, device=device, requires_grad=True
            )
            w = torch.randn(
                N, K, dtype=torch.float16, device=device, requires_grad=True
            )

            logging.info(f"Input x shape: {x.shape}, Weight w shape: {w.shape}")

            # Run forward pass
            result = grouped_gemm_forward(x, w, m_sizes)
            logging.info(f"Forward result shape: {result.shape}")

            # ===== FORWARD PASS VERIFICATION =====
            # Compute reference forward result
            reference_result = compute_reference_forward(x, w, m_sizes)

            # Compare forward results
            forward_close = analyze_tensor_differences(
                result, reference_result, "Forward output"
            )

            # ===== BACKWARD PASS VERIFICATION =====
            # Create gradient for backpropagation
            grad_output = torch.randn_like(result)

            # Run backward pass
            grad_x, grad_w = grouped_gemm_backward(grad_output, x, w, m_sizes)

            # Compute reference gradients
            x_ref_grad, w_ref_grad = compute_reference_backward(
                x, w, m_sizes, grad_output
            )

            # Compare backward results
            grad_x_close = analyze_tensor_differences(grad_x, x_ref_grad, "grad_x")
            grad_w_close = analyze_tensor_differences(grad_w, w_ref_grad, "grad_w")

            # Overall config result
            backward_close = grad_x_close and grad_w_close
            config_success = forward_close and backward_close
            results.append(
                (config_idx + 1, config_success, forward_close, backward_close)
            )

            # Log overall config result
            if config_success:
                logging.info(f"✓ SUCCESS: Config {config_idx+1} passed all tests!")
            else:
                logging.error(
                    f"✗ FAILURE: Config {config_idx+1} failed one or more tests"
                )

        except Exception as e:
            # A failing config is recorded rather than aborting the sweep.
            logging.error(f"Config {config_idx+1} test failed with error: {e}")
            import traceback

            logging.error(traceback.format_exc())
            results.append((config_idx + 1, False, False, False))

    # Summary
    logging.info("\n===== Test Results Summary =====")
    for config_idx, overall_success, forward_success, backward_success in results:
        overall_status = "✓ PASSED" if overall_success else "✗ FAILED"
        forward_status = "✓ PASSED" if forward_success else "✗ FAILED"
        backward_status = "✓ PASSED" if backward_success else "✗ FAILED"

        logging.info(f"Config {config_idx}: {overall_status}")
        logging.info(f" - Forward pass: {forward_status}")
        logging.info(f" - Backward pass: {backward_status}")

    return all(overall_success for _, overall_success, _, _ in results)
271
+
272
+
273
if __name__ == "__main__":
    # Script entry point: run the basic forward test, the basic backward
    # test, then the full DeepSeek-config sweep, and report a combined result.
    logging.info(
        "Running verification for both forward and backward pass of M*G grouped GEMM"
    )

    # Run basic forward pass test
    logging.info("\n===== Running basic forward pass test =====")
    success_forward = test_forward_pass()
    logging.info(f"Basic forward test {'succeeded' if success_forward else 'failed'}")

    # Run basic backward pass test
    logging.info("\n===== Running basic backward pass test =====")
    success_backward = test_backward_pass()
    logging.info(f"Basic backward test {'succeeded' if success_backward else 'failed'}")

    # Run multiple DeepSeek configs with forward and backward verification
    logging.info("\n===== Running tests for all DeepSeek configs =====")
    success_configs = test_multiple_deepseek_configs()
    logging.info(
        f"DeepSeek configs tests {'all succeeded' if success_configs else 'had failures'}"
    )

    # Overall result
    overall_success = success_forward and success_backward and success_configs
    logging.info(
        f"\nOverall test result: {'SUCCESS' if overall_success else 'FAILURE'}"
    )
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/mg_grouped_gemm.py ADDED
@@ -0,0 +1,1304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # credit - flat index forward kernel is derived from FBGemm:
8
+ # https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gemm/triton_gemm
9
+
10
+ # pyre-unsafe
11
+ import functools
12
+ import logging
13
+
14
+ import os
15
+ import sys
16
+ from typing import Any, Dict, Optional, Tuple
17
+
18
+ import torch
19
+
20
+ import triton
21
+ import triton.language as tl
22
+ from triton import Config as TConfig
23
+
24
+ from triton.runtime import driver # @manual
25
+
26
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
27
+
28
+ from tma_autotuning import (
29
+ ALIGN_SIZE_M,
30
+ _NV_CONFIGS,
31
+ CudaUtils,
32
+ early_config_prune,
33
+ TmaDescriptorHelper,
34
+ )
35
+
36
+
37
+ # Configure logging
38
+ logging.basicConfig(
39
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
40
+ )
41
+
42
+ # ============== Start Triton Kernels ===============
43
+
44
+
45
@triton.autotune(
    configs=_NV_CONFIGS,
    key=["G", "M_BUCKET", "N", "K"],
    prune_configs_by={"early_config_prune": early_config_prune},
)
@triton.jit
def _kernel_mg_forward_hopper(
    a_desc_ptr,  # TMA descriptor for inputs x, logically [M_total, K]
    b_desc_ptr,  # TMA descriptor for stacked weights w, logically [N, K]
    c_ptr,  # output tensor; written as [M_total, N // G]
    workspace,  # per-SM scratch holding on-device TMA store descriptors
    m_sizes,  # int tensor [G] of per-group row counts (M*G grouping)
    # problem sizes
    G: tl.constexpr,
    M_BUCKET: tl.constexpr,  # autotune key only (power-of-2 bucket of M_total); not read in the body
    N: tl.constexpr,
    K: tl.constexpr,
    # config
    NUM_SMS: tl.constexpr,
    TMA_SIZE: tl.constexpr,  # bytes of one TMA descriptor inside `workspace`
    USE_EPILOGUE_SUBTILING: tl.constexpr,  # split the epilogue into two half-width TMA stores
    # tiles
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
) -> None:
    """
    Flat index style forward kernel for Hopper.
    For simplicity, we always use TMA Load and TMA Store.

    Persistent-kernel scheme: each program walks the flat tile index space
    (tbidx, tbidx + NUM_SMS, ...) across all groups.  Group g multiplies its
    rows only against its own weight slice of n_size = N // G rows, i.e.
    y_g = x_g @ w_g.T, producing an output of shape [M_total, N // G].
    """
    tbidx = tl.program_id(0)  # thread block index

    c_dtype = c_ptr.dtype.element_ty  # output dtype

    c_desc_ptr = workspace + (tbidx * TMA_SIZE)  # for TMA Store

    M_end = 0
    M_start = 0
    processed_tiles = 0
    # Size of individual weight matrix (per-group slice of the stacked weights)
    n_size = N // G
    n_start = 0

    for g in range(G):
        # Move down along groups
        # reset to new M offset
        M_start = M_end
        m_size = tl.load(m_sizes + g)
        M_end = M_start + m_size
        n_start = n_size * g

        if m_size > 0:
            # Process this group

            # Acquire hold on c_desc_ptr for TMA Store.
            # The store descriptor is rebuilt per group because each group's
            # output sub-matrix has its own base address and row extent.
            tl.extra.cuda.experimental_device_tensormap_create2d(
                desc_ptr=c_desc_ptr,
                global_address=c_ptr + M_start * n_size,
                load_size=[BLOCK_SIZE_M, BLOCK_SIZE_N],
                global_size=[m_size, n_size],
                element_ty=c_dtype,
            )
            tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(c_desc_ptr)

            # tiles for this group
            num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
            num_n_tiles = tl.cdiv(n_size, BLOCK_SIZE_N)
            group_num_tiles = num_m_tiles * num_n_tiles

            # Claim every flat tile index that falls inside this group's range.
            while tbidx >= processed_tiles and tbidx < (
                processed_tiles + group_num_tiles
            ):
                group_index = tbidx - processed_tiles

                # columnwise tile ordering (M varies fastest)
                tile_m_index = group_index % num_m_tiles
                tile_n_index = group_index // num_m_tiles

                accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

                m_offset = (M_start + (tile_m_index * BLOCK_SIZE_M)).to(tl.int32)
                n_offset = (tile_n_index * BLOCK_SIZE_N).to(tl.int32)
                # weight rows live at this group's slice of the stacked [N, K] tensor
                global_n_offset = (n_start + n_offset).to(tl.int32)

                for k_offset in range(0, K, BLOCK_SIZE_K):
                    # input block [M,K]
                    a = tl._experimental_descriptor_load(
                        a_desc_ptr,
                        [m_offset, k_offset],
                        [BLOCK_SIZE_M, BLOCK_SIZE_K],
                        c_dtype,
                    )
                    # weight block [N, K]
                    b = tl._experimental_descriptor_load(
                        b_desc_ptr,
                        [global_n_offset, k_offset],
                        [BLOCK_SIZE_N, BLOCK_SIZE_K],
                        c_dtype,
                    )

                    # y += a @ b.T, accumulated in fp32
                    accumulator += tl.dot(a, b.T)

                # Store using TMA

                # store offset is relative to this group's descriptor base
                m_offset = (tile_m_index * BLOCK_SIZE_M).to(tl.int32)

                if USE_EPILOGUE_SUBTILING:
                    # split the accumulator in half along N and issue two
                    # narrower stores (can overlap epilogue with compute)
                    acc = tl.reshape(accumulator, (BLOCK_SIZE_M, 2, BLOCK_SIZE_N // 2))
                    acc = tl.permute(acc, (0, 2, 1))
                    acc0, acc1 = tl.split(acc)
                    c0 = acc0.to(c_dtype)
                    tl._experimental_descriptor_store(
                        c_desc_ptr, c0, [m_offset, n_offset]
                    )
                    c1 = acc1.to(c_dtype)
                    tl._experimental_descriptor_store(
                        c_desc_ptr, c1, [m_offset, n_offset + BLOCK_SIZE_N // 2]
                    )
                else:
                    tl._experimental_descriptor_store(
                        c_desc_ptr,
                        accumulator.to(c_dtype),
                        [m_offset, n_offset],
                    )
                # move to next tile in group
                tbidx += NUM_SMS
            # Update the total tiles count for the next group
            processed_tiles += group_num_tiles
173
+
174
+
175
@triton.autotune(
    configs=_NV_CONFIGS,
    key=["G", "M_BUCKET", "N", "K"],
    prune_configs_by={"early_config_prune": early_config_prune},
)
@triton.jit
def _kernel_mg_forward_tma(
    a_desc_ptr,  # TMA descriptor for inputs x, logically [M_total, K]
    b_desc_ptr,  # TMA descriptor for weights w, logically [N, K]
    c_ptr,  # output tensor [M_total, N]
    workspace,  # per-SM scratch holding on-device TMA store descriptors
    m_sizes,  # int tensor [G] of per-group row counts
    a_scale_ptr,  # FP8 scale for a — currently unused in this kernel body
    b_scale_ptr,  # FP8 scale for b — currently unused in this kernel body
    # problem sizes
    G: tl.constexpr,
    M_BUCKET: tl.constexpr,  # autotune key only; not read in the body
    N: tl.constexpr,
    K: tl.constexpr,
    # config
    NUM_SMS: tl.constexpr,
    USE_TMA_LOAD: tl.constexpr,  # currently unused — loads always go through TMA here
    USE_TMA_STORE: tl.constexpr,  # currently unused — stores always go through TMA here
    TMA_SIZE: tl.constexpr,  # bytes of one TMA descriptor inside `workspace`
    USE_FP8: tl.constexpr,  # currently unused in this kernel body
    # tiles
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
) -> None:
    """
    Flat index style forward kernel.
    For simplicity, we always use TMA Load and TMA Store.

    Unlike the Hopper variant above, every group multiplies against the FULL
    weight matrix (n_size = N), i.e. y_g = x_g @ w.T, output [M_total, N].
    """
    tbidx = tl.program_id(0)  # thread block index

    c_dtype = c_ptr.dtype.element_ty

    c_desc_ptr = workspace + (tbidx * TMA_SIZE)

    M_end = 0
    processed_tiles = 0

    for g in range(G):
        # Move down along groups
        # reset to new M offset
        M_start = M_end
        m_size = tl.load(m_sizes + g)
        M_end = M_start + m_size

        if m_size > 0:
            # Process this group
            n_size = N

            # TMA Store prep — descriptor rebuilt per group since each group's
            # output sub-matrix has its own base address and row extent
            tl.extra.cuda.experimental_device_tensormap_create2d(
                desc_ptr=c_desc_ptr,
                global_address=c_ptr + M_start * N,
                load_size=[BLOCK_SIZE_M, BLOCK_SIZE_N],
                global_size=[m_size, n_size],
                element_ty=c_dtype,
            )
            tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(c_desc_ptr)

            # tiles for this group
            num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
            num_n_tiles = tl.cdiv(n_size, BLOCK_SIZE_N)
            group_num_tiles = num_m_tiles * num_n_tiles

            # Claim every flat tile index that falls inside this group's range.
            while tbidx >= processed_tiles and tbidx < (
                processed_tiles + group_num_tiles
            ):
                group_index = tbidx - processed_tiles

                tile_m_index = group_index % num_m_tiles
                tile_n_index = group_index // num_m_tiles

                accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

                m_offset = (M_start + (tile_m_index * BLOCK_SIZE_M)).to(tl.int32)
                n_offset = (tile_n_index * BLOCK_SIZE_N).to(tl.int32)

                for k_offset in range(0, K, BLOCK_SIZE_K):
                    # input block [M,K]
                    a = tl._experimental_descriptor_load(
                        a_desc_ptr,
                        [m_offset, k_offset],
                        [BLOCK_SIZE_M, BLOCK_SIZE_K],
                        c_dtype,
                    )
                    # weight block [N, K]
                    b = tl._experimental_descriptor_load(
                        b_desc_ptr,
                        [n_offset, k_offset],
                        [BLOCK_SIZE_N, BLOCK_SIZE_K],
                        c_dtype,
                    )

                    # y += a @ b.T, accumulated in fp32
                    accumulator += tl.dot(a, b.T)

                # Store using TMA

                # store row offset is relative to this group's descriptor base
                m_offset = (tile_m_index * BLOCK_SIZE_M).to(tl.int32)
                # n_offset = (tile_n_index * BLOCK_SIZE_N).to(tl.int32)

                tl._experimental_descriptor_store(
                    c_desc_ptr,
                    accumulator.to(c_dtype),
                    [m_offset, n_offset],
                )

                # Move to the next tile
                tbidx += NUM_SMS
            # Update the total tiles count for the next group
            processed_tiles += group_num_tiles
290
+
291
+
292
@triton.autotune(
    configs=_NV_CONFIGS,
    key=["G", "M_BUCKET", "N", "K"],
    prune_configs_by={"early_config_prune": early_config_prune},
)
@triton.jit
def _kernel_mg_forward_no_tma(
    a_ptr,  # inputs, [M_total, K] row-major
    b_ptr,  # weights, [N, K] row-major
    c_ptr,  # output, [M_total, N] row-major
    workspace,  # unused in the non-TMA path (kept for signature parity)
    m_sizes,  # int tensor [G] of per-group row counts
    # problem sizes
    G: tl.constexpr,
    M_BUCKET: tl.constexpr,  # autotune key only; not read in the body
    N: tl.constexpr,
    K: tl.constexpr,
    # config
    NUM_SMS: tl.constexpr,
    USE_TMA_LOAD: tl.constexpr,  # unused — this kernel never uses TMA
    USE_TMA_STORE: tl.constexpr,  # unused — this kernel never uses TMA
    TMA_SIZE: tl.constexpr,  # unused — this kernel never uses TMA
    # tiles
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
) -> None:
    """
    Flat index style forward kernel.
    For bc and Ampere, we never use TMA Load and TMA Store.

    Computes y_g = x_g @ w.T per group (n_size = N) with masked global
    loads/stores instead of TMA descriptors.

    Fixes vs. the original:
    - the store mask combined the two boundary conditions with Python `and`,
      which is invalid on 2-D Triton tensors; replaced with bitwise `&`.
    - masked loads now pass `other=0.0` so out-of-bounds lanes contribute
      zeros to `tl.dot` instead of undefined values.
    - loads are additionally masked along K, so a K that is not a multiple
      of BLOCK_SIZE_K no longer reads out of bounds.
    """
    tbidx = tl.program_id(0)  # thread block index

    c_dtype = c_ptr.dtype.element_ty

    M_end = 0
    processed_tiles = 0

    for g in range(G):
        # Move down along groups: reset to this group's M offset
        M_start = M_end
        m_size = tl.load(m_sizes + g)
        M_end = M_start + m_size

        if m_size > 0:
            # Process this group
            n_size = N

            # tiles for this group
            num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
            num_n_tiles = tl.cdiv(n_size, BLOCK_SIZE_N)
            group_num_tiles = num_m_tiles * num_n_tiles

            # Claim every flat tile index that falls inside this group's range.
            while tbidx >= processed_tiles and tbidx < (
                processed_tiles + group_num_tiles
            ):
                group_index = tbidx - processed_tiles

                tile_m_index = group_index % num_m_tiles
                tile_n_index = group_index // num_m_tiles

                accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)

                offs_am = tile_m_index * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
                offs_bn = tile_n_index * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
                offs_k = tl.arange(0, BLOCK_SIZE_K)

                # row-major addressing: a is [M_total, K], b is [N, K]
                a_ptrs = a_ptr + (M_start + offs_am[:, None]) * K + offs_k[None, :]
                b_ptrs = b_ptr + (offs_bn[:, None]) * K + offs_k[None, :]

                for k_offset in range(0, K, BLOCK_SIZE_K):
                    # Load with bounds checking on both M/N and K; masked-off
                    # lanes must be zero so they do not pollute the dot product.
                    k_mask = (k_offset + offs_k[None, :]) < K
                    a = tl.load(
                        a_ptrs,
                        mask=(offs_am[:, None] < m_size) & k_mask,
                        other=0.0,
                    )
                    b = tl.load(
                        b_ptrs,
                        mask=(offs_bn[:, None] < n_size) & k_mask,
                        other=0.0,
                    )

                    # Main matmul: y += a @ b.T, accumulated in fp32
                    accumulator += tl.dot(a, b.T)

                    # Update pointers for next K block
                    a_ptrs += BLOCK_SIZE_K
                    b_ptrs += BLOCK_SIZE_K

                # Store without TMA
                offs_am = tile_m_index * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
                offs_bn = tile_n_index * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)

                c = accumulator.to(c_dtype)

                tl.store(
                    c_ptr
                    + (M_start + offs_am[:, None]) * N  # Row stride is N
                    + offs_bn[None, :],  # Column offset
                    c,
                    # bitwise & — Python `and` is not defined for tensor masks
                    mask=(offs_am[:, None] < m_size) & (offs_bn[None, :] < n_size),
                )
                # Move to the next tile
                tbidx += NUM_SMS
            # Update the total tiles count for the next group
            processed_tiles += group_num_tiles
396
+
397
+
398
+ """
399
+ Backward pass for grouped GEMM with Triton, where grouping is M*G
400
+ We compute gradients with respect to both input (`grad_x`) and weights (`grad_w`).
401
+ """
402
+
403
+
404
+ # ---- dx flat linear indexed ----
405
@triton.autotune(
    configs=_NV_CONFIGS,
    key=["G", "M_BUCKET", "N", "K"],
    prune_configs_by={"early_config_prune": early_config_prune},
)
@triton.jit
def _kernel_mg_dx_tma(
    grad_output_desc_ptr,  # TMA descriptor over grad_Y, logically [MG, N]
    w_desc_ptr,  # TMA descriptor over weights, logically [N, K]
    grad_input_ptr,  # output grad_x [MG, K]
    workspace,  # per-SM scratch holding on-device TMA store descriptors
    m_sizes,  # group sizes [G]
    # problem sizes
    G: tl.constexpr,
    M_BUCKET: tl.constexpr,  # autotune key only; not read in the body
    N: tl.constexpr,
    K: tl.constexpr,
    # config
    NUM_SMS: tl.constexpr,
    USE_TMA_LOAD: tl.constexpr,  # currently unused — loads always go through TMA here
    USE_TMA_STORE: tl.constexpr,  # currently unused — stores always go through TMA here
    TMA_SIZE: tl.constexpr,  # bytes of one TMA descriptor inside `workspace`
    # tiles
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
) -> None:
    """
    TMA-optimized kernel for computing gradients with respect to input (dx).
    For the forward pass Y = X @ W.T, the backward for input is:
        grad_X = grad_Y @ W

    This maps to [MG, N] @ [N, K] -> [MG, K]

    Key differences from forward:
    1. W is used directly and not transposed
    2. The reduction dimension is now N (not K)
    3. Output is [M, K] instead of [M, N]

    Uses the same persistent flat-tile-index scheme as the forward kernels.
    """
    tbidx = tl.program_id(0)  # thread block index

    c_dtype = grad_input_ptr.dtype.element_ty
    c_desc_ptr = workspace + (tbidx * TMA_SIZE)

    M_end = 0
    processed_tiles = 0

    for g in range(G):
        # Move down along groups - same as forward
        M_start = M_end
        m_size = tl.load(m_sizes + g)
        M_end = M_start + m_size

        if m_size > 0:
            # Process this group
            # tiles for this group - now producing [M, K] output
            num_m_tiles = tl.cdiv(m_size, BLOCK_SIZE_M)
            num_k_tiles = tl.cdiv(K, BLOCK_SIZE_K)
            group_num_tiles = num_m_tiles * num_k_tiles

            # TMA Store prep for [M, K] output — descriptor rebuilt per group
            # since each group's output sub-matrix has its own base and extent
            tl.extra.cuda.experimental_device_tensormap_create2d(
                desc_ptr=c_desc_ptr,
                global_address=grad_input_ptr + M_start * K,
                load_size=[BLOCK_SIZE_M, BLOCK_SIZE_K],
                global_size=[m_size, K],
                element_ty=c_dtype,
            )
            tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(c_desc_ptr)

            # Claim every flat tile index that falls inside this group's range.
            while tbidx >= processed_tiles and tbidx < (
                processed_tiles + group_num_tiles
            ):
                group_index = tbidx - processed_tiles

                # Different tiling scheme for [M, K] output
                tile_m_index = group_index % num_m_tiles
                tile_k_index = group_index // num_m_tiles

                # for grad_input block [M, K]
                accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32)

                # Position in full matrix
                m_offset = (M_start + (tile_m_index * BLOCK_SIZE_M)).to(tl.int32)
                k_offset = (tile_k_index * BLOCK_SIZE_K).to(tl.int32)

                # reduce along N dimension (instead of K in forward)
                for n_offset in range(0, N, BLOCK_SIZE_N):
                    # grad_output block [M, N]
                    grad_output = tl._experimental_descriptor_load(
                        grad_output_desc_ptr,
                        [m_offset, n_offset],
                        [BLOCK_SIZE_M, BLOCK_SIZE_N],
                        c_dtype,
                    )

                    # weight block [N, K] - no transpose needed
                    w = tl._experimental_descriptor_load(
                        w_desc_ptr,
                        [n_offset, k_offset],
                        [BLOCK_SIZE_N, BLOCK_SIZE_K],
                        c_dtype,
                    )

                    # grad_x = grad_output @ w
                    # reducing along N dimension
                    accumulator += tl.dot(grad_output, w)

                # Store using TMA
                # store row offset is relative to this group's descriptor base
                m_offset = (tile_m_index * BLOCK_SIZE_M).to(tl.int32)
                # k_offset = (tile_k_index * BLOCK_SIZE_K).to(tl.int32)

                tl._experimental_descriptor_store(
                    c_desc_ptr,
                    accumulator.to(c_dtype),
                    [m_offset, k_offset],
                )

                # Move to the next tile
                tbidx += NUM_SMS

            # Update the total tiles count for the next group
            processed_tiles += group_num_tiles
528
+
529
+
530
+ # ---- dw flat linear indexed ----
531
+
532
+
533
@triton.autotune(
    configs=_NV_CONFIGS,
    key=["G", "M_BUCKET", "N", "K"],
    prune_configs_by={"early_config_prune": early_config_prune},
)
@triton.jit
def _kernel_mg_dw_tma(
    x_desc_ptr,  # input descriptor [M_total, K] (raw pointer when USE_TMA_LOAD is False)
    grad_output_desc_ptr,  # grad_output descriptor [M_total, N] (raw pointer when USE_TMA_LOAD is False)
    grad_weight_ptr,  # output grad_w [N, K]
    workspace,  # TMA store descriptor (or dummy buffer when USE_TMA_STORE is False)
    m_sizes,  # group sizes [G]
    # problem sizes
    G: tl.constexpr,
    M_BUCKET: tl.constexpr,  # autotune key only; not read in the body
    N: tl.constexpr,
    K: tl.constexpr,
    # config
    NUM_SMS: tl.constexpr,
    USE_TMA_LOAD: tl.constexpr,  # TMA vs. masked-pointer loads
    USE_TMA_STORE: tl.constexpr,  # TMA vs. masked-pointer store
    TMA_SIZE: tl.constexpr,  # currently unused in this kernel body
    # tiles
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,  # block size for reduction dimension
) -> None:
    """
    Improved TMA-optimized kernel for computing gradients with respect to weights (dw).
    Uses flat index structure similar to forward.

    For the forward pass Y = X @ W.T,
    the backward for weights is:
        grad_W = grad_Y.T @ X

    Where:
    - grad_Y is [MG, N]
    - X is [MG, K]
    - grad_W is [N, K]
    - we return [N,K]

    Unlike the dx kernel, every program owns whole output tiles of grad_W and
    reduces over ALL groups (the full M dimension) to produce them.
    """
    # Get thread block index
    tbidx = tl.program_id(0)

    # Get output data type
    c_dtype = grad_weight_ptr.dtype.element_ty

    # Calculate number of output tiles
    num_n_tiles = tl.cdiv(N, BLOCK_SIZE_N)
    num_k_tiles = tl.cdiv(K, BLOCK_SIZE_K)
    total_output_tiles = num_n_tiles * num_k_tiles

    # Process tiles in strided manner across SMs
    for tile_idx in range(tbidx, total_output_tiles, NUM_SMS):
        # Calculate tile indices
        tile_n_idx = tile_idx % num_n_tiles
        tile_k_idx = tile_idx // num_n_tiles

        # Calculate global offsets
        n_offset = tile_n_idx * BLOCK_SIZE_N
        k_offset = tile_k_idx * BLOCK_SIZE_K

        # Initialize accumulator for this output tile [N, K]
        accumulator = tl.zeros((BLOCK_SIZE_N, BLOCK_SIZE_K), dtype=tl.float32)

        # Process each group (grad_W sums contributions from every group's rows)
        M_end = 0
        for g in range(G):
            # Get group boundaries
            M_start = M_end
            m_size = tl.load(m_sizes + g)
            M_end = M_start + m_size

            # Only process if group is non-empty
            if m_size > 0:
                # Process this group in chunks along the M dimension
                for m_offset in range(0, m_size, BLOCK_SIZE_M):
                    # Calculate actual block size (handling boundary)
                    m_block_size = tl.minimum(BLOCK_SIZE_M, m_size - m_offset)

                    # Only process if we have actual work to do
                    if m_block_size > 0:
                        # Global offset for this chunk
                        m_global_offset = M_start + m_offset

                        if USE_TMA_LOAD:
                            # Load input chunk [M_chunk, K] using TMA
                            x_block = tl._experimental_descriptor_load(
                                x_desc_ptr,
                                [m_global_offset, k_offset],
                                [BLOCK_SIZE_M, BLOCK_SIZE_K],
                                c_dtype,
                            )

                            # Load grad_output chunk [M_chunk, N] using TMA
                            grad_output_block = tl._experimental_descriptor_load(
                                grad_output_desc_ptr,
                                [m_global_offset, n_offset],
                                [BLOCK_SIZE_M, BLOCK_SIZE_N],
                                c_dtype,
                            )

                            # Apply masks for valid regions (TMA loads a full
                            # block; rows past the chunk end must not contribute)
                            offs_m = tl.arange(0, BLOCK_SIZE_M)
                            m_mask = offs_m < m_block_size

                            # Zero out invalid elements
                            x_block = tl.where(m_mask[:, None], x_block, 0.0)
                            grad_output_block = tl.where(
                                m_mask[:, None], grad_output_block, 0.0
                            )
                        else:
                            # Manual load with bounds checking
                            offs_m = tl.arange(0, BLOCK_SIZE_M)
                            offs_n = tl.arange(0, BLOCK_SIZE_N)
                            offs_k = tl.arange(0, BLOCK_SIZE_K)

                            # Create masks
                            m_mask = offs_m < m_block_size
                            n_mask = offs_n < N - n_offset
                            k_mask = offs_k < K - k_offset

                            # Combined masks
                            mk_mask = m_mask[:, None] & k_mask[None, :]
                            mn_mask = m_mask[:, None] & n_mask[None, :]

                            # Global offsets for loading
                            m_global_offs = m_global_offset + offs_m

                            # Load x block [M_chunk, K]
                            # (here x_desc_ptr is a raw tensor pointer)
                            x_block = tl.load(
                                x_desc_ptr
                                + m_global_offs[:, None] * K
                                + (k_offset + offs_k)[None, :],
                                mask=mk_mask,
                                other=0.0,
                            )

                            # Load grad_output block [M_chunk, N]
                            grad_output_block = tl.load(
                                grad_output_desc_ptr
                                + m_global_offs[:, None] * N
                                + (n_offset + offs_n)[None, :],
                                mask=mn_mask,
                                other=0.0,
                            )

                        # Compute partial contribution: grad_W += grad_Y.T @ X
                        # transpose grad_output for the matmul
                        contribution = tl.dot(
                            grad_output_block.to(tl.float32).T,  # [N, M_chunk]
                            x_block.to(tl.float32),  # [M_chunk, K]
                        )

                        # Accumulate
                        accumulator += contribution

        # Store the result
        if USE_TMA_STORE:
            # Store using TMA
            tl._experimental_descriptor_store(
                workspace,  # TMA store descriptor
                accumulator.to(c_dtype),
                [n_offset, k_offset],
            )
        else:
            # Manual store with bounds checking
            offs_n = tl.arange(0, BLOCK_SIZE_N)
            offs_k = tl.arange(0, BLOCK_SIZE_K)

            # Create masks for bounds checking
            n_mask = offs_n < N - n_offset
            k_mask = offs_k < K - k_offset
            output_mask = n_mask[:, None] & k_mask[None, :]

            # Store the result
            tl.store(
                grad_weight_ptr
                + (n_offset + offs_n)[:, None] * K
                + (k_offset + offs_k)[None, :],
                accumulator.to(c_dtype),
                mask=output_mask,
            )
716
+
717
+
718
+ # ======== End Triton kernels ========
719
+
720
+ # ======== Triton wrapper functions ========
721
+
722
+ # ----- main forward pass wrapper -----
723
+
724
+
725
def grouped_gemm_forward(
    x: torch.Tensor,
    w: torch.Tensor,
    m_sizes: torch.Tensor,
    tma_size: int = 128,
) -> torch.Tensor:
    """
    M*G style grouped GEMM with TMA and Float8 support.
    # Removed for now - FP8 support is triggered by passing x_scale and w_scale tensors.

    Args:
        x: input activations, shape [M_total, K], contiguous.
        w: stacked per-group weights, shape [N, K], contiguous; group g owns
            the row slice [g * (N // G), (g + 1) * (N // G)).
        m_sizes: per-group row counts, shape [G].
        tma_size: size in bytes of one TMA descriptor.

    Returns:
        y: output of shape [M_total, N // G] — per-group x_g @ w_g.T, as
            computed by the Hopper kernel (NOT [M_total, N]; the original
            comment here was stale).
    """
    if not CudaUtils.verify_tma():
        raise NotImplementedError("Grouped GEMM without TMA is not supported yet")

    G = m_sizes.shape[0]

    assert x.is_contiguous()
    assert w.is_contiguous()
    assert m_sizes.is_contiguous()

    # Total input size is [M_total, K] where M_total is the sum of all group sizes
    M_total, K = x.shape
    N = w.shape[0]  # total stacked weight rows across all groups

    assert K == w.shape[1], f"Input K ({K}) must match weight K ({w.shape[1]})"

    # Verify that all group sizes are multiples of ALIGN_SIZE_M
    # This check is commented out because it will involve a GPU-CPU sync
    # assert torch.remainder(m_sizes, ALIGN_SIZE_M).max() == 0, "Group sizes must be a multiple of ALIGN_SIZE_M"

    # Output is [M_total, N // G]: each group's rows are multiplied against
    # only that group's N // G weight slice.
    y = torch.empty((M_total, N // G), device=x.device, dtype=x.dtype)

    if M_total == 0:
        return y

    NUM_SMS = CudaUtils.get_num_sms()
    USE_TMA_LOAD = True
    USE_TMA_STORE = True
    USE_EPILOGUE_SUBTILING = False

    # TMA descriptor helper
    desc_helper = None
    desc_x = x
    desc_w = w
    workspace = None

    if USE_TMA_LOAD:
        desc_helper = TmaDescriptorHelper(tma_size=tma_size)
        desc_helper.init_tma_descriptor("x")
        desc_helper.init_tma_descriptor("w")
        desc_x = desc_helper.get_tma_descriptor_kernel_param("x")
        desc_w = desc_helper.get_tma_descriptor_kernel_param("w")

    if USE_TMA_STORE:
        # Fix: size from `tma_size` directly so the store workspace does not
        # dereference desc_helper (None when USE_TMA_LOAD is False).  Same
        # value as desc_helper.tma_size in the current configuration.
        workspace = torch.empty(
            NUM_SMS * tma_size,
            device=x.device,
            dtype=torch.uint8,
        )

    def grid(META):
        # Fill the host-side TMA descriptors once the autotuner has picked
        # block sizes, then launch one persistent program per SM.
        if USE_TMA_LOAD:
            nonlocal desc_helper
            desc_helper.fill_2d_tma_descriptor(
                "x",
                x.data_ptr(),
                M_total,
                K,
                META["BLOCK_SIZE_M"],
                META["BLOCK_SIZE_K"],
                x.element_size(),
            )

            desc_helper.fill_2d_tma_descriptor(
                "w",
                w.data_ptr(),
                N,
                K,
                META["BLOCK_SIZE_N"],
                META["BLOCK_SIZE_K"],
                w.element_size(),
            )
        return (NUM_SMS,)

    M_BUCKET = triton.next_power_of_2(M_total)

    _kernel_mg_forward_hopper[grid](
        desc_x,
        desc_w,
        y,
        workspace,
        m_sizes,
        G,
        M_BUCKET,
        N,
        K,
        NUM_SMS,
        TMA_SIZE=tma_size,
        USE_EPILOGUE_SUBTILING=USE_EPILOGUE_SUBTILING,
    )

    return y
828
+
829
+
830
+ # ======== Improved Backward =============
831
def grouped_gemm_backward(
    grad_output: torch.Tensor,
    x: torch.Tensor,
    w: torch.Tensor,
    m_sizes: torch.Tensor,
    use_tma: bool = True,
    tma_size: int = 128,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Unified backward pass for grouped GeMM with M*G grouping.
    Uses optimized TMA-based implementations for both dx and dw when available.

    Args:
        grad_output: Gradient of output, shape [M_total, N]
        x: Input tensor from forward pass, shape [M_total, K]
        w: Weight tensor from forward pass, shape [N, K]
        m_sizes: Group sizes tensor, shape [G]
        use_tma: Whether to try using TMA acceleration (if available)
        tma_size: Size of TMA descriptor in bytes

    Returns:
        Tuple of gradients with respect to x and w: (grad_x, grad_w)

    Raises:
        ValueError: on any shape mismatch between the inputs.
    """
    logging.info("Starting unified grouped_gemm_backward")

    # do this once, seems expensive
    NUM_SMS = CudaUtils.get_num_sms()

    # Basic validation
    G = m_sizes.shape[0]
    M_total, K_x = x.shape
    M_grad, N = grad_output.shape
    N_w, K_w = w.shape

    # Check dimensions
    if K_x != K_w:
        raise ValueError(f"K dimension mismatch: x has K={K_x}, w has K={K_w}")
    if M_total != M_grad:
        raise ValueError(
            f"M dimension mismatch: x has M={M_total}, grad_output has M={M_grad}"
        )

    # Check total M matches sum of group sizes (note: .item() is a GPU-CPU sync)
    sum_m_sizes = m_sizes.sum().item()
    if M_total != sum_m_sizes:
        raise ValueError(
            f"Sum of m_sizes ({sum_m_sizes}) must match M_total ({M_total})"
        )

    # Make sure inputs are contiguous
    grad_output = grad_output.contiguous()
    x = x.contiguous()
    w = w.contiguous()
    m_sizes = m_sizes.contiguous()

    # Check TMA support
    can_use_tma = use_tma and CudaUtils.verify_tma()
    if use_tma and not can_use_tma:
        logging.info("TMA requested but not supported on this device")
        # NOTE(review): this flag is not consulted below — the TMA helpers are
        # still invoked and will raise NotImplementedError without TMA support.
        use_tma = False

    # Compute grad_x using flat linear implementation
    try:
        # fixed: plain strings instead of placeholder-free f-strings (F541)
        logging.info("Computing grad_x with flat linear kernel")

        # Use TMA-optimized implementation
        grad_x = grouped_gemm_dx_tma(
            grad_output=grad_output,
            w=w,
            m_sizes=m_sizes,
            num_sms=NUM_SMS,
            tma_size=tma_size,
        )

    except Exception as e:
        logging.error(f"Error in grad_x computation: {e}")
        raise

    # Compute grad_w using flat linear style implementation
    try:
        logging.info("Computing grad_w with flat linear kernel")

        grad_w = grouped_gemm_dw_tma(
            x, grad_output, m_sizes, num_sms=NUM_SMS, tma_size=tma_size
        )
    except Exception as e:
        logging.error(f"Error in grad_w computation: {e}")
        raise

    return grad_x, grad_w
922
+
923
+
924
+ # ----- dx backward pass wrapper -----
925
+
926
+
927
def grouped_gemm_dx_tma(
    grad_output: torch.Tensor,
    w: torch.Tensor,
    m_sizes: torch.Tensor,
    num_sms: int = 132,
    tma_size: int = 128,
) -> torch.Tensor:
    """
    Optimized backward pass wrapper for computing gradient with respect to
    input (dx) using TMA patterns similar to the forward pass.

    (Fix: the original carried two nearly identical copies of this docstring;
    the second was a dead bare-string statement and has been removed.)

    Args:
        grad_output: Gradient of output, shape [M_total, N]
        w: Weight tensor, shape [N, K]
        m_sizes: Group sizes tensor, shape [G]
        num_sms: number of persistent programs to launch (one per SM)
        tma_size: Size of TMA descriptor in bytes

    Returns:
        grad_x: Gradient with respect to x, shape [M_total, K]

    Raises:
        NotImplementedError: if the device lacks TMA support.
    """
    if not CudaUtils.verify_tma():
        raise NotImplementedError("Optimized dx computation requires TMA support")

    G = m_sizes.shape[0]

    assert grad_output.is_contiguous()
    assert w.is_contiguous()
    assert m_sizes.is_contiguous()

    M_total, N_grad = grad_output.shape
    N_w, K = w.shape

    # Check dimensions
    assert N_grad == N_w, f"Grad_output N ({N_grad}) must match weight N ({N_w})"

    # Verify that the sum of m_sizes matches M_total (GPU-CPU sync)
    sum_m_sizes = m_sizes.sum().item()
    assert (
        M_total == sum_m_sizes
    ), f"Sum of m_sizes ({sum_m_sizes}) must match M_total ({M_total})"

    # Create output tensor (grad_x) with shape [M_total, K]
    grad_x = torch.empty(
        (M_total, K), device=grad_output.device, dtype=grad_output.dtype
    )

    NUM_SMS = num_sms  # CudaUtils.get_num_sms()
    USE_TMA_LOAD = True
    USE_TMA_STORE = True

    # Set up TMA descriptors
    desc_helper = TmaDescriptorHelper(tma_size=tma_size)
    desc_helper.init_tma_descriptor("grad_output")
    desc_helper.init_tma_descriptor("w")
    desc_grad_output = desc_helper.get_tma_descriptor_kernel_param("grad_output")
    desc_w = desc_helper.get_tma_descriptor_kernel_param("w")

    # Allocate workspace for TMA store (one descriptor slot per SM)
    workspace = torch.empty(
        NUM_SMS * desc_helper.tma_size,
        device=grad_output.device,
        dtype=torch.uint8,
    )

    def grid(META):
        # Fill TMA descriptors with appropriate dimensions once the
        # autotuner has picked block sizes
        desc_helper.fill_2d_tma_descriptor(
            "grad_output",
            grad_output.data_ptr(),
            M_total,
            N_grad,
            META["BLOCK_SIZE_M"],
            META["BLOCK_SIZE_N"],
            grad_output.element_size(),
        )

        desc_helper.fill_2d_tma_descriptor(
            "w",
            w.data_ptr(),
            N_w,
            K,
            META["BLOCK_SIZE_N"],
            META["BLOCK_SIZE_K"],
            w.element_size(),
        )
        return (NUM_SMS,)

    M_BUCKET = triton.next_power_of_2(M_total)

    # Launch the flat linear kernel for computing grad_x
    _kernel_mg_dx_tma[grid](
        desc_grad_output,
        desc_w,
        grad_x,
        workspace,
        m_sizes,
        G,
        M_BUCKET,
        N_grad,  # N dimension is now the reduction dimension
        K,
        NUM_SMS,
        USE_TMA_LOAD,
        USE_TMA_STORE,
        TMA_SIZE=tma_size,
    )

    return grad_x
1053
+
1054
+
1055
+ # ======== dw wrapper function ==========
1056
+
1057
+
1058
def grouped_gemm_dw_tma(
    x: torch.Tensor,
    grad_output: torch.Tensor,
    m_sizes: torch.Tensor,
    num_sms: int = 132,
    tma_size: int = 128,
) -> torch.Tensor:
    """
    Optimized flat linear kernel computation of gradients with respect to weights (dw) using TMA.
    For the forward pass Y = X @ W.T, the backward for weights is:
        grad_W = grad_Y.T @ X

    Args:
        x: Input tensor, shape [M_total, K]
        grad_output: Gradient of output, shape [M_total, N]
        m_sizes: Group sizes tensor, shape [G]; must sum to M_total
        num_sms: Number of persistent blocks to launch (default 132 —
            presumably an H100 SM count; TODO confirm against the actual device)
        tma_size: Size of TMA descriptor in bytes

    Returns:
        grad_w: Gradient with respect to weights, shape [N, K]
    """
    # Check TMA support
    # NOTE(review): currently unused — USE_TMA_LOAD/STORE below are hardcoded
    # True (see TODO); wire this in once a non-TMA fallback path is validated.
    has_tma_support = CudaUtils.verify_tma()

    # Get group count
    G = m_sizes.shape[0]

    # Ensure contiguous tensors (TMA descriptors require flat, contiguous layouts)
    x = x.contiguous()
    grad_output = grad_output.contiguous()
    m_sizes = m_sizes.contiguous()

    # Get dimensions
    M_total, K_x = x.shape
    M_grad, N = grad_output.shape

    # Check dimensions
    assert M_total == M_grad, f"x M ({M_total}) must match grad_output M ({M_grad})"

    # Verify that the sum of m_sizes matches M_total
    sum_m_sizes = m_sizes.sum().item()
    assert (
        sum_m_sizes == M_total
    ), f"Sum of m_sizes ({sum_m_sizes}) must match M_total ({M_total})"

    # Create output tensor (grad_w) with shape [N, K]; zeros so groups with
    # m_size == 0 contribute nothing
    grad_w = torch.zeros((N, K_x), device=x.device, dtype=x.dtype)

    NUM_SMS = num_sms

    # TODO - hardcoded for now...but should set TMA flags based on hardware support
    USE_TMA_LOAD = True  # has_tma_support
    USE_TMA_STORE = True  # has_tma_support

    # Set up TMA descriptors or direct pointers
    if USE_TMA_LOAD or USE_TMA_STORE:
        desc_helper = TmaDescriptorHelper(tma_size=tma_size)

        if USE_TMA_LOAD:
            desc_helper.init_tma_descriptor("x")
            desc_helper.init_tma_descriptor("grad_output")
            x_desc = desc_helper.get_tma_descriptor_kernel_param("x")
            grad_output_desc = desc_helper.get_tma_descriptor_kernel_param(
                "grad_output"
            )
        else:
            x_desc = x
            grad_output_desc = grad_output

        if USE_TMA_STORE:
            desc_helper.init_tma_descriptor("grad_w")
            workspace = desc_helper.get_tma_descriptor_kernel_param("grad_w")
        else:
            # Dummy 1-byte buffer so the kernel signature stays uniform
            workspace = torch.empty(1, device=x.device, dtype=torch.uint8)
    else:
        # If not using TMA, just use the tensors directly
        x_desc = x
        grad_output_desc = grad_output
        workspace = torch.empty(1, device=x.device, dtype=torch.uint8)

    # M_BUCKET for grid size (power-of-two bucket keeps autotune cache keys stable)
    M_BUCKET = triton.next_power_of_2(M_total)

    # Define grid for kernel launch.
    # Descriptors must be filled inside this callable: the BLOCK_SIZE_* values
    # come from the autotuner-selected META, which is only known at launch time.
    def grid(META):
        if USE_TMA_LOAD or USE_TMA_STORE:

            if USE_TMA_LOAD:
                desc_helper.fill_2d_tma_descriptor(
                    "x",
                    x.data_ptr(),
                    M_total,
                    K_x,
                    META["BLOCK_SIZE_M"],
                    META["BLOCK_SIZE_K"],
                    x.element_size(),
                )

                desc_helper.fill_2d_tma_descriptor(
                    "grad_output",
                    grad_output.data_ptr(),
                    M_total,
                    N,
                    META["BLOCK_SIZE_M"],
                    META["BLOCK_SIZE_N"],
                    grad_output.element_size(),
                )

            if USE_TMA_STORE:
                desc_helper.fill_2d_tma_descriptor(
                    "grad_w",
                    grad_w.data_ptr(),
                    N,
                    K_x,
                    META["BLOCK_SIZE_N"],
                    META["BLOCK_SIZE_K"],
                    grad_w.element_size(),
                )

        # Return grid size - one block per SM for balanced work distribution
        return (NUM_SMS,)

    # Launch the optimized kernel
    _kernel_mg_dw_tma[grid](
        x_desc,
        grad_output_desc,
        grad_w,
        workspace,
        m_sizes,
        G,
        M_BUCKET,
        N,
        K_x,
        NUM_SMS,
        USE_TMA_LOAD,
        USE_TMA_STORE,
        TMA_SIZE=tma_size,
    )

    return grad_w
1199
+
1200
+
1201
+ # ======== End Backwards Wrapper Functions =============
1202
+
1203
+ # ======== PyTorch wrapper functions ========
1204
+
1205
+
1206
class GroupedGEMM_mg(torch.autograd.Function):
    """
    Autograd function for GroupedGEMM with M*G grouping.

    Forward computes Y = grouped(X @ W.T); backward dispatches to the
    grouped dx/dw kernels via ``grouped_gemm_backward``.
    """

    @staticmethod
    def forward(ctx, x, w, m_sizes, use_tma=True, tma_size=128):
        """
        Forward pass of GroupedGEMM.

        Args:
            x: Input tensor, shape [M_total, K]
            w: Weight tensor, shape [N, K]
            m_sizes: Tensor of shape [G] containing the size of each group
            use_tma: Whether to try using TMA acceleration (if available)
            tma_size: Size of TMA descriptor in bytes

        Returns:
            Output tensor, shape [M_total, N]
        """
        # Use regular forward without quantization
        output = grouped_gemm_forward(
            x=x, w=w, m_sizes=m_sizes, tma_size=tma_size, using_fp8=False
        )

        # Save inputs and parameters for backward pass.
        # BUGFIX: the original called save_for_backward twice; only one call
        # is needed (the second silently overwrote the first).
        ctx.save_for_backward(x, w, m_sizes)
        ctx.use_tma = use_tma
        ctx.tma_size = tma_size

        return output

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass of M*G GroupedGEMM.

        Args:
            grad_output: Gradient of output, shape [M_total, N]

        Returns:
            Tuple of gradients, one per forward input:
                - grad_x: Gradient with respect to x, shape [M_total, K]
                - grad_w: Gradient with respect to w, shape [N, K]
                - None: for m_sizes (not differentiable)
                - None: for use_tma (not differentiable)
                - None: for tma_size (not differentiable)
        """
        # Retrieve saved tensors and parameters
        x, w, m_sizes = ctx.saved_tensors

        use_tma = ctx.use_tma
        tma_size = ctx.tma_size

        # Compute gradients using the unified implementation
        grad_x, grad_w = grouped_gemm_backward(
            grad_output=grad_output,
            x=x,
            w=w,
            m_sizes=m_sizes,
            use_tma=use_tma,
            tma_size=tma_size,
        )

        # BUGFIX: autograd requires exactly one gradient per forward input
        # (x, w, m_sizes, use_tma, tma_size) — the original returned only
        # four values, which raises at backward time.
        return grad_x, grad_w, None, None, None
1279
+
1280
+
1281
def mg_grouped_gemm(
    x: torch.Tensor,
    w: torch.Tensor,
    m_sizes: torch.Tensor,
    use_tma: bool = True,
    tma_size: int = 128,
    using_fp8: bool = False,
) -> torch.Tensor:
    """
    Unified differentiable grouped GEMM operation for M*G grouped GEMM.

    Args:
        x: Input tensor, shape [M_total, K]
        w: Weight tensor, shape [N, K]
        m_sizes: Tensor of shape [G] containing the size of each group
        use_tma: Whether to try using TMA acceleration (if available)
        tma_size: Size of TMA descriptor in bytes
        using_fp8: Accepted for interface compatibility; the autograd path
            always runs the non-quantized forward (using_fp8=False), so this
            flag is currently ignored.

    Returns:
        Output tensor, shape [M_total, N]
    """
    # BUGFIX: GroupedGEMM_mg.forward accepts (x, w, m_sizes, use_tma, tma_size);
    # the original also passed `using_fp8` as a sixth argument, which raised a
    # TypeError on every call.
    return GroupedGEMM_mg.apply(x, w, m_sizes, use_tma, tma_size)
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/reference_utils.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import logging
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ # Configure logging
14
+ logging.basicConfig(
15
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
16
+ )
17
+
18
+
19
def compute_reference_forward(x, w, m_sizes):
    """
    Pure-PyTorch reference for the grouped forward pass.

    Each group of rows of ``x`` (sizes given by ``m_sizes``) is multiplied by
    the full weight matrix: y_g = x_g @ w.T. Rows not covered by any group
    stay zero.

    Args:
        x (torch.Tensor): Input tensor of shape (M, K)
        w (torch.Tensor): Weight tensor of shape (N, K)
        m_sizes (torch.Tensor): Group sizes tensor of shape (G)

    Returns:
        torch.Tensor: Reference output tensor of shape (M, N)
    """
    out = torch.zeros((x.shape[0], w.shape[0]), dtype=x.dtype, device=x.device)

    offset = 0
    for size in (int(s) for s in m_sizes):
        if size > 0:
            rows = slice(offset, offset + size)
            # y_g = x_g @ w.T for this group's row span
            out[rows] = torch.matmul(x[rows], w.T)
            offset += size

    return out
52
+
53
+
54
def compute_reference_backward(x, w, m_sizes, grad_output):
    """
    Pure-PyTorch reference for the grouped backward pass, via autograd.

    Args:
        x (torch.Tensor): Input tensor of shape (M, K)
        w (torch.Tensor): Weight tensor of shape (N, K)
        m_sizes (torch.Tensor): Group sizes tensor of shape (G)
        grad_output (torch.Tensor): Upstream gradient of shape (M, N)

    Returns:
        tuple: (grad_x, grad_w) gradient tensors
    """
    # Autograd-enabled copies so gradients don't leak onto the caller's tensors
    x_ref = x.detach().clone().requires_grad_(True)
    w_ref = w.detach().clone().requires_grad_(True)

    # Re-run the reference forward and backprop the supplied gradient through it
    out = compute_reference_forward(x_ref, w_ref, m_sizes)
    out.backward(grad_output)

    return x_ref.grad, w_ref.grad
78
+
79
+
80
def analyze_tensor_differences(actual, expected, name):
    """
    Log diagnostics comparing ``actual`` against ``expected`` and return the verdict.

    Reports the location/magnitude of the largest absolute difference, the
    fraction of zeros in each tensor, and any NaNs in ``actual``.

    Args:
        actual (torch.Tensor): Actual tensor
        expected (torch.Tensor): Expected tensor
        name (str): Name of the tensor for logging

    Returns:
        bool: True if tensors are close enough
    """
    # Loose tolerances chosen for float16-class precision
    rtol = 0.5
    atol = 0.5

    # Locate the single worst element
    abs_err = (actual - expected).abs()
    worst_flat = abs_err.argmax().item()
    worst_idx = np.unravel_index(worst_flat, actual.shape)
    worst_err = abs_err.max().item()

    logging.info(f"Largest {name} difference: {worst_err} at {worst_idx}")
    logging.info(f"Values: {actual[worst_idx].item()} vs {expected[worst_idx].item()}")

    is_close = torch.allclose(actual, expected, rtol=rtol, atol=atol)

    if is_close:
        logging.info(f"✓ SUCCESS: {name} matches PyTorch reference")
    else:
        logging.error(f"✗ FAILURE: {name} mismatch detected")

    # Zero-fraction diagnostics (useful for spotting unwritten output regions)
    zeros_actual = (actual == 0).sum().item()
    zeros_expected = (expected == 0).sum().item()
    logging.info(
        f"Zeros in {name} (actual): {zeros_actual}/{actual.numel()} ({zeros_actual/actual.numel()*100:.2f}%)"
    )
    logging.info(
        f"Zeros in {name} (expected): {zeros_expected}/{expected.numel()} ({zeros_expected/expected.numel()*100:.2f}%)"
    )

    # NaNs in the computed result are always worth flagging
    nan_actual = torch.isnan(actual).sum().item()
    if nan_actual > 0:
        logging.error(f"NaN values detected in {name}: {nan_actual}")

    return is_close
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/tma_autotuning.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # credit - TMAHelper class, AutoTuning are derived from FBGemm:
8
+ # https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gemm/triton_gemm
9
+
10
+ # pyre-unsafe
11
+ import functools
12
+
13
+ import os
14
+ import sys
15
+ from typing import Any, Dict, Optional, Tuple
16
+
17
+ import torch
18
+
19
+ import triton
20
+ import triton.language as tl
21
+ from triton import Config as TConfig
22
+
23
+ from triton.runtime import driver # @manual
24
+
25
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
26
+
27
+
28
+ # ===== Supporting utils, CUDA and TMA =====
29
+
30
+
31
class CudaUtils:
    """Static probes for CUDA backend, TMA capability, and SM count."""

    @staticmethod
    def is_cuda() -> bool:
        """Check if Triton is running on CUDA backend."""
        target = driver.active.get_current_target()
        return target.backend == "cuda"

    @staticmethod
    def verify_tma() -> bool:
        """Check if TMA is supported on the current device (Hopper+, CC >= 9)."""
        if not CudaUtils.is_cuda():
            return False
        if not torch.cuda.is_available():
            return False
        major, _ = torch.cuda.get_device_capability()
        return major >= 9

    @staticmethod
    def get_num_sms() -> int:
        """Get the number of streaming multiprocessors on the current device."""
        if not CudaUtils.is_cuda():
            raise RuntimeError("Triton is not running on CUDA backend")
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available")
        props = torch.cuda.get_device_properties("cuda")
        return props.multi_processor_count
54
+
55
+
56
class TmaDescriptorHelper:
    """Helper class for managing TMA descriptors in Triton kernels.

    Allocates CPU-side descriptor buffers (init_tma_descriptor), serializes
    tensor/block parameters into them via the Triton driver utils
    (fill_*_tma_descriptor), and exposes them to kernels as launch params
    (get_tma_descriptor_kernel_param).
    """

    class KernelParamWrapper:
        """Wrapper to implement the TmaDescKernelParam interface."""

        def __init__(self, desc: torch.Tensor):
            # CPU int8 buffer holding the serialized TMA descriptor
            self.desc = desc

        def tma_desc_cpu_ptr(self) -> int:
            """Return the CPU pointer to the TMA descriptor."""
            return self.desc.data_ptr()

    def __init__(self, tma_size: int = 128):
        """Initialize the TMA descriptor helper.

        Args:
            tma_size: Size of the TMA descriptor in bytes

        Raises:
            RuntimeError: if the device lacks TMA support (pre-Hopper) or the
                installed Triton has no grid-constant descriptor support.
        """
        if not CudaUtils.verify_tma():
            raise RuntimeError(
                "TMA not supported on this device (requires Hopper or newer)"
            )
        if "nv_tma_desc_type" not in dir(tl):
            raise RuntimeError(
                "TMA grid constant descriptors not supported in your Triton version"
            )

        self.tma_size = tma_size
        # Bind the driver's fill helpers once; they write descriptor params
        # into the CPU buffers allocated by init_tma_descriptor.
        self.fill_1d_tma_descriptor_inner = driver.active.utils.fill_1d_tma_descriptor
        self.fill_2d_tma_descriptor_inner = driver.active.utils.fill_2d_tma_descriptor
        # name -> CPU descriptor buffer
        self.descriptors: Dict[str, torch.Tensor] = {}

    def init_tma_descriptor(self, name: str) -> None:
        """Initialize a TMA descriptor with the given name.

        Call this method outside of the lambda function for grid size.
        """
        self.descriptors[name] = torch.empty(
            self.tma_size, device="cpu", dtype=torch.int8
        )

    def fill_1d_tma_descriptor(
        self, name: str, ptr: int, dim: int, block_dim: int, element_size: int
    ) -> None:
        """Fill a 1D TMA descriptor.

        Call this method inside the lambda function for grid size.

        Args:
            name: Descriptor name previously passed to init_tma_descriptor.
            ptr: Device pointer of the tensor being described.
            dim: Tensor length in elements.
            block_dim: Block length in elements.
            element_size: Bytes per element.
        """
        if name not in self.descriptors:
            raise ValueError(f"TMA descriptor '{name}' not initialized")

        desc_x = self.descriptors[name]
        # Hardware requires the descriptor buffer to be 64-byte aligned
        if desc_x.data_ptr() % 64 != 0:
            raise ValueError("TMA descriptor must be 64-byte aligned")
        self.fill_1d_tma_descriptor_inner(
            ptr, dim, block_dim, element_size, desc_x.data_ptr()
        )

    def fill_2d_tma_descriptor(
        self,
        name: str,
        ptr: int,
        dim1: int,
        dim0: int,
        block_dim1: int,
        block_dim0: int,
        element_size: int,
    ) -> None:
        """Fill a 2D TMA descriptor.

        Call this method inside the lambda function for grid size.

        Args:
            name: Descriptor name previously passed to init_tma_descriptor.
            ptr: Device pointer of the tensor being described.
            dim1: Outer (row) extent in elements.
            dim0: Inner (contiguous) extent in elements.
            block_dim1: Block extent along dim1.
            block_dim0: Block extent along dim0.
            element_size: Bytes per element.
        """
        if name not in self.descriptors:
            raise ValueError(f"TMA descriptor '{name}' not initialized")

        desc_x = self.descriptors[name]
        # Hardware requires the descriptor buffer to be 64-byte aligned
        if desc_x.data_ptr() % 64 != 0:
            raise ValueError("TMA descriptor must be 64-byte aligned")
        self.fill_2d_tma_descriptor_inner(
            ptr, dim1, dim0, block_dim1, block_dim0, element_size, desc_x.data_ptr()
        )

    def get_tma_descriptor_kernel_param(self, name: str) -> KernelParamWrapper:
        """Get the TMA descriptor kernel parameter for the given name."""
        if name not in self.descriptors or self.descriptors[name] is None:
            raise ValueError(f"TMA descriptor '{name}' not initialized")
        return self.KernelParamWrapper(self.descriptors[name])
144
+
145
+
146
# ====== Autotuning utilities ======

# BLOCK_SIZE_M is pinned to this value in all candidate configs below
ALIGN_SIZE_M = 128

# Cartesian product of candidate tile shapes / pipeline depths for NVIDIA
# autotuning; pruned at launch time by early_config_prune.
_NV_CONFIGS = [
    triton.Config(
        {
            "BLOCK_SIZE_M": block_size_m,
            "BLOCK_SIZE_N": block_size_n,
            "BLOCK_SIZE_K": block_size_k,
        },
        num_stages=num_stages,
        num_warps=num_warps,
        num_ctas=num_ctas,
    )
    for block_size_m in [ALIGN_SIZE_M, ]
    for block_size_n in [64, 128, 256]
    for block_size_k in [64, 128, 256]
    for num_stages in [3, 4]
    for num_warps in [4, 8]
    for num_ctas in [1]
]
167
+
168
+
169
def early_config_prune(configs, named_args, dtsize=None, dtype=None, **kwargs):
    """Prune autotune configs that cannot run (or would clearly waste work)
    on the current device, before any benchmarking happens.

    Args:
        configs: candidate triton.Config objects (BLOCK_SIZE_M/N/K, stages, warps).
        named_args: kernel launch arguments; must contain one of the known
            output-pointer names plus G, M_BUCKET, N, K.
        dtsize: element size in bytes; inferred from the output tensor if None.
        dtype: element dtype; inferred from the output tensor if None.

    Returns:
        The subset of ``configs`` that passes all heuristics.
    """
    device = torch.cuda.current_device()
    # Check for all possible pointer parameter names — this pruner is shared
    # by the forward (c_ptr), dx (grad_input_ptr) and dw (grad_weight_ptr) kernels.
    if "grad_input_ptr" in named_args:
        ptr_name = "grad_input_ptr"
    elif "c_ptr" in named_args:
        ptr_name = "c_ptr"
    elif "grad_weight_ptr" in named_args:
        ptr_name = "grad_weight_ptr"
    else:
        raise KeyError("No recognized pointer parameter found in kernel arguments")

    # Infer element size / dtype from the output tensor when not supplied
    if dtsize is None:
        dtsize = named_args[ptr_name].element_size()
    if dtype is None:
        dtype = named_args[ptr_name].dtype

    pruned_configs = []
    for config in configs:
        kw = config.kwargs
        BLOCK_M, BLOCK_N, BLOCK_K, num_stages = (
            kw["BLOCK_SIZE_M"],
            kw["BLOCK_SIZE_N"],
            kw["BLOCK_SIZE_K"],
            config.num_stages,
        )
        G, M, N, K = (
            named_args["G"],
            named_args["M_BUCKET"],
            named_args["N"],
            named_args["K"],
        )

        # 1. make sure we have enough smem
        max_shared_memory = driver.active.utils.get_device_properties(device)[
            "max_shared_mem"
        ]

        # Estimated pipeline staging footprint: one A tile + one B tile per stage
        required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize
        if required_shared_memory > max_shared_memory:
            continue

        M_PER_GROUP = M // G
        MIN_M_TILES = 64
        # 2. make sure we don't load M tiles that are too big
        # (relative to the per-group M; oversized tiles are mostly masked out)
        if BLOCK_M > MIN_M_TILES and BLOCK_M > (M_PER_GROUP * 2):
            continue
        # 3. make sure we don't load M tiles that are too small
        # NOTE(review): the upstream comment said "N tiles" here, but the
        # check is clearly on BLOCK_M; behavior kept as-is.
        if BLOCK_M < 128 and BLOCK_M < (M_PER_GROUP // 2):
            continue

        num_sm = driver.active.utils.get_device_properties(device)[
            "multiprocessor_count"
        ]
        N_TILES = N // BLOCK_N
        MIN_N_TILES = 64
        # 4. make sure we don't load N tiles that are too big
        # NOTE(review): `M * N_TILES` mixes a row count with a tile count —
        # possibly intended M_TILES * N_TILES; confirm against upstream FBGEMM.
        if BLOCK_N > MIN_N_TILES and M * N_TILES < num_sm:
            continue
        # 5. make sure we don't load N tiles that are too small
        if BLOCK_N < 128 and M * N_TILES > 2 * num_sm:
            continue
        # 6. make sure K can be evenly divided
        # (the kernels assume no remainder tile along K)
        if K % BLOCK_K != 0:
            continue

        pruned_configs.append(config)

    return pruned_configs
238
+
239
+
240
+ # ======== End Autotuning utilities ========
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import logging
9
+ import unittest
10
+ from typing import Tuple
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+
15
+ from mg_grouped_gemm import grouped_gemm_forward
16
+
17
+
18
class TestMG_GroupedGEMM(unittest.TestCase):
    """Forward-pass tests for the M*G grouped GEMM kernel (requires CUDA)."""

    def setUp(self) -> None:
        # Fixed seed so each test sees identical random inputs
        torch.manual_seed(2020)

    def _run_grouped_gemm_test(
        self,
        shape: Tuple[int, int, int, int],
        device: torch.device,
        dtype: torch.dtype = torch.bfloat16,
        atol: float = 1e-5,
        rtol: float = 1.6e-2,
    ) -> None:
        """Run one forward pass and compare against a per-group matmul reference."""
        G, M, N, K = shape
        # M*G grouping: activations are [M*G, K], weights are [N*G, K]
        a = torch.randn(M * G, K, dtype=dtype, device=device)
        b = torch.randn(N * G, K, dtype=dtype, device=device)

        # Equal-sized groups for simplicity
        m_sizes = torch.full((G,), M, device=device, dtype=torch.int32)

        result = grouped_gemm_forward(a, b, m_sizes)
        self.assertTrue(result.shape == (M * G, N))

        # Reference: each row block multiplied by its own weight slice
        expected_result = torch.zeros(M * G, N, dtype=dtype, device=device)
        row_start = 0
        for g in range(G):
            row_end = row_start + m_sizes[g]
            w_g = b[N * g : N * (g + 1), :]
            expected_result[row_start:row_end, :] = a[row_start:row_end, :] @ w_g.T
            row_start = row_end

        torch.testing.assert_close(
            result.to(dtype), expected_result, atol=atol, rtol=rtol
        )

    def test_MG_grouped_gemm_bf16(self) -> None:
        for G in (1, 4, 16):
            for M in (128, 512, 1024):
                print(f"Testing BF16 M*G GroupGeMM with G={G}, M={M}")
                self._run_grouped_gemm_test(
                    (G, M, 1024, 1024),
                    torch.device("cuda"),
                    dtype=torch.bfloat16,
                    atol=1e-5,
                    rtol=1.6e-2,
                )

    def test_MG_grouped_gemm_deepseek_shapes(self) -> None:
        """Test with shapes from Deepseek model."""
        deepseek_shapes = [
            (4, 2048, 4096, 7168),  # G, M, N, K
            (4, 2048, 7168, 2048),
            (8, 512, 4096, 7168),
            (8, 512, 7168, 2048),
        ]

        device = torch.device("cuda")

        for shape in deepseek_shapes:
            G, M, N, K = shape
            print(f"Testing BF16 M*G Deepseek shape: G={G}, M={M}, N={N}, K={K}")
            self._run_grouped_gemm_test(
                shape, device, dtype=torch.bfloat16, atol=1e-5, rtol=1.6e-2
            )