CarlOwOs committed on Jan 20

Commit

8533328

verified ·

1 Parent(s): 95c12ea

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

LICENSE +21 -0
README.md +519 -0
added_tokens.json +28 -0
chat_template.jinja +89 -0
config.json +44 -0
configs/delta_net_1B.json +29 -0
configs/delta_net_340M.json +26 -0
configs/gated_deltanet_1B.json +22 -0
configs/gated_deltanet_340M.json +22 -0
configs/gay_14B.json +32 -0
configs/gay_1B.json +32 -0
configs/gayted_deltanet_1B.json +25 -0
configs/gla_340M.json +24 -0
configs/gla_7B.json +25 -0
configs/gsa_1B.json +29 -0
configs/gsa_340M.json +29 -0
configs/hgrn2_340M.json +20 -0
configs/mamba2_1B.json +32 -0
configs/mamba2_340M.json +32 -0
configs/mamba_1B.json +30 -0
configs/mamba_340M.json +30 -0
configs/routmem_1.7B.json +35 -0
configs/routmem_14B.json +35 -0
configs/routmem_340M.json +33 -0
configs/samba_1B.json +52 -0
configs/sba_340m.json +18 -0
configs/transformer_1B.json +22 -0
configs/transformer_340M.json +18 -0
configs/transformer_7B.json +21 -0
flame/__init__.py +1 -0
flame/components/__init__.py +0 -0
flame/components/checkpoint.py +59 -0
flame/config_manager.py +960 -0
flame/data.py +756 -0
flame/models/fla.toml +67 -0
flame/train.py +624 -0
flame/utils/__init__.py +0 -0
flame/utils/convert_dcp_to_hf.py +67 -0
flame/utils/convert_hf_to_dcp.py +36 -0
flame/utils/preprocess.py +122 -0
generation_config.json +6 -0
merges.txt +0 -0
model.safetensors.index.json +410 -0
pyproject.toml +55 -0
setup.py +51 -0
special_tokens_map.json +31 -0
tokenizer_config.json +239 -0
train.sh +122 -0
uv.lock +0 -0
vocab.json +0 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023-2025 Songlin Yang, Yu Zhang
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,519 @@

+<div align="center">
+# 🔥 Flame: Flash Language Modeling Made Easy
+[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/fla-org/flame)
+</div>
+Welcome to 🔥 `flame`, a minimal and efficient framework built on `torchtitan` for training language models with blazing efficiency.
+**Feature Highlights:**
+- 🚀 Minimal, easy-to-use, extensible training framework
+- 🤗 Seamless integration with `fla` and `transformers`
+- 🔄 Zero-cost data preprocessing: online tokenization, dataset shuffling, and multiple datasets support
+- 🔮 4D parallelism (coming soon)
+## Setup
+To get started, clone the `flame` repository and install the required dependencies:
+```bash
+git clone https://github.com/fla-org/flame.git
+cd flame
+pip install .
+```
+Install the latest version of fla
+```
+pip uninstall flash-linear-attention && pip install -U --no-use-pep517 git+https://github.com/fla-org/flash-linear-attention
+```
+[Important] Install the specific pinned version of torchtitan:
+```
+pip install git+https://github.com/pytorch/torchtitan.git@0b44d4c
+```
+## Dataset Preparation
+To download the dataset to your local disk, create a new Python file with the following content and execute it:
+```py
+from datasets import load_dataset
+# load fineweb-edu with parallel processing
+dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="default", num_proc=64, cache_dir="/your/cache/path")
+# or load a subset with roughly 100B tokens, suitable for small- or medium-sized experiments
+dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-100BT", num_proc=64, cache_dir="/your/cache/path")
+```
+## Training Recipes
+Here's an example of training a 340M FLA Transformer model with a LLaMA-like architecture from scratch on a 100BT subset of the Fineweb-edu corpus ~~in streaming mode~~. (Do not use streaming mode if you are concerned about resuming training.)
+> [!WARNING]
+> If the dataset is not downloaded beforehand, the streaming mode will attempt to fetch it from a remote server and download it on-the-fly, which can be highly unstable during training due to network issues.
+> For stable training, ensure the dataset is downloaded locally (see [**Dataset Preparation**](#dataset-preparation)). Otherwise, we assume you are only testing the new corpus.
+```sh
+bash train.sh \
+  --job.config_file flame/models/fla.toml \
+  --job.dump_folder exp/transformer-340M-4K-10B/batch1.seqlen65536.context4096.warmup1024.update1.steps20480.lr1e-3.cosine \
+  --model.config configs/transformer_340M.json \
+  --model.tokenizer_path fla-hub/transformer-1.3B-100B \
+  --optimizer.name AdamW \
+  --optimizer.eps 1e-15 \
+  --optimizer.lr 1e-3 \
+  --lr_scheduler.warmup_steps 1024 \
+  --lr_scheduler.lr_min 0.1 \
+  --lr_scheduler.decay_type cosine \
+  --training.batch_size 1 \
+  --training.seq_len 65536 \
+  --training.context_len 4096 \
+  --training.varlen \
+  --training.gradient_accumulation_steps 1 \
+  --training.steps 20480 \
+  --training.max_norm 1.0 \
+  --training.skip_nan_inf \
+  --training.dataset HuggingFaceFW/fineweb-edu \
+  --training.dataset_name sample-100BT \
+  --training.dataset_split train \
+  --training.num_workers 32 \
+  --training.prefetch_factor 2 \
+  --training.seed 42 \
+  --training.compile \
+  --checkpoint.interval 2048 \
+  --checkpoint.load_step -1 \
+  --checkpoint.keep_latest_k 2 \
+  --metrics.log_freq 1
+```
+You can specify the number of GPUs by setting the environment variable `NGPU`, which defaults to 8.
+**For single-GPU debugging, set `NGPU=1`.**
+We provide several [config files](https://github.com/fla-org/flame/tree/main/configs) for different models.
+By default, the learning rate is set to 1e-3 with a cosine scheduler. Other schedulers, such as WSD (wsd), are also supported.
+**Key parameters:**
+- `--lr_scheduler.decay_ratio`: The proportion of the steps allocated to the decay phase. The learning rate will remain stable after the warmup period and only start decaying during the last `decay_ratio` portion of the total training steps, which is known as the Warmup-Stable-Decay (WSD) schedule.
+- `--lr_scheduler.warmup_steps`: The number of steps for the learning rate warmup phase.
+- `--training.steps`: Total number of training steps.
+- `--training.batch_size`: Batch size per device, must be 1 if `--training.varlen` is set.
+- `--training.seq_len`: The length of each sequence in the batch, which is concatenated from multiple samples.
+- `--training.context_len`: The max allowed length of a sample. For non-varlen mode, this is equivalent to `seq_len`.
+- `--training.varlen`: Whether to conduct variable-length sequence training.
+- `--training.gradient_accumulation_steps`: Number of gradient accumulation steps.
+> [!WARNING]
+> The total number of sequences processed per step, referred to as `global_batch_size`, is calculated as batch_size × gradient_accumulation_steps × num_gpus.
+> Each step processes `global_batch_size * seq_len` tokens.
+> Monitor the value of `global_batch_size`, `warmup_steps`, and `steps` carefully when modifying any of the hyperparameters!
+For a detailed explanation of all parameters, run:
+```sh
+bash train.sh -h
+```
+<details>
+<summary>Usage</summary>
+```py
+options:
+  -h, --help            show this help message and exit
+  --job.config_file JOB.CONFIG_FILE
+                        Job config file
+  --job.dump_folder JOB.DUMP_FOLDER
+                        Folder to dump job outputs
+  --job.description JOB.DESCRIPTION
+                        Description of the job
+  --job.use_for_integration_test
+                        Add this config to the integration test suite
+  --job.print_args      Print the args to terminal
+  --model.config MODEL.CONFIG
+                        Path to the model config
+  --model.norm_type MODEL.NORM_TYPE
+                        Type of layer normalization to use [layernorm,
+                        np_layernorm, rmsnorm, fused_rmsnorm]
+  --model.tokenizer_path MODEL.TOKENIZER_PATH
+                        Tokenizer path
+  --profiling.enable_profiling
+                        Whether to enable pytorch profiler
+  --profiling.save_traces_folder PROFILING.SAVE_TRACES_FOLDER
+                        Trace files location
+  --profiling.profile_freq PROFILING.PROFILE_FREQ
+                        How often to collect profiler traces, in iterations
+  --profiling.enable_memory_snapshot
+                        Whether to dump memory snapshot
+  --profiling.save_memory_snapshot_folder PROFILING.SAVE_MEMORY_SNAPSHOT_FOLDER
+                        Memory snapshot files location
+  --optimizer.name OPTIMIZER.NAME
+                        Optimizer to use
+  --optimizer.eps OPTIMIZER.EPS
+                        Epsilon value for the optimizer.
+  --optimizer.fused     Whether the fused implementation(CUDA only) is used.
+  --optimizer.scheduler {wsd,cosine,linear}
+                        Scheduler to use. Currently supported: wsd, cosine,
+                        and linear.
+  --optimizer.lr OPTIMIZER.LR
+                        Learning rate to use
+  --optimizer.min_lr_ratio OPTIMIZER.MIN_LR_RATIO
+                        Min lr ratio for lr scheduler
+  --optimizer.early_step_in_backward
+                        Whether to apply optimizer in the backward. Caution,
+                        optimizer_in_backward is not compatible with gradients
+                        clipping, users should not call
+                        register_post_accumulate_grad_hook after the optimizer
+                        is built.
+  --training.batch_size TRAINING.BATCH_SIZE
+                        Batch size
+  --training.seq_len TRAINING.SEQ_LEN
+                        Sequence length
+  --training.context_len TRAINING.CONTEXT_LEN
+                        Max length allowed for each sequence
+  --training.varlen     Whether to take sequences of variable length as input
+  --training.warmup_steps TRAINING.WARMUP_STEPS
+                        Steps for lr scheduler warmup, normally 1/5 of
+                        --training.steps
+  --training.gradient_accumulation_steps TRAINING.GRADIENT_ACCUMULATION_STEPS
+                        Number of steps to accumulate gradients before
+                        updating parameters
+  --training.steps TRAINING.STEPS
+                        How many train steps to run
+  --training.max_norm TRAINING.MAX_NORM
+                        Max norm for gradient clipping
+  --training.skip_nan_inf
+                        Skip batch updates when NaN or INF gradients are
+                        encountered during training
+  --training.dataset TRAINING.DATASET
+                        Dataset to use, with comma separated values
+  --training.dataset_name TRAINING.DATASET_NAME
+                        The name of the dataset config, with comma separated
+                        values if provided
+  --training.dataset_split TRAINING.DATASET_SPLIT
+                        Dataset split to use, with comma separated values if
+                        provided
+  --training.data_dir TRAINING.DATA_DIR
+                        Data dirs to use, with comma separated values if
+                        provided
+  --training.data_files TRAINING.DATA_FILES
+                        Data files to use, with comma separated values if
+                        provided
+  --training.data_probs TRAINING.DATA_PROBS
+                        Data sampling probabilities, with comma separated
+                        values if provided
+  --training.streaming  Whether to load dataset in streaming mode, used for
+                        huge dataset
+  --training.num_workers TRAINING.NUM_WORKERS
+                        Number of subprocesses to use for data loading. 0
+                        means that the data will be loaded in the main
+                        process.
+  --training.prefetch_factor TRAINING.PREFETCH_FACTOR
+                        Number of batches loaded in advance by each worker. 2
+                        means there will be a total of 2 * num_workers batches
+                        prefetched across all workers.
+  --training.data_parallel_replicate_degree TRAINING.DATA_PARALLEL_REPLICATE_DEGREE
+                        The `data_parallel_replicate_degree` argument
+                        specifies the degree of data parallelism for weight
+                        replication. When this value is greater than 1,
+                        weights will be replicated across
+                        `data_parallel_replicate_degree` ranks. If
+                        `data_parallel_shard_degree` is also greater than 1,
+                        the parallelism method used is HSDP (Hybrid Sharded
+                        Data Parallelism). Otherwise, the parallelism method
+                        used is DDP (Distributed Data Parallelism). 1 means
+                        disabled.
+  --training.data_parallel_shard_degree TRAINING.DATA_PARALLEL_SHARD_DEGREE
+                        The `data_parallel_shard_degree` argument specifies
+                        the degree of data parallelism for weight sharding.
+                        When this value is greater than 1, weights will be
+                        sharded across `data_parallel_shard_degree` ranks. If
+                        `data_parallel_replicate_degree` is also greater than
+                        1, the parallelism method used is HSDP (Hybrid Sharded
+                        Data Parallelism). Otherwise, the parallelism method
+                        used is FSDP (Fully Sharded Data Parallelism). -1
+                        means leftover ranks will be used (After
+                        DP_REPLICATE/SP/PP). Note that only
+                        `data_parallel_shard_degree` can be negative. 1 means
+                        disabled.
+  --training.enable_cpu_offload
+                        Whether to apply CPU offloading of parameters,
+                        gradients, and optimizer states in FSDP
+  --training.tensor_parallel_degree TRAINING.TENSOR_PARALLEL_DEGREE
+                        Tensor Parallelism degree. 1 means disabled.
+  --training.disable_loss_parallel
+                        Whether to apply loss parallel when sequence parallel
+                        is enabled
+  --training.mixed_precision_param {bfloat16,float32}
+                        torch dtype to use for parameters when applying mixed
+                        precision via FSDP. This feature only takes effect
+                        when data_parallel_shard_degree > 1
+  --training.mixed_precision_reduce {float32}
+                        torch dtype to use for reductions when applying mixed
+                        precision via FSDP. This feature only takes effect
+                        when data_parallel_shard_degree > 1
+  --training.compile    Whether to compile the model
+  --training.gc_freq TRAINING.GC_FREQ
+                        Python garbage control scheduling interval, in steps
+  --training.seed TRAINING.SEED
+                        Choose the base RNG seed used for training
+  --training.deterministic
+                        Use deterministic algorithms wherever possible, may be
+                        slower
+  --metrics.log_freq METRICS.LOG_FREQ
+                        How often to log metrics to TensorBoard, in iterations
+  --metrics.enable_tensorboard
+                        Whether to log metrics to TensorBoard
+  --metrics.disable_color_printing
+                        Whether to disable color printing in logs
+  --metrics.save_tb_folder METRICS.SAVE_TB_FOLDER
+                        Folder to dump TensorBoard states
+  --metrics.rank_0_only
+                        Whether to save TensorBoard metrics only for rank 0 or
+                        for all ranks. When pipeline_parallel_degree is > 1,
+                        this option uses the 0th rank of the last stage
+                        pipeline group, which is the only stage that computes
+                        loss metrics.
+  --metrics.enable_wandb
+                        Whether to log metrics to Weights & Biases
+  --experimental.enable_async_tensor_parallel
+                        Whether to apply async tensor parallel (currently only
+                        effective when compile is enabled)
+  --experimental.pipeline_parallel_degree EXPERIMENTAL.PIPELINE_PARALLEL_DEGREE
+                        Pipeline Parallelism degree, or number of ranks. 1
+                        means disabled. If using looped schedules, this still
+                        specifies the number of physical ranks, not the number
+                        of stages. Stages per rank are inferred from split
+                        points degree, and schedule.
+  --experimental.pipeline_parallel_split_points EXPERIMENTAL.PIPELINE_PARALLEL_SPLIT_POINTS [EXPERIMENTAL.PIPELINE_PARALLEL_SPLIT_POINTS ...]
+                        Specify comma-separated names of modules to use as the
+                        beginning of a split point. e.g. "layers.0,layers.2"
+                        will cause the model to be split into 3 stages, the
+                        first containing all the layers up to layers.0, the
+                        second containing layers.0 and up to layers.2, the
+                        third containing layers.2 and all the remaining
+                        layers. Note: fully-automated splitting may be enabled
+                        in the future, but currently the split points must be
+                        specified manually.
+  --experimental.pipeline_parallel_schedule EXPERIMENTAL.PIPELINE_PARALLEL_SCHEDULE
+                        Specify the Pipeline Parallel schedule to use. The
+                        supported schedules are: https://github.com/pytorch/py
+                        torch/blob/de4c2a3b4e89d96334dc678d1c3f2ae51a6630a0/to
+                        rch/distributed/pipelining/schedules.py#L2161. The
+                        schedule must be compatible with the split points and
+                        stages_per_rank. Looped schedules (e.g.
+                        Interleaved1F1B) require specifying
+                        pipeline_parallel_degree = number of ranks, and
+                        split_points = number of stages - 1
+  --experimental.pipeline_parallel_schedule_csv EXPERIMENTAL.PIPELINE_PARALLEL_SCHEDULE_CSV
+                        Specify the path to the pipeline parallel schedule csv
+                        file to use. The pipeline_parallel_schedule argument
+                        must be either PipelineScheduleSingle,
+                        PipelineScheduleMulti, or _PipelineScheduleRuntime.
+  --experimental.pipeline_parallel_microbatches EXPERIMENTAL.PIPELINE_PARALLEL_MICROBATCHES
+                        How many microbatches to split the global training
+                        batch into when using pipeline parallelism. The global
+                        training batch size must be evenly divisible by the
+                        number of microbatches. The default value will be the
+                        number of pipeline stages, if unspecified.
+  --experimental.enable_compiled_autograd
+                        Enable CompiledAutograd to compile the backward.
+  --experimental.context_parallel_degree EXPERIMENTAL.CONTEXT_PARALLEL_DEGREE
+                        Context parallelism degree. 1 means disabled.
+  --experimental.context_parallel_rotate_method EXPERIMENTAL.CONTEXT_PARALLEL_ROTATE_METHOD
+                        The collective to use in context parallel SDPA for kv
+                        shards exchange. 'allgather' means to all-gather all
+                        kv shards on ranks after the first sub-SDPA
+                        computation, 'alltoall' means to all-to-all shuffle
+                        the kv shards. The default value is 'allgather'.
+  --checkpoint.enable_checkpoint
+                        Whether to enable checkpoint
+  --checkpoint.folder CHECKPOINT.FOLDER
+                        The folder to store the checkpoints. When
+                        enable_checkpoint is set to true, checkpoints will be
+                        in {--job.dump_folder}/{--checkpoint.folder}.
+  --checkpoint.interval_type CHECKPOINT.INTERVAL_TYPE
+                        Checkpointing interval unit of measurement ['step',
+                        'seconds']
+  --checkpoint.interval CHECKPOINT.INTERVAL
+                        Checkpointing interval, in steps or seconds depending
+                        on --checkpoint.interval_type
+  --checkpoint.model_weights_only
+                        When model_weights_only=True, only model weights will
+                        be saved at the end of training. With this,
+                        checkpoints can be loaded using `torch.load(...,
+                        weights_only=True)` after conversion. When
+                        model_weights_only=False, the full checkpoint will be
+                        saved. A full checkpoint includes model, optimizer and
+                        train_state, which can be used to resume training. The
+                        default value is false.
+  --checkpoint.export_dtype {float16,bfloat16,float32}
+                        Converts to the specified precision when training
+                        completes and model_weights_only=true. Currently
+                        supports float32, float16, and bfloat16. The default
+                        value is float32.
+  --checkpoint.create_seed_checkpoint
+                        Initializes the full model without applying
+                        parallelisms, and then saves it as a seed checkpoint.
+                        Note: requires user to call train.py without
+                        specifying any parallelisms, e.g. NGPU=1. Could be
+                        implemented as a separate script, but this way shares
+                        more code.
+  --checkpoint.async_mode CHECKPOINT.ASYNC_MODE
+                        Which async checkpoint mode to use. Currently there
+                        are 3 different modes. 1. "disabled": synchronized
+                        checkpointing will be used. 2. "async":
+                        torch.distributed.checkpoint.async_save will be used.
+                        3. "async_with_pinned_mem": this option utilizes a
+                        dedicated pinned memory space and creates a separate
+                        process for faster GPU->CPU transfer performance and
+                        eliminating GIL contention. The cost is increased CPU
+                        memory usage. If insufficient CPU memory is available,
+                        performance may degrade due to memory paging. For most
+                        users, "async" should suffice as the performance
+                        overhead is typically small (on the order of tens of
+                        seconds) compared to checkpointing frequency. This
+                        mode can be employed to pursue near-zero checkpointing
+                        times (e.g., < 1 second) given appropriate hardware
+                        support such as ample CPU memory and fast PCIe.
+                        "disabled" is the default mode.
+  --checkpoint.keep_latest_k CHECKPOINT.KEEP_LATEST_K
+                        Keeps only the latest k checkpoints, and purging older
+                        ones. If 0, keep all checkpoints. 0 is the default
+                        value.
+  --checkpoint.load_step CHECKPOINT.LOAD_STEP
+                        Load the checkpoint at the specified step. If -1, load
+                        the latest checkpoint.
+  --float8.enable_float8_linear
+                        If true, swaps `torch.nn.Linear` with `Float8Linear`.
+                        This feature requires you to install 'torchao' which
+                        can be found here: https://github.com/pytorch/ao
+  --float8.enable_fsdp_float8_all_gather
+                        Whether enable float8 all-gather in FSDP
+  --float8.precompute_float8_dynamic_scale_for_fsdp
+                        Whether precompute float8 scales dynamically for FSDP
+  --float8.scaling_type_input {dynamic,delayed}
+                        float8 scaling for input, dynamic (default) or delayed
+  --float8.scaling_type_weight FLOAT8.SCALING_TYPE_WEIGHT
+                        float8 scaling for weight, dynamic (default) or delayed
+  --float8.scaling_type_grad_output FLOAT8.SCALING_TYPE_GRAD_OUTPUT
+                        float8 scaling for grad output, dynamic (default) or delayed
+  --comm.init_timeout_seconds COMM.INIT_TIMEOUT_SECONDS
+                        Timeout for communication operations, during
+                        initialization and first train step.
+  --comm.train_timeout_seconds COMM.TRAIN_TIMEOUT_SECONDS
+                        Timeout for communication operations after the first
+                        train step -- usually a tighter bound than during
+                        initialization.
+  --comm.trace_buf_size COMM.TRACE_BUF_SIZE
+                        Flight recorder ring buffer size, >0 means recording
+                        by default, 0 means disabled
+  --memory_estimation.enabled
+                        Whether to estimate memory usage for FSDP
+  --memory_estimation.disable_fake_mode
+                        Whether to estimate memory under FakeTensorMode
+```
+</details>
+### Training with variable-length inputs
+When you set the `--training.varlen` flag, you're enabling a more efficient training method that packs multiple documents together into a single long sequence, eliminating the need for padding.
+This is particularly useful when your dataset contains documents of varying lengths.
+Let's break down how `--training.seq_len` and `--training.context_len` work in this mode.
+* `--training.seq_len` (Packed Sequence Length): This is the total length of the final sequence fed to the model on one device. Instead of processing one document at a time, the dataloader takes multiple documents (each split to sequences no longer than `context_len`), concatenates them end-to-end, and creates a single long sequence of length `seq_len`.
+* `--training.context_len` (Sample Length): This parameter defines the maximum number of tokens for a single document or sample. If a document from the dataset is longer than `context_len`, it will be split into chunks of at most `context_len` tokens. For example, if `--training.context_len` is set to 4,096, a document with 5,000 tokens will be cut down to its first 4,096 tokens, with the remaining 904 tokens treated as another independent sequence, while a document with 3,000 tokens remains unchanged.
+### Training with `torch.compile`
+Starting from `torch 2.0`, `torch.compile` has been introduced as a new feature to seamlessly accelerate training processes.
+In `flame`, one can simply enable `torch.compile` by adding the `--training.compile` flag to the training command.
+However, `fla` has integrated numerous fused kernels for acceleration, which may potentially conflict with `torch.compile`.
+We are actively working on resolving these issues to make compilation transparent to users.
+In the meantime, please ensure you are using the latest dependencies.
+Specifically, **we recommend using `torch>=2.6` and `triton>=3.0`**.
+### Training with multiple datasets
+If you wish to train a model with all-round capabilities (e.g., code, math, and multilingual ability), it's necessary to train on multiple datasets.
+`flame` allows training with multiple datasets easily.
+For example, you can specify the following arguments to train on 6 datasets with different proportions:
+```sh
+  --training.dataset HuggingFaceFW/fineweb-edu,opencsg/Fineweb-Edu-Chinese-V2.1,OpenCoder-LLM/opc-fineweb-code-corpus,math-ai/AutoMathText,EleutherAI/proof-pile-2,OpenCoder-LLM/opc-fineweb-math-corpus   \
+  --training.data_probs 0.6,0.15,0.15,0.014,0.058,0.028     \
+```
+### ~Finalizing training~
+> [!NOTE]
+> We have done this conversion automatically in the training script since our latest updates.
+Once training is complete, you may want to convert the distributed checkpoints (DCPs) into the 🤗 format for broader use.
+To facilitate this, we provide a straightforward conversion script:
+```sh
+python -m flame.utils.convert_dcp_to_hf --path <path_to_model> --step <step> --config <path_to_config> --tokenizer <path_to_tokenizer>
+```
+After this, your model will be in the 🤗 format, ready to be shared or deployed.
+You can then easily publish your model using the `huggingface_hub` for wider accessibility.
+### Continual training
+If you wish to build upon a strong pre-trained model (in 🤗 format) and continue training, we also offer a script to convert the 🤗 format model back into DCP format.
+This allows you to seamlessly resume training with `flame`.
+```sh
+python -m flame.utils.convert_hf_to_dcp --model <path_to_hf> --checkpoint <path_to_dcp/checkpoint/step-0>
+```
+Here, `<path_to_dcp>` is the directory where your distributed checkpoints will be stored.
+The checkpoint is intentionally saved at `<step-0>` within the checkpoint folder to ensure it is loadable by `flame` during the initial training step, similar to how a seed checkpoint is handled.
+Once the conversion is complete, you can proceed with training using `flame` as usual, continuing from where the pretrained model left off.
+## Multi-node training
+If you have access to multi-node GPUs, consider leveraging them for optimal performance.
+This process is straightforward and well-documented in the PyTorch [docs](https://pytorch.org/docs/stable/elastic/run.html).
+To set up multi-node training:
+* Set the environment variables `MASTER_ADDR=<ip>` and `MASTER_PORT=<port>` before running the training script across all nodes.
+* If you're using a job scheduler like Slurm, it will handle these variables for you.
+`torchtitan` provides a [Slurm script](https://github.com/pytorch/torchtitan/blob/main/multinode_trainer.slurm) for multi-node training, which you can use as a reference or starting point.
+## Custom models
+`flame` supports custom model architectures through seamless integration with the Hugging Face `transformers` library. To add your own model:
+1. Create a new model directory under `custom_models/` (see `custom_models/sba` for a complete example)
+2. Implement your model classes and configuration:
+   - Define a config class inheriting from `PretrainedConfig` (see `custom_models/sba/config_sba.py` for an example)
+   - Create model classes inheriting from `PreTrainedModel` (see `custom_models/sba/modeling_sba.py` for an example)
+3. Register your models in `__init__.py`:
+   - Import your model classes and config classes
+   - Register your models with the `AutoModelForCausalLM`, `AutoModel` and `AutoConfig` classes (see `custom_models/sba/__init__.py` for an example)
+4. Create a config file for your custom model; you only need to set `model_type` to the name you registered for your custom model (example: `configs/sba_340m.json`).
+5. Training is extremely simple: just use the `flame/train.py` script (e.g., via `train.sh`) to train your custom model.
+## Citation
+If you find `flame` helpful for your work, please consider citing it.
+```bib
+@software{yang2025flame,
+  title  = {Flame: Flash Language Modeling Made Easy},
+  author = {Zhang, Yu and Yang, Songlin},
+  url    = {https://github.com/fla-org/flame},
+  month  = jan,
+  year   = {2025}
+}
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{#- Chat template: renders `messages` (plus optional `tools`) into a prompt whose turns are delimited by im_start / im_end markers. -#}
+{#- System preamble: with tools, emit one system turn holding the leading system message (if any) followed by the tool signatures and the tool-call format; without tools, emit the leading system message only. -#}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Walk the messages in reverse to record, in ns.last_query_index, the index of the last user message with string content that is not wrapped in tool_response tags. Defaults to the last message index if none is found. -#}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Main pass: render every message in order. -#}
+{%- for message in messages %}
+    {#- Non-string content is rendered as an empty string. -#}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {#- User turns, and system turns other than the first message, are emitted verbatim. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Reasoning comes from message.reasoning_content when it is a string; otherwise it is split out of the content at the closing think tag. -#}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- The think block is only re-emitted for assistant turns after the last user query, and then only for the final message or when reasoning is non-empty; earlier turns keep just the answer text. -#}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {#- Tool calls are appended as JSON objects inside tool_call tags; a newline separates them from preceding content and from each other. -#}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {#- Accept both the nested form (tool_call.function) and the flat form. -#}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {#- String arguments are emitted as-is; anything else is serialized with tojson. -#}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages are grouped into a single user turn: the turn is opened before the first of a run and closed after the last. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Generation prompt: open an assistant turn; when enable_thinking is explicitly false, pre-fill an empty think block. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "architectures": [
+    "GSAForCausalLM"
+  ],
+  "attn": null,
+  "bos_token_id": 151643,
+  "clamp_max": null,
+  "clamp_min": null,
+  "conv_size": 4,
+  "dtype": "bfloat16",
+  "elementwise_affine": false,
+  "eos_token_id": 151645,
+  "expand_k": 1,
+  "expand_v": 1,
+  "feature_map": "swish",
+  "fuse_cross_entropy": true,
+  "fuse_linear_cross_entropy": false,
+  "fuse_norm": true,
+  "fuse_swiglu": true,
+  "gate_logit_normalizer": 8,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 5120,
+  "initializer_range": 0.02,
+  "intermediate_size": 17408,
+  "max_position_embeddings": 40960,
+  "model_type": "gsa",
+  "norm_eps": 1e-06,
+  "num_heads": 40,
+  "num_hidden_layers": 40,
+  "num_kv_heads": 8,
+  "num_slots": 256,
+  "rope_theta": 1000000,
+  "share_conv_kernel": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "use_l2warp": false,
+  "use_norm": true,
+  "use_output_gate": true,
+  "use_rope": false,
+  "use_short_conv": false,
+  "vocab_size": 151936
+}

configs/delta_net_1B.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "attn": null,
+    "attn_mode": "chunk",
+    "bos_token_id": 1,
+    "conv_size": 4,
+    "eos_token_id": 2,
+    "expand_k": 1,
+    "expand_v": 1,
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "model_type": "delta_net",
+    "norm_eps": 1e-06,
+    "num_heads": 16,
+    "num_hidden_layers": 24,
+    "pad_token_id": 2,
+    "qk_activation": "silu",
+    "qk_norm": "l2",
+    "tie_word_embeddings": false,
+    "use_beta": true,
+    "use_cache": true,
+    "use_gate": false,
+    "use_output_norm": true,
+    "use_short_conv": true
+}

configs/delta_net_340M.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+    "attn_mode": "chunk",
+    "bos_token_id": 1,
+    "conv_size": 4,
+    "eos_token_id": 2,
+    "expand_k": 1,
+    "expand_v": 1,
+    "fuse_cross_entropy": true,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "model_type": "delta_net",
+    "norm_eps": 1e-06,
+    "num_heads": 8,
+    "num_hidden_layers": 24,
+    "qk_activation": "silu",
+    "qk_norm": "l2",
+    "tie_word_embeddings": false,
+    "use_beta": true,
+    "use_cache": true,
+    "use_gate": false,
+    "use_output_norm": true,
+    "use_short_conv": true
+}

configs/gated_deltanet_1B.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+    "attn_mode": "chunk",
+    "bos_token_id": 1,
+    "conv_size": 4,
+    "eos_token_id": 2,
+    "expand_v": 2,
+    "fuse_cross_entropy": true,
+    "head_dim": 256,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "model_type": "gated_deltanet",
+    "norm_eps": 1e-06,
+    "num_heads": 6,
+    "num_hidden_layers": 21,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "use_gate": true,
+    "use_short_conv": true
+}

configs/gated_deltanet_340M.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+    "attn_mode": "chunk",
+    "bos_token_id": 1,
+    "conv_size": 4,
+    "eos_token_id": 2,
+    "expand_v": 2,
+    "fuse_cross_entropy": true,
+    "head_dim": 256,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "model_type": "gated_deltanet",
+    "norm_eps": 1e-06,
+    "num_heads": 6,
+    "num_hidden_layers": 21,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "use_gate": true,
+    "use_short_conv": true
+}

configs/gay_14B.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "bos_token_id": 151643,
+    "conv_size": 4,
+    "eos_token_id": 151645,
+    "expand_k": 1,
+    "expand_v": 1,
+    "elementwise_affine": false,
+    "feature_map": "swish",
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "gate_logit_normalizer": 8,
+    "hidden_act": "swish",
+    "hidden_size": 5120,
+    "initializer_range": 0.02,
+    "intermediate_size": 17408,
+    "max_position_embeddings": 40960,
+    "model_type": "gsa",
+    "num_heads": 40,
+    "num_hidden_layers": 40,
+    "num_kv_heads": 8,
+    "num_slots": 256,
+    "norm_eps": 1e-06,
+    "share_conv_kernel": true,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_norm": true,
+    "use_output_gate": true,
+    "use_rope": false,
+    "use_short_conv": false,
+    "vocab_size": 151936
+}

configs/gay_1B.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+    "bos_token_id": 151643,
+    "conv_size": 4,
+    "eos_token_id": 151645,
+    "expand_k": 1,
+    "expand_v": 1,
+    "elementwise_affine": false,
+    "feature_map": "swish",
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "gate_logit_normalizer": 8,
+    "hidden_act": "swish",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "max_position_embeddings": 40960,
+    "model_type": "gsa",
+    "num_heads": 16,
+    "num_hidden_layers": 28,
+    "num_kv_heads": 8,
+    "num_slots": 256,
+    "norm_eps": 1e-06,
+    "share_conv_kernel": true,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_norm": true,
+    "use_output_gate": true,
+    "use_rope": false,
+    "use_short_conv": false,
+    "vocab_size": 151936
+}

configs/gayted_deltanet_1B.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+    "attn_mode": "chunk",
+    "bos_token_id": 151643,
+    "conv_size": 4,
+    "eos_token_id": 151645,
+    "expand_v": 1,
+    "fuse_cross_entropy": true,
+    "head_dim": 128,
+    "hidden_act": "swish",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "max_position_embeddings": 40960,
+    "model_type": "gated_deltanet",
+    "norm_eps": 1e-06,
+    "num_heads": 16,
+    "num_hidden_layers": 28,
+    "num_v_heads": 8,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_gate": true,
+    "use_short_conv": true,
+    "vocab_size": 151936
+}

configs/gla_340M.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "attn_mode": "chunk",
+  "bos_token_id": 1,
+  "clamp_min": null,
+  "eos_token_id": 2,
+  "expand_k": 0.5,
+  "expand_v": 1,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "model_type": "gla",
+  "num_heads": 4,
+  "num_hidden_layers": 24,
+  "norm_eps": 1e-06,
+  "tie_word_embeddings": false,
+  "use_cache": true,
+  "use_gk": true,
+  "use_gv": false,
+  "vocab_size": 32000
+}

configs/gla_7B.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+    "attn": null,
+    "attn_mode": "chunk",
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "expand_k": 0.5,
+    "expand_v": 1,
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 11008,
+    "model_type": "gla",
+    "norm_eps": 1e-06,
+    "num_heads": 16,
+    "num_hidden_layers": 32,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "use_gk": true,
+    "use_gv": false,
+    "use_output_gate": true,
+    "use_short_conv": false
+}

configs/gsa_1B.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "bos_token_id": 1,
+    "conv_size": 4,
+    "eos_token_id": 2,
+    "expand_k": 1,
+    "expand_v": 1,
+    "elementwise_affine": false,
+    "feature_map": "swish",
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "gate_logit_normalizer": 8,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "model_type": "gsa",
+    "num_heads": 4,
+    "num_hidden_layers": 24,
+    "num_slots": 64,
+    "norm_eps": 1e-06,
+    "share_conv_kernel": true,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "use_norm": true,
+    "use_output_gate": true,
+    "use_rope": false,
+    "use_short_conv": false
+}

configs/gsa_340M.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+    "bos_token_id": 1,
+    "conv_size": 4,
+    "eos_token_id": 2,
+    "expand_k": 1,
+    "expand_v": 1,
+    "elementwise_affine": false,
+    "feature_map": "swish",
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "gate_logit_normalizer": 4,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "model_type": "gsa",
+    "num_heads": 4,
+    "num_hidden_layers": 24,
+    "num_slots": 64,
+    "norm_eps": 1e-06,
+    "share_conv_kernel": true,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "use_norm": true,
+    "use_output_gate": true,
+    "use_rope": false,
+    "use_short_conv": false
+}

configs/hgrn2_340M.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+    "attn_mode": "chunk",
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "expand_ratio": 128,
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "model_type": "hgrn2",
+    "num_heads": 8,
+    "num_hidden_layers": 24,
+    "norm_eps": 1e-06,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "vocab_size": 32000
+}

configs/mamba2_1B.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "bos_token_id": 1,
+  "chunk_size": 256,
+  "conv_kernel": 4,
+  "eos_token_id": 2,
+  "expand": 2,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "norm_eps": 1e-05,
+  "model_type": "mamba2",
+  "n_groups": 1,
+  "num_hidden_layers": 48,
+  "pad_token_id": 0,
+  "rescale_prenorm_residual": true,
+  "residual_in_fp32": true,
+  "rms_norm": true,
+  "state_size": 128,
+  "tie_word_embeddings": false,
+  "time_step_floor": 0.0001,
+  "time_step_max": 0.1,
+  "time_step_min": 0.001,
+  "time_step_rank": 128,
+  "transformers_version": "4.50.1",
+  "use_bias": false,
+  "use_cache": true,
+  "use_conv_bias": true,
+  "vocab_size": 32000
+}

configs/mamba2_340M.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "bos_token_id": 1,
+  "chunk_size": 256,
+  "conv_kernel": 4,
+  "eos_token_id": 2,
+  "expand": 2,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "norm_eps": 1e-05,
+  "model_type": "mamba2",
+  "n_groups": 1,
+  "num_hidden_layers": 48,
+  "pad_token_id": 0,
+  "rescale_prenorm_residual": true,
+  "residual_in_fp32": true,
+  "rms_norm": true,
+  "state_size": 128,
+  "tie_word_embeddings": false,
+  "time_step_floor": 0.0001,
+  "time_step_max": 0.1,
+  "time_step_min": 0.001,
+  "time_step_rank": 128,
+  "transformers_version": "4.50.1",
+  "use_bias": false,
+  "use_cache": true,
+  "use_conv_bias": true,
+  "vocab_size": 32000
+}

configs/mamba_1B.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token_id": 1,
+  "conv_kernel": 4,
+  "eos_token_id": 2,
+  "expand": 2,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "model_type": "mamba",
+  "norm_eps": 1e-05,
+  "num_hidden_layers": 48,
+  "pad_token_id": 0,
+  "rescale_prenorm_residual": false,
+  "residual_in_fp32": false,
+  "state_size": 16,
+  "tie_word_embeddings": false,
+  "time_step_floor": 0.0001,
+  "time_step_init_scheme": "random",
+  "time_step_max": 0.1,
+  "time_step_min": 0.001,
+  "time_step_rank": 128,
+  "time_step_scale": 1.0,
+  "transformers_version": "4.50.1",
+  "use_bias": false,
+  "use_cache": true,
+  "use_conv_bias": true,
+  "vocab_size": 32000
+}

configs/mamba_340M.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token_id": 1,
+  "conv_kernel": 4,
+  "eos_token_id": 2,
+  "expand": 2,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "model_type": "mamba",
+  "norm_eps": 1e-05,
+  "num_hidden_layers": 48,
+  "pad_token_id": 0,
+  "rescale_prenorm_residual": false,
+  "residual_in_fp32": false,
+  "state_size": 16,
+  "tie_word_embeddings": false,
+  "time_step_floor": 0.0001,
+  "time_step_init_scheme": "random",
+  "time_step_max": 0.1,
+  "time_step_min": 0.001,
+  "time_step_rank": 128,
+  "time_step_scale": 1.0,
+  "transformers_version": "4.50.1",
+  "use_bias": false,
+  "use_cache": true,
+  "use_conv_bias": true,
+  "vocab_size": 32000
+}

configs/routmem_1.7B.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "bos_token_id": 151643,
+    "conv_size": 4,
+    "eos_token_id": 151645,
+    "expand_k": 1,
+    "expand_v": 1,
+    "elementwise_affine": false,
+    "feature_map": "swish",
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "gate_logit_normalizer": 8,
+    "hidden_act": "swish",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "max_position_embeddings": 40960,
+    "model_type": "routmem",
+    "num_heads": 16,
+    "num_hidden_layers": 28,
+    "num_kv_heads": 8,
+    "num_slots": 256,
+    "norm_eps": 1e-06,
+    "share_conv_kernel": true,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_norm": true,
+    "use_output_gate": true,
+    "use_rope": false,
+    "use_short_conv": false,
+    "vocab_size": 151936,
+    "add_gumbel_noise": true,
+    "router_score": "sigmoid",
+    "router_type": "lin"
+}

configs/routmem_14B.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+    "bos_token_id": 151643,
+    "conv_size": 4,
+    "eos_token_id": 151645,
+    "expand_k": 1,
+    "expand_v": 1,
+    "elementwise_affine": false,
+    "feature_map": "swish",
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "gate_logit_normalizer": 8,
+    "hidden_act": "swish",
+    "hidden_size": 5120,
+    "initializer_range": 0.02,
+    "intermediate_size": 17408,
+    "max_position_embeddings": 40960,
+    "model_type": "routmem",
+    "num_heads": 40,
+    "num_hidden_layers": 40,
+    "num_kv_heads": 8,
+    "num_slots": 256,
+    "norm_eps": 1e-06,
+    "share_conv_kernel": true,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": true,
+    "use_norm": true,
+    "use_output_gate": true,
+    "use_rope": false,
+    "use_short_conv": false,
+    "vocab_size": 151936,
+    "add_gumbel_noise": true,
+    "router_score": "sigmoid",
+    "router_type": "lin"
+}

configs/routmem_340M.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "bos_token_id": 1,
+    "conv_size": 4,
+    "eos_token_id": 2,
+    "expand_k": 1,
+    "expand_v": 1,
+    "elementwise_affine": false,
+    "feature_map": "swish",
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "gate_logit_normalizer": 4,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "model_type": "routmem",
+    "num_heads": 4,
+    "num_hidden_layers": 24,
+    "num_slots": 256,
+    "norm_eps": 1e-06,
+    "share_conv_kernel": true,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "use_norm": true,
+    "use_output_gate": true,
+    "use_rope": false,
+    "use_short_conv": false,
+    "bias_rmm": false,
+    "add_gumbel_noise": true,
+    "router_score": "sigmoid",
+    "router_type": "lin"
+}

configs/samba_1B.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+  "attn": {
+    "layers": [
+      1,
+      3,
+      5,
+      7,
+      9,
+      11,
+      13,
+      15,
+      17
+    ],
+    "num_heads": 18,
+    "num_kv_heads": 18,
+    "qkv_bias": false,
+    "rope_theta": 10000.0,
+    "window_size": 2048
+  },
+  "bos_token_id": 1,
+  "conv_kernel": 4,
+  "eos_token_id": 2,
+  "expand": 2,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "fuse_swiglu": true,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 2304,
+  "initializer_range": 0.02,
+  "intermediate_size": 4608,
+  "max_position_embeddings": 2048,
+  "model_type": "samba",
+  "norm_eps": 1e-05,
+  "num_hidden_layers": 18,
+  "pad_token_id": 0,
+  "rescale_prenorm_residual": false,
+  "residual_in_fp32": false,
+  "state_size": 16,
+  "tie_word_embeddings": false,
+  "time_step_floor": 0.0001,
+  "time_step_init_scheme": "random",
+  "time_step_max": 0.1,
+  "time_step_min": 0.001,
+  "time_step_rank": 144,
+  "time_step_scale": 1.0,
+  "transformers_version": "4.50.1",
+  "use_bias": false,
+  "use_cache": true,
+  "use_conv_bias": true,
+  "vocab_size": 32000
+}

configs/sba_340m.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "attention_bias": false,
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "hidden_act": "swish",
+    "hidden_size": 1024,
+    "initializer_range": 0.006,
+    "max_position_embeddings": 8192,
+    "model_type": "sba",
+    "num_heads": 16,
+    "num_hidden_layers": 24,
+    "norm_eps": 1e-06,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "vocab_size": 32000
+}

configs/transformer_1B.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+    "bos_token_id": 1,
+    "elementwise_affine": true,
+    "eos_token_id": 2,
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "fuse_swiglu": true,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": null,
+    "max_position_embeddings": 8192,
+    "model_type": "transformer",
+    "norm_eps": 1e-06,
+    "num_heads": 32,
+    "num_hidden_layers": 24,
+    "num_kv_heads": null,
+    "pad_token_id": 2,
+    "rope_theta": 10000.0,
+    "tie_word_embeddings": false
+}

configs/transformer_340M.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "attention_bias": false,
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "hidden_act": "swish",
+    "hidden_size": 1024,
+    "initializer_range": 0.02,
+    "max_position_embeddings": 8192,
+    "model_type": "transformer",
+    "num_heads": 16,
+    "num_hidden_layers": 24,
+    "norm_eps": 1e-06,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "vocab_size": 32000
+}

configs/transformer_7B.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+    "attention_bias": false,
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "hidden_act": "swish",
+    "hidden_ratio": 4,
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 14336,
+    "model_type": "transformer",
+    "norm_eps": 1e-06,
+    "num_heads": 32,
+    "num_hidden_layers": 32,
+    "num_kv_heads": 8,
+    "rope_theta": 10000.0,
+    "tie_word_embeddings": false,
+    "use_cache": true,
+    "window_size": null
+}

flame/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __version__ = "0.1.0"

flame/components/__init__.py ADDED Viewed

File without changes

flame/components/checkpoint.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from dataclasses import dataclass, field
+from datetime import timedelta
+from io import BytesIO
+from typing import Any, Dict, List
+import torch
+from torch.distributed.checkpoint.stateful import Stateful
+@dataclass
+class TrainState(Stateful):
+    step: int = 0
+    skipped_step: int = 0
+    token: int = 0
+    elapsed: timedelta = timedelta(0)
+    global_avg_losses: List[float] = field(default_factory=list)
+    global_max_losses: List[float] = field(default_factory=list)
+    log_steps: List[int] = field(default_factory=list)
+    def state_dict(self) -> Dict[str, Any]:
+        # Only checkpoint global_avg_losses and global_max_losses per log frequency
+        # to avoid sync overhead in every iteration.
+        global_avg_losses_bytes = BytesIO()
+        torch.save(self.global_avg_losses, global_avg_losses_bytes)
+        global_max_losses_bytes = BytesIO()
+        torch.save(self.global_max_losses, global_max_losses_bytes)
+        log_steps_bytes = BytesIO()
+        torch.save(self.log_steps, log_steps_bytes)
+        return {
+            "step": torch.tensor(self.step, dtype=torch.int32),
+            "skipped_step": torch.tensor(self.skipped_step, dtype=torch.int32),
+            "token": torch.tensor(self.token, dtype=torch.int64),
+            "elapsed": self.elapsed,
+            "global_avg_losses": global_avg_losses_bytes,
+            "global_max_losses": global_max_losses_bytes,
+            "log_steps": log_steps_bytes,
+        }
+    def load_state_dict(self, state_dict) -> None:
+        self.step = state_dict["step"].item()
+        self.skipped_step = state_dict.get("skipped_step", 0).item()
+        self.token = state_dict["token"].item()
+        self.elapsed = state_dict["elapsed"]
+        state_dict["global_avg_losses"].seek(0)
+        self.global_avg_losses = torch.load(
+            state_dict["global_avg_losses"], weights_only=False
+        )
+        state_dict["global_max_losses"].seek(0)
+        self.global_max_losses = torch.load(
+            state_dict["global_max_losses"], weights_only=False
+        )
+        state_dict["log_steps"].seek(0)
+        self.log_steps = torch.load(state_dict["log_steps"], weights_only=False)

flame/config_manager.py ADDED Viewed

	@@ -0,0 +1,960 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import argparse
+import sys
+from collections import defaultdict
+from typing import Tuple
+import torch
+try:
+    import tomllib
+except ModuleNotFoundError:
+    import tomli as tomllib
+from torchtitan.tools.logging import logger
+TORCH_DTYPE_MAP = {
+    "float16": torch.float16,
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+}
+def string_list(raw_arg):
+    """Comma-separated string list argument."""
+    return [s.strip() for s in raw_arg.split(",") if s.strip()]
+def check_string_list_argument(args_dict: dict[str, any], fullargname: str):
+    section, name = fullargname.split(".")
+    # Split string list which are still raw strings.
+    if (
+        section in args_dict
+        and name in args_dict[section]
+        and isinstance(args_dict[section][name], str)
+    ):
+        sec = args_dict[section]
+        sec[name] = string_list(sec[name])
+class JobConfig:
+    """
+    A helper class to manage the train configuration.
+    Semantics:
+    - Default config is loaded from a toml file. If no toml file is provided,
+    then the default config is loaded from argparse defaults.
+    - if toml file has missing keys, they are filled with argparse defaults.
+    - if additional explicit cmd args are provided in addition to the toml
+    file, they will override the toml config and the argparse defaults
+    precedence order: cmdline > toml > argparse default
+    Arg parsing semantics:
+    Each argument starts with <prefix>_ which is the section name in the toml file
+    followed by name of the option in the toml file. For ex,
+    model.name translates to:
+        [model]
+        name
+    in the toml file
+    """
+    def __init__(self):
+        self.args_dict = None
+        # main parser
+        self.parser = argparse.ArgumentParser(description="torchtitan arg parser.")
+        self.parser.add_argument(
+            "--job.config_file",
+            type=str,
+            default=None,
+            help="Job config file",
+        )
+        # job level configs
+        self.parser.add_argument(
+            "--job.dump_folder",
+            type=str,
+            default="./torchtitan/outputs",
+            help="Folder to dump job outputs",
+        )
+        self.parser.add_argument(
+            "--job.description",
+            type=str,
+            default="default job",
+            help="Description of the job",
+        )
+        self.parser.add_argument(
+            "--job.use_for_integration_test",
+            action="store_true",
+            help="Add this config to the integration test suite",
+        )
+        self.parser.add_argument(
+            "--job.print_args",
+            action="store_true",
+            help="Print the args to terminal",
+        )
+        # model configs
+        self.parser.add_argument(
+            "--model.name",
+            type=str,
+            default="fla",
+            help="Which model to train",
+        )
+        self.parser.add_argument(
+            "--model.config",
+            type=str,
+            default="fla-hub/transformer-1.3B-100B",
+            help="Path to the model config",
+        )
+        self.parser.add_argument(
+            "--model.tokenizer_path",
+            type=str,
+            default="fla-hub/transformer-1.3B-100B",
+            help="Tokenizer path",
+        )
+        self.parser.add_argument(
+            "--model.converters",
+            type=string_list,
+            nargs="+",
+            default=[],
+            help="""
+                Comma separated list of converters to apply to the model.
+                For instance, the `float8` converter swaps `torch.nn.Linear`
+                with `Float8Linear`. This feature requires you to install 'torchao'
+                which can be found here: https://github.com/pytorch/ao
+            """,
+        )
+        self.parser.add_argument(
+            "--model.print_after_conversion",
+            action="store_true",
+            help="""
+            If true, model definition will be printed to stdout after all model
+            converters have been applied.
+            """,
+        )
+        # profiling configs
+        self.parser.add_argument(
+            "--profiling.enable_profiling",
+            action="store_true",
+            help="Whether to enable pytorch profiler",
+        )
+        self.parser.add_argument(
+            "--profiling.save_traces_folder",
+            type=str,
+            default="profile_traces",
+            help="Trace files location",
+        )
+        self.parser.add_argument(
+            "--profiling.profile_freq",
+            type=int,
+            default=10,
+            help="How often to collect profiler traces, in iterations",
+        )
+        self.parser.add_argument(
+            "--profiling.enable_memory_snapshot",
+            action="store_true",
+            help="Whether to dump memory snapshot",
+        )
+        self.parser.add_argument(
+            "--profiling.save_memory_snapshot_folder",
+            type=str,
+            default="memory_snapshot",
+            help="Memeory snapshot files location",
+        )
+        # optimizer configs
+        self.parser.add_argument(
+            "--optimizer.name", type=str, default="AdamW", help="Optimizer to use"
+        )
+        self.parser.add_argument(
+            "--optimizer.eps",
+            type=float,
+            default=1e-8,
+            help="Epsilon value for the optimizer.",
+        )
+        self.parser.add_argument(
+            "--optimizer.lr", type=float, default=8e-4, help="Learning rate to use"
+        )
+        self.parser.add_argument(
+            "--optimizer.beta1", type=float, default=0.9,
+            help="Exponential moving average hyperparameters to use"
+        )
+        self.parser.add_argument(
+            "--optimizer.beta2", type=float, default=0.95,
+            help="Exponential moving average hyperparameters to use"
+        )
+        self.parser.add_argument(
+            "--optimizer.weight_decay", type=float, default=0.1,
+            help="Weight decay to use"
+        )
+        self.parser.add_argument(
+            "--optimizer.implementation",
+            type=str,
+            default="fused",
+            choices=["for-loop", "foreach", "fused"],
+            help="""
+            Specify which optimizer implementation to use:
+            - 'fused': Use fused implementation (CUDA only) for best performance.
+            - 'foreach': Use some horizontal fusion of tensors for better performance.
+            - 'for-loop': Use the default implementation for the optimizer (slowest).
+            - more info: https://pytorch.org/docs/stable/optim.html
+            """,
+        )
+        self.parser.add_argument(
+            "--optimizer.early_step_in_backward",
+            action="store_true",
+            help="""
+            Whether to apply optimizer in the backward. Caution, optimizer_in_backward
+            is not compatible with gradients clipping, users should not call
+            register_post_accumulate_grad_hook after the optimizer is built.""",
+        )
+        # lr scheduler configs
+        self.parser.add_argument(
+            "--lr_scheduler.warmup_steps",
+            type=int,
+            default=200,
+            help="Steps for lr scheduler warmup, normally 1/5 of --training.steps",
+        )
+        self.parser.add_argument(
+            "--lr_scheduler.decay_ratio",
+            type=float,
+            default=None,
+            help="""
+            Controls the proportion of the training steps allocated to the learning rate decay phase.
+            If `None`, the learning rate will begin decaying immediately after the warmup period.
+            Otherwise, the learning rate will remain stable after the warmup period and
+            only start decaying during the last `decay_ratio` portion of the total training steps.
+            This is known as the Warmup-Stable-Decay (WSD) schedule, as described in https://arxiv.org/abs/2404.06395.
+            """,
+        )
+        self.parser.add_argument(
+            "--lr_scheduler.decay_type",
+            type=str,
+            default="linear",
+            choices=["linear", "sqrt", "cosine"],
+            help="""
+            Learning rate decay type to use during training:
+            - 'linear': linearly decays learning rate from initial to final value
+            - 'sqrt': decays learning rate following a 1 minus square root curve
+            - 'cosine': smoothly decays learning rate following a cosine curve
+            """,
+        )
+        self.parser.add_argument(
+            "--lr_scheduler.lr_min",
+            type=float,
+            default=0.0,
+            help="""
+            Min lr ratio for lr scheduler.
+            If provided, the range of decay factor is scaled from 1 to `lr_min`
+            to ensure the learning rate does not drop below `optimizer.lr * lr_scheduler.lr_min`.
+            """,
+        )
+        # training configs
+        self.parser.add_argument(
+            "--training.batch_size", type=int, default=8, help="Batch size"
+        )
+        self.parser.add_argument(
+            "--training.seq_len", type=int, default=2048, help="Sequence length"
+        )
+        self.parser.add_argument(
+            "--training.context_len",
+            type=int,
+            default=2048,
+            help="Max length allowed for each sequence",
+        )
+        self.parser.add_argument(
+            "--training.varlen",
+            action="store_true",
+            help="Whether to take sequences of variable length as input",
+        )
+        self.parser.add_argument(
+            "--training.gradient_accumulation_steps",
+            type=int,
+            default=1,
+            help="Number of steps to accumulate gradients before updating parameters",
+        )
+        self.parser.add_argument(
+            "--training.steps",
+            type=int,
+            default=10000,
+            help="How many train steps to run",
+        )
+        self.parser.add_argument(
+            "--training.max_norm",
+            type=float,
+            default=1.0,
+            help="Max norm for gradient clipping",
+        )
+        self.parser.add_argument(
+            "--training.skip_nan_inf",
+            action="store_true",
+            help="Skip batch updates when NaN or INF gradients are encountered during training",
+        )
+        self.parser.add_argument(
+            "--training.dataset",
+            default="HuggingFaceFW/fineweb-edu",
+            help="Dataset to use, with comma separated values",
+        )
+        self.parser.add_argument(
+            "--training.dataset_name",
+            default=None,
+            help="The name of the dataset config, with comma separated values if provided",
+        )
+        self.parser.add_argument(
+            "--training.dataset_split",
+            default=None,
+            help="Dataset split to use, with comma separated values if provided",
+        )
+        self.parser.add_argument(
+            "--training.data_dir",
+            default=None,
+            help="Data dirs to use, with comma separated values if provided",
+        )
+        self.parser.add_argument(
+            "--training.data_files",
+            default=None,
+            help="Data files to use, with comma separated values if provided",
+        )
+        self.parser.add_argument(
+            "--training.data_probs",
+            default=None,
+            help="Data sampling probabilities, with comma separated values if provided",
+        )
+        self.parser.add_argument(
+            "--training.streaming",
+            action="store_true",
+            help="Whether to load dataset in streaming mode, used for huge dataset",
+        )
+        self.parser.add_argument(
+            "--training.num_workers",
+            type=int,
+            default=32,
+            help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
+        )
+        self.parser.add_argument(
+            "--training.prefetch_factor",
+            type=int,
+            default=2,
+            help="Number of batches loaded in advance by each worker."
+            "2 means there will be a total of 2 * num_workers batches prefetched across all workers.",
+        )
+        self.parser.add_argument(
+            "--training.data_parallel_replicate_degree",
+            type=int,
+            default=1,
+            help="""
+            The `data_parallel_replicate_degree` argument specifies the degree of
+            data parallelism for weight replication. When this value is greater
+            than 1, weights will be replicated across `data_parallel_replicate_degree`
+            ranks. If `data_parallel_shard_degree` is also greater than 1, the parallelism
+            method used is HSDP (Hybrid Sharded Data Parallelism). Otherwise, the
+            parallelism method used is DDP (Distributed Data Parallelism).
+            1 means disabled.""",
+        )
+        self.parser.add_argument(
+            "--training.data_parallel_shard_degree",
+            type=int,
+            default=-1,
+            help="""
+            The `data_parallel_shard_degree` argument specifies the degree of data
+            parallelism for weight sharding. When this value is greater than 1, weights
+            will be sharded across `data_parallel_shard_degree` ranks. If
+            `data_parallel_replicate_degree` is also greater than 1, the parallelism
+            method used is HSDP (Hybrid Sharded Data Parallelism).  Otherwise, the
+            parallelism method used is FSDP (Fully Sharded Data Parallelism).
+            -1 means leftover ranks will be used (After DP_REPLICATE/SP/PP). Note that
+            only `data_parallel_shard_degree` can be negative. 1 means disabled.""",
+        )
+        self.parser.add_argument(
+            "--training.enable_cpu_offload",
+            action="store_true",
+            help="""
+            Whether to apply CPU offloading of parameters, gradients, and optimizer states in FSDP""",
+        )
+        self.parser.add_argument(
+            "--training.tensor_parallel_degree",
+            type=int,
+            default=1,
+            help="Tensor Parallelism degree. 1 means disabled.",
+        )
+        self.parser.add_argument(
+            "--training.disable_loss_parallel",
+            action="store_true",
+            help="Whether to apply loss parallel when sequence parallel is enabled",
+        )
+        self.parser.add_argument(
+            "--training.fsdp_reshard_after_forward",
+            type=str,
+            default="default",
+            choices=["default", "always", "never"],
+            help="""
+            `reshard_after_forward` specifies the policy for applying `reshard_after_forward`
+            within an FSDP setup. `reshard_after_forward` controls parameter behavior after forward,
+            trading off memory and communication. See torch's `fully_shard` API for more documentation
+            on `reshard_after_forward`.
+            The supported policies include "default", "always" and "never":
+            - "default" applies default resharding behavior, implementing "smart defaults" for known optimal
+              scenarios.
+            - "always" will enable `reshard_after_forward` for all forward passes.
+            - "never" will disable `reshard_after_forward` for all forward passes.
+            """,
+        )
+        self.parser.add_argument(
+            "--training.mixed_precision_param",
+            type=str,
+            default="bfloat16",
+            choices=["bfloat16", "float32"],
+            help="""
+                torch dtype to use for parameters when applying mixed precision via fully_shard or torch.autocast.
+                This feature takes effect via fully_shard when data_parallel_shard_degree > 1 or
+                context_parallel_degree > 1; it takes effect via torch.autocast when data_replicate_degree >= 1
+                and no other parallelism is enabled, i.e. under DDP or single-device training.
+            """,
+        )
+        self.parser.add_argument(
+            "--training.mixed_precision_reduce",
+            type=str,
+            default="float32",
+            choices=["float32"],
+            help="""
+                torch dtype to use for reductions when applying mixed precision via FSDP.
+                This feature only takes effect when data_parallel_shard_degree > 1
+            """,
+        )
+        self.parser.add_argument(
+            "--training.compile",
+            action="store_true",
+            help="Whether to compile the model",
+        )
+        self.parser.add_argument(
+            "--training.gc_freq",
+            type=int,
+            default=50,
+            help="Python garbage control scheduling interval, in steps",
+        )
+        self.parser.add_argument(
+            "--training.seed",
+            type=int,
+            default=42,
+            help="Choose the base RNG seed used for training",
+        )
+        self.parser.add_argument(
+            "--training.deterministic",
+            action="store_true",
+            help="Use deterministic algorithms wherever possible, may be slower",
+        )
+        # metrics configs
+        self.parser.add_argument(
+            "--metrics.log_freq",
+            type=int,
+            default=10,
+            help="How often to log metrics to TensorBoard, in iterations",
+        )
+        self.parser.add_argument(
+            "--metrics.enable_tensorboard",
+            action="store_true",
+            help="Whether to log metrics to TensorBoard",
+        )
+        self.parser.add_argument(
+            "--metrics.disable_color_printing",
+            action="store_true",
+            help="Whether to disable color printing in logs",
+        )
+        self.parser.add_argument(
+            "--metrics.save_tb_folder",
+            type=str,
+            default="tb",
+            help="Folder to dump TensorBoard states",
+        )
+        self.parser.add_argument(
+            "--metrics.save_for_all_ranks",
+            action="store_true",
+            default=False,
+            help="""
+                Whether to save TensorBoard/Wandb metrics only for rank 0 or for all ranks.
+                When this option is False and pipeline_parallel_degree is > 1, the metrics
+                component uses the 0th rank of the last stage pipeline group, which is the
+                only stage that computes loss metrics.
+            """,
+        )
+        self.parser.add_argument(
+            "--metrics.enable_wandb",
+            action="store_true",
+            help="Whether to log metrics to Weights & Biases",
+        )
+        self.parser.add_argument(
+            "--experimental.enable_async_tensor_parallel",
+            action="store_true",
+            help="Whether to apply async tensor parallel (currently only effective when compile is enabled)",
+        )
+        self.parser.add_argument(
+            "--experimental.pipeline_parallel_degree",
+            type=int,
+            default=1,
+            help="""
+                Pipeline Parallelism degree, or number of ranks. 1 means disabled.
+                If using looped schedules, this still specifies the number of physical ranks, not the number
+                of stages.  Stages per rank are inferred from split points degree, and schedule.""",
+        )
+        self.parser.add_argument(
+            "--experimental.pipeline_parallel_split_points",
+            type=string_list,
+            nargs="+",
+            default=[],
+            help="""
+                Specify comma-separated names of modules to use as the beginning of a split point.
+                e.g. "layers.0,layers.2" will cause the model to be split into 3 stages,
+                the first containing all the layers up to layers.0,
+                the second containing layers.0 and up to layers.2,
+                the third containing layers.2 and all the remaining layers.
+                Note: fully-automated splitting may be enabled in the future,
+                but currently the split points must be specified manually.""",
+        )
+        self.parser.add_argument(
+            "--experimental.pipeline_parallel_schedule",
+            type=str,
+            default="1F1B",
+            help="""
+                Specify the Pipeline Parallel schedule to use. The supported schedules are:
+                https://github.com/pytorch/pytorch/blob/de4c2a3b4e89d96334dc678d1c3f2ae51a6630a0/torch/distributed/pipelining/schedules.py#L2161.
+                The schedule must be compatible with the split points and stages_per_rank.
+                Looped schedules (e.g. Interleaved1F1B) require specifying pipeline_parallel_degree = number of ranks,
+                and split_points = number of stages - 1
+                """,
+        )
+        self.parser.add_argument(
+            "--experimental.pipeline_parallel_schedule_csv",
+            type=str,
+            default="",
+            help="""
+                Specify the path to the pipeline parallel schedule csv file to use.
+                The pipeline_parallel_schedule argument must be either
+                PipelineScheduleSingle, PipelineScheduleMulti, or _PipelineScheduleRuntime.
+            """,
+        )
+        self.parser.add_argument(
+            "--experimental.pipeline_parallel_microbatches",
+            type=int,
+            default=None,
+            help="""
+                How many microbatches to split the global training batch into when using pipeline parallelism.
+                The global training batch size must be evenly divisible by the number of microbatches.
+                The default value will be the number of pipeline stages, if unspecified.
+            """,
+        )
+        self.parser.add_argument(
+            "--experimental.enable_compiled_autograd",
+            action="store_true",
+            help="Enable CompiledAutograd to compile the backward.",
+        )
+        self.parser.add_argument(
+            "--experimental.context_parallel_degree",
+            type=int,
+            default=1,
+            help="Context parallelism degree. 1 means disabled.",
+        )
+        self.parser.add_argument(
+            "--experimental.context_parallel_rotate_method",
+            type=str,
+            default="allgather",
+            help="""
+                The collective to use in context parallel SDPA for kv shards exchange.
+                'allgather' means to all-gather all kv shards on ranks after the first sub-SDPA computation,
+                'alltoall' means to all-to-all shuffle the kv shards.
+                The default value is 'allgather'.
+            """,
+        )
+        # I'm not particularly fond of this. Users can choose to write their own wrapper
+        # module and import TorchTitan training loop and execute it, which look cleaner.
+        # One reason to provide this option is to allow users to use the existing run script.
+        # While the script is pretty trivial now, we may add more logic when integrating
+        # with TorchFT.
+        # This option is subject to change and may be deleted in the future.
+        self.parser.add_argument(
+            "--experimental.custom_model_path",
+            type=str,
+            default="",
+            help="""
+                The --custom_model_path option allows to specify a custom path to a model module
+                that is not natively implemented within TorchTitan.
+                Acceptable values are the file system path to the module (e.g., my_models/model_x)
+                dotted import module  (e.g., some_package.model_x).
+            """,
+        )
+        # checkpointing configs
+        self.parser.add_argument(
+            "--checkpoint.enable_checkpoint",
+            action="store_true",
+            help="Whether to enable checkpoint",
+        )
+        self.parser.add_argument(
+            "--checkpoint.folder",
+            type=str,
+            default="checkpoint",
+            help="""
+                The folder to store the checkpoints.
+                When enable_checkpoint is set to true, checkpoints will be in {--job.dump_folder}/{--checkpoint.folder}.
+            """,
+        )
+        self.parser.add_argument(
+            "--checkpoint.initial_load_path", type=str, default=None,
+            help="""
+                This option specifies the path to the initial checkpoint to load, which is
+                particularly useful for resuming training from a previous run with a
+                different output path or when loading a checkpoint from a pre-trained model.
+                If the checkpoint folder for the current run is not empty,
+                located at {--job.dump_folder}/{--checkpoint.folder}, this option will be ignored.
+                This feature allows users to load an initial checkpoint from a different folder and
+                continue training, saving new checkpoints to the specified folder without affecting
+                the existing ones.
+                Note that the path should contain the full path to the checkpoint folder,
+                including the step number, if any; for example,
+                "//pre_train/checkpoints/llama3/llama3_8b/step_10000".
+                """
+        )
+        self.parser.add_argument(
+            "--checkpoint.initial_load_model_weights_only",
+            dest='checkpoint.initial_load_model_weights_only', action="store_true", default=True,
+            help="""
+                This option specifies if only the model weights should be loaded during the initial
+                checkpoint load. The option is only used when `initial_load_path` is specified, and
+                only applies to a model_weights_only checkpoint. Loading a periodic checkpoint
+                may lead to unexpected behavior if this option is set to True.
+                If False, the checkpoint at `initial_load_path` is treated as a standard training
+                checkpoint, including optimizer and training states.
+                The default setting for this option is True. Note that you will have to use
+                `--checkpoint.no_initial_load_model_weights_only` to override the default setting.
+            """
+        )
+        self.parser.add_argument(
+            "--checkpoint.no_initial_load_model_weights_only",
+            dest='checkpoint.initial_load_model_weights_only', action="store_false",
+        )
+        self.parser.add_argument(
+            "--checkpoint.interval",
+            type=int,
+            default=500,
+            help="Checkpointing interval in steps.",
+        )
+        self.parser.add_argument(
+            "--checkpoint.last_save_model_weights_only",
+            action="store_true",
+            help="""
+                When last_save_model_weights_only=True, only model weights will be saved at the end of training,
+                the last save.  With this, checkpoints can be loaded using `torch.load(..., weights_only=True)`
+                after conversion.  When last_save_model_weights_only=False, the full checkpoint will be saved.
+                A full checkpoint includes model, optimizer and train_state, which can be used to resume training.
+                The default value is false.
+            """,
+        )
+        self.parser.add_argument(
+            "--checkpoint.export_dtype",
+            type=str,
+            default="float32",
+            choices=["float16", "bfloat16", "float32"],
+            help="""
+                Converts to the specified precision when training completes and model_weights_only=true.
+                Currently supports float32, float16, and bfloat16.
+                The default value is float32.
+            """,
+        )
+        self.parser.add_argument(
+            "--checkpoint.create_seed_checkpoint",
+            action="store_true",
+            help="""
+                Initializes the full model without applying parallelisms, and then saves it as a seed checkpoint.
+                Note: requires user to call train.py without specifying any parallelisms, e.g. NGPU=1.
+                Could be implemented as a separate script, but this way shares more code.
+            """,
+        )
+        self.parser.add_argument(
+            "--checkpoint.async_mode",
+            type=str,
+            default="disabled",
+            help="""
+                Which async checkpoint mode to use. Currently there are 3 different modes.
+                1. "disabled": synchronized checkpointing will be used.
+                2. "async": torch.distributed.checkpoint.async_save will be used.
+                3. "async_with_pinned_mem": this option utilizes a dedicated pinned memory
+                   space and creates a separate process for faster GPU->CPU transfer
+                   performance and eliminating GIL contention. The cost is increased CPU
+                   memory usage. If insufficient CPU memory is available, performance may
+                   degrade due to memory paging. For most users, "async" should suffice as
+                   the performance overhead is typically small (on the order of tens of
+                   seconds) compared to checkpointing frequency. This mode can be employed
+                   to pursue near-zero checkpointing times (e.g., < 1 second) given
+                   appropriate hardware support such as ample CPU memory and fast PCIe.
+                "disabled" is the default mode.
+            """,
+        )
+        self.parser.add_argument(
+            "--checkpoint.keep_latest_k",
+            type=int,
+            default=0,
+            help="""
+                Keeps only the latest k checkpoints, and purging older ones. If 0, keep all checkpoints.
+                0 is the default value. k cannot be 1 as the last one may be in the process of being
+                saved. As a result, the metadata of the last one may not be ready yet.
+            """,
+        )
+        self.parser.add_argument(
+            "--checkpoint.load_step",
+            type=int,
+            default=-1,
+            help="Load the checkpoint at the specified step. If -1, load the latest checkpoint.",
+        )
+        self.parser.add_argument(
+            "--checkpoint.exclude_from_loading",
+            type=string_list,
+            nargs="*",
+            default=[],
+            help="""
+                Exclude specific keys from being loaded from the checkpoint.
+                Provide a comma-separated list of keys to exclude, e.g. 'optimizer,lr_scheduler,dataloader'.
+                This will load the model only, excluding the specified keys.
+            """,
+        )
+        # activation checkpointing configs
+        self.parser.add_argument(
+            "--activation_checkpoint.mode",
+            type=str,
+            default="selective",
+            help="Type of activation checkpointing to use ['none', 'full', 'selective']",
+        )
+        self.parser.add_argument(
+            "--activation_checkpoint.selective_ac_option",
+            type=str,
+            default="2",  # 2 = checkpoint every other layer
+            help="""
+                Selective activation checkpointing options ['int', 'op'].
+                'int' (e.g., 2) for every nth layer, or 'op' for op level ac.
+            """,
+        )
+        self.parser.add_argument(
+            "--activation_offload.mode",
+            type=str,
+            default="none",
+            help="""
+                if we are using activation offload or not. Options are ['none', 'full'].
+            """,
+        )
+        # float8 configs
+        self.parser.add_argument(
+            "--float8.enable_fsdp_float8_all_gather",
+            action="store_true",
+            help="Whether enable float8 all-gather in FSDP, recommended for tensorwise scaling",
+        )
+        self.parser.add_argument(
+            "--float8.precompute_float8_dynamic_scale_for_fsdp",
+            action="store_true",
+            help="Whether precompute float8 scales dynamically for FSDP, recommended for tensorwise scaling",
+        )
+        self.parser.add_argument(
+            "--float8.force_recompute_fp8_weight_in_bwd",
+            action="store_true",
+            help="""
+            Whether to force the recomputation of FP8 weights during backward pass.
+            When using FSDP with tensorwise scaling, it is recommended to enable
+            `force_recompute_fp8_weight_in_bwd` to prevent saving unsharded FP8 weights
+            for backward computation.
+            """,
+        )
+        self.parser.add_argument(
+            "--float8.recipe_name",
+            type=str,
+            default=None,
+            choices=["tensorwise", "rowwise", "rowwise_with_gw_hp"],
+            help="""
+            If specified, creates float8 config from recipe name, valid choices are
+            `tensorwise`, `rowwise` and `rowwise_with_gw_hp`.
+            """,
+        )
+        # communications library settings
+        self.parser.add_argument(
+            "--comm.init_timeout_seconds",
+            type=int,
+            default=300,
+            help="Timeout for communication operations, during initialization and first train step.",
+        )
+        self.parser.add_argument(
+            "--comm.train_timeout_seconds",
+            type=int,
+            default=100,
+            help=(
+                "Timeout for communication operations after the first train step -- "
+                "usually a tighter bound than during initialization."
+            ),
+        )
+        self.parser.add_argument(
+            "--comm.trace_buf_size",
+            type=int,
+            default=20000,
+            help="Flight recorder ring buffer size, >0 means recording by default, 0 means disabled",
+        )
+        # memory estimation settings
+        self.parser.add_argument(
+            "--memory_estimation.enabled",
+            help="Whether to estimate memory usage for FSDP",
+            action="store_true",
+        )
+        self.parser.add_argument(
+            "--memory_estimation.disable_fake_mode",
+            help="Whether to estimate memory under FakeTensorMode",
+            action="store_true",
+        )
+        self.parser.add_argument(
+            "--fault_tolerance.enable",
+            action="store_true",
+            help="""
+                Enable TorchFT integration. When TorchFT is enabled, HSDP will be used.
+                And --fault_tolerance.data_parallel_replicate_degree should be 1 and
+                --fault_tolerance.group_size will be used to control the maximum
+                replicate group size as the replicate group size is dynamic.
+                Note that this is still an experimental feature.
+            """,
+        )
+        self.parser.add_argument(
+            "--fault_tolerance.replica_id",
+            type=int,
+            default=0,
+            help="The TorchFT replica ID of this run.",
+        )
+        self.parser.add_argument(
+            "--fault_tolerance.group_size",
+            type=int,
+            default=0,
+            help="""
+                The number of TorchFT replicate groups. This number will be used for
+                dataloader to split the dataset across the replicate groups and FSDP
+                dimension
+            """,
+        )
+        self.parser.add_argument(
+            "--fault_tolerance.min_replica_size",
+            type=int,
+            default=1,
+            help="The minimum number of FT replica for each step.",
+        )
+    def to_dict(self):
+        return self.args_dict
+    def parse_args(self, args_list: list = sys.argv[1:]):
+        args, cmd_args = self.parse_args_from_command_line(args_list)
+        config_file = getattr(args, "job.config_file", None)
+        # build up a two level dict
+        args_dict = self._args_to_two_level_dict(args)
+        if config_file is not None:
+            try:
+                with open(config_file, "rb") as f:
+                    for k, v in tomllib.load(f).items():
+                        # to prevent overwrite of non-specified keys
+                        args_dict[k] |= v
+            except (FileNotFoundError, tomllib.TOMLDecodeError) as e:
+                logger.exception(
+                    f"Error while loading the configuration file: {config_file}"
+                )
+                logger.exception(f"Error details: {str(e)}")
+                raise e
+        # Checking string-list arguments are properly split into a list
+        # if split-points came from 'args' (from cmd line) it would have already been parsed into a list by that parser
+        string_list_argnames = self._get_string_list_argument_names()
+        for n in string_list_argnames:
+            check_string_list_argument(args_dict, n)
+        # override args dict with cmd_args
+        cmd_args_dict = self._args_to_two_level_dict(cmd_args)
+        for section, section_args in cmd_args_dict.items():
+            for k, v in section_args.items():
+                args_dict[section][k] = v
+        self.args_dict = args_dict
+        for k, v in args_dict.items():
+            class_type = type(k.title(), (), v)
+            setattr(self, k, class_type())
+        self._validate_config()
+    def _args_to_two_level_dict(self, args: argparse.Namespace) -> defaultdict:
+        args_dict = defaultdict(defaultdict)
+        for k, v in vars(args).items():
+            first_level_key, second_level_key = k.split(".", 1)
+            args_dict[first_level_key][second_level_key] = v
+        return args_dict
+    def _validate_config(self) -> None:
+        # TODO: Add more mandatory validations
+        assert self.model.config
+        assert self.model.tokenizer_path
+    def _get_string_list_argument_names(self) -> list[str]:
+        """Get the parser argument names of type `string_list`."""
+        string_list_args = [
+            v.dest for v in self.parser._actions if v.type is string_list
+        ]
+        return string_list_args
+    def parse_args_from_command_line(
+        self, args_list
+    ) -> Tuple[argparse.Namespace, argparse.Namespace]:
+        """
+        Parse command line arguments and return the parsed args and the command line only args
+        """
+        args = self.parser.parse_args(args_list)
+        string_list_argnames = set(self._get_string_list_argument_names())
+        # aux parser to parse the command line only args, with no defaults from main parser
+        aux_parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
+        for arg, val in vars(args).items():
+            if isinstance(val, bool):
+                aux_parser.add_argument(
+                    "--" + arg, action="store_true" if val else "store_false"
+                )
+            elif arg in string_list_argnames:
+                # without this special case, type inference breaks here,
+                # since the inferred type is just 'list' and it ends up flattening
+                # e.g. from ["layers.0", "layers.1"] into ["l", "a", "y", "e", "r", "s", ".0", ...]
+                aux_parser.add_argument("--" + arg, type=string_list)
+            else:
+                aux_parser.add_argument("--" + arg, type=type(val))
+        cmd_args, _ = aux_parser.parse_known_args(args_list)
+        return args, cmd_args

flame/data.py ADDED Viewed

	@@ -0,0 +1,756 @@

+# -*- coding: utf-8 -*-
+from __future__ import annotations
+import copy
+import pickle
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
+import datasets
+import numpy as np
+import torch
+from datasets import Dataset, IterableDataset, interleave_datasets, load_dataset
+from datasets.iterable_dataset import ShufflingConfig
+from torch.distributed.checkpoint.stateful import Stateful
+from torchdata.stateful_dataloader import StatefulDataLoader
+from transformers import PreTrainedTokenizer
+from torchtitan.tools import utils
+from torchtitan.tools.logging import logger
+class BufferShuffledIterableDataset(IterableDataset):
+    """
+    Iterable dataset that tokenizes text on the fly, packs the token stream into chunks of
+    `seq_len` tokens and yields them in shuffled order through a buffer of `buffer_size` chunks.
+    The iteration position lives on the instance (`states`, `buffer`, `tokens`, `rand_id`,
+    `token_id`, `rng_state`, `_epoch`) and is exposed via `state_dict` / `load_state_dict`.
+    """
+    def __init__(
+        self,
+        dataset: Dataset,
+        tokenizer: PreTrainedTokenizer,
+        seq_len: int = 2048,
+        rank: int = 0,
+        world_size: int = 1,
+        buffer_size: int = 1024,
+    ) -> BufferShuffledIterableDataset:
+        self.dataset = dataset
+        self.tokenizer = tokenizer
+        # the shard of `dataset` that this rank iterates over
+        self.data = dataset.shard(world_size, rank)
+        self.seq_len = seq_len
+        self.rank = rank
+        self.world_size = world_size
+        self.buffer_size = buffer_size
+        # choose the narrowest unsigned integer dtype whose max exceeds the vocabulary size
+        if tokenizer.vocab_size < torch.iinfo(torch.uint16).max:
+            self.dtype = torch.uint16
+        elif tokenizer.vocab_size < torch.iinfo(torch.uint32).max:
+            self.dtype = torch.uint32
+        else:
+            self.dtype = torch.uint64
+        # state of `self.data` recorded for the last sample handed out by `tokenize`
+        self.states = None
+        # chunk buffer; becomes a [buffer_size, seq_len] tensor once enough tokens were collected
+        self.buffer = torch.tensor([], dtype=self.dtype)
+        # token ids that have been tokenized but not yet moved into the buffer
+        self.tokens = []
+        # number of indices already consumed from the current batch drawn in `randint`
+        self.rand_id = 0
+        # read offset into `self.tokens` used by `sample`
+        self.token_id = 0
+        # generator state captured by `randint` right before drawing the current batch
+        self.rng_state = None
+        self._epoch = 0
+    def __iter__(self):
+        """Yield `{'input_ids': LongTensor[seq_len]}` dicts indefinitely (the outer loop never exits)."""
+        g = torch.Generator()
+        g.manual_seed(self._epoch + self.rank)
+        # a previously recorded generator state takes precedence over the fresh seed
+        if self.rng_state is not None:
+            g.set_state(self.rng_state)
+        rand_it = self.randint(0, self.buffer_size, g=g)
+        if self.states is not None:
+            self.data.load_state_dict(self.states)
+        # max number of tokens allowed in the chunk buffer
+        n_tokens = self.buffer_size * self.seq_len
+        while True:
+            for sample in self.tokenize(self.data):
+                # keep appending the samples to the token buffer
+                self.tokens += sample
+                # if the token buffer is full, start sampling
+                # NOTE: we first convert the token ids to a tensor of shape [n_chunks, seq_len] for efficiency
+                if len(self.buffer) == 0 and len(self.tokens) >= n_tokens:
+                    self.buffer = torch.tensor(self.tokens[:n_tokens], dtype=self.dtype).view(self.buffer_size, -1)
+                    self.tokens = self.tokens[n_tokens:]
+                if len(self.buffer) == self.buffer_size:
+                    yield from self.sample(rand_it)
+            n_chunks = len(self.tokens) // self.seq_len
+            # handle the left tokens in the buffer
+            # NOTE(review): this branch rebinds `n_tokens` and replaces `self.buffer` with a
+            # torch.long tensor of `n_chunks` rows; both persist into the next pass of the
+            # enclosing `while True` loop — confirm this is intended.
+            if n_chunks > 0:
+                n_tokens = n_chunks * self.seq_len
+                indices = torch.randperm(n_chunks, generator=g).tolist()
+                self.buffer = torch.tensor(self.tokens[:n_tokens], dtype=torch.long).view(n_chunks, -1)
+                self.tokens = self.tokens[n_tokens:]
+                for i in indices:
+                    yield {'input_ids': self.buffer[i]}
+    def tokenize(self, data, batch_size: int = 64):
+        """
+        Tokenize the `'text'` field of `data` in batches of `batch_size` and yield one list of
+        token ids per sample. `self.states` is set to the state captured for a sample right
+        before that sample's token ids are yielded.
+        """
+        texts, states = [], []
+        for sample in data:
+            texts.append(sample['text'])
+            states.append(self.data.state_dict())
+            if len(texts) == batch_size:
+                for s, tokenized in zip(states, self.tokenizer(texts, return_attention_mask=False)['input_ids']):
+                    self.states = s
+                    yield tokenized
+                texts, states = [], []
+        # flush the last, partially filled batch
+        if len(texts) > 0:
+            for s, tokenized in zip(states, self.tokenizer(texts, return_attention_mask=False)['input_ids']):
+                self.states = s
+                yield tokenized
+    def sample(self, indices):
+        """
+        For every complete `seq_len` chunk available in `self.tokens`: take the next index from
+        `indices`, yield the buffer row at that index, then overwrite the row with the chunk.
+        Consumed tokens are dropped from `self.tokens` once the loop finishes.
+        """
+        n_tokens = (len(self.tokens) // self.seq_len) * self.seq_len
+        while self.token_id < n_tokens:
+            i = next(indices)
+            start, end = self.token_id, self.token_id + self.seq_len
+            # advance before yielding so the offset already points at the next chunk while suspended
+            self.token_id += self.seq_len
+            yield {'input_ids': self.buffer[i].to(torch.long)}
+            self.buffer[i] = torch.tensor(self.tokens[start:end], dtype=self.dtype)
+        self.token_id = 0
+        self.tokens = self.tokens[n_tokens:]
+    # NOTE(review): the default `g` is created once when the function is defined; the only call
+    # visible here (`__iter__`) always passes its own generator.
+    def randint(self, low: int, high: int, buffer_size: int = 1024, g: torch.Generator = torch.Generator()) -> Iterable[int]:
+        """
+        Endless stream of random integers in `[low, high)`, drawn in batches of `buffer_size`.
+        The generator state before each batch is kept in `self.rng_state` and the position inside
+        the batch in `self.rand_id`.
+        """
+        indices = torch.empty(buffer_size, dtype=torch.long)
+        while True:
+            # record the generator states before sampling
+            self.rng_state = g.get_state()
+            indices = torch.randint(low, high, (buffer_size,), out=indices, generator=g)
+            # start at `rand_id`, so indices already consumed from this batch are skipped
+            for i in indices[self.rand_id:].tolist():
+                self.rand_id += 1
+                yield i
+            self.rand_id = 0
+    def set_epoch(self, epoch):
+        """Store `epoch` (used to seed the generator in `__iter__`) and forward it to the dataset if supported."""
+        self._epoch = epoch
+        if hasattr(self.dataset, 'set_epoch'):
+            self.dataset.set_epoch(epoch)
+    def state_dict(self):
+        """Return a snapshot of the iteration state; `buffer` and `tokens` are copied."""
+        return {
+            'states': self.states,
+            'buffer': self.buffer.clone(),
+            'tokens': deepcopy(self.tokens),
+            'rand_id': self.rand_id,
+            'token_id': self.token_id,
+            'rng_state': self.rng_state,
+            'epoch': self._epoch,
+        }
+    def load_state_dict(self, state_dict):
+        """Restore the fields produced by `state_dict`; they take effect on the next `__iter__` call."""
+        self.states = state_dict['states']
+        self.buffer = state_dict['buffer'].clone()
+        self.tokens = deepcopy(state_dict['tokens'])
+        self.rand_id = state_dict['rand_id']
+        self.token_id = state_dict['token_id']
+        self.rng_state = state_dict['rng_state'].clone() if state_dict['rng_state'] is not None else None
+        self._epoch = state_dict['epoch']
+class OnlineTokenizedIterableDataset(IterableDataset):
+    def __init__(
+        self, dataset: Dataset, tokenizer: PreTrainedTokenizer, seq_len: int = 2048, rank: int = 0, world_size: int = 1
+    ) -> OnlineTokenizedIterableDataset:
+        self.dataset = dataset
+        self.tokenizer = tokenizer
+        self.data = dataset.shard(world_size, rank)
+        self.seq_len = seq_len
+        self.rank = rank
+        self.world_size = world_size
+        self.states = None
+        self.tokens = []
+    def __iter__(self):
+        if self.states is not None:
+            self.data.load_state_dict(self.states)
+        while True:
+            for sample in self.tokenize(self.data):
+                # keep appending the samples to the token buffer
+                self.tokens += sample
+                while len(self.tokens) >= self.seq_len:
+                    input_ids = torch.tensor(self.tokens[:self.seq_len], dtype=torch.long)
+                    self.tokens = self.tokens[self.seq_len:]
+                    yield {'input_ids': input_ids}
+    def tokenize(self, data, buffer_size: int = 64):
+        buffer, states = [], []
+        for sample in data:
+            if sample.get('text', None) is not None:
+                buffer.append(sample['text'])
+            elif sample.get('content', None) is not None:
+                buffer.append(sample['content'])
+            else:
+                raise ValueError(f"No 'text' or 'content' field found in sample:\n{sample}")
+            states.append(self.data.state_dict())
+            if len(buffer) == buffer_size:
+                for s, tokenized in zip(states, self.tokenizer(buffer, return_attention_mask=False)['input_ids']):
+                    self.states = s
+                    yield tokenized
+                buffer, states = [], []
+        if len(buffer) > 0:
+            for s, tokenized in zip(states, self.tokenizer(buffer, return_attention_mask=False)['input_ids']):
+                self.states = s
+                yield tokenized
+    def state_dict(self):
+        return {'states': self.states, 'tokens': deepcopy(self.tokens)}
+    def load_state_dict(self, state_dict):
+        self.states = state_dict['states']
+        self.tokens = deepcopy(state_dict['tokens'])
+class BufferShuffledExamplesIterable(datasets.iterable_dataset.BufferShuffledExamplesIterable):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    def _init_state_dict(self) -> dict:
+        self._state_dict = self.ex_iterable._init_state_dict()
+        self._state_dict['mem_buffer'] = ([],)
+        self._state_dict['bit_generator_state'] = self.generator.bit_generator.state
+        self._state_dict['bit_generator_index_offset'] = 0
+        self._state_dict['bit_generator_index_offset_shuffle'] = 0
+        return self._state_dict
+    def __iter__(self):
+        buffer_size = self.buffer_size
+        rng = deepcopy(self.generator)
+        # this is the shuffle buffer that we keep in memory
+        mem_buffer = self._state_dict['mem_buffer'][0]
+        # this is an infinite iterator that randomly samples the index of the source to pick examples from
+        index_offset = self._state_dict['bit_generator_index_offset'] if self._state_dict else 0
+        if self._state_dict:
+            rng.bit_generator.state = self._state_dict['bit_generator_state']
+        indices_iterator = self._iter_random_indices(rng, buffer_size, random_batch_size=buffer_size)
+        # skip already consumed ones
+        for _ in range(index_offset):
+            i = next(indices_iterator)
+        for x in self.ex_iterable:
+            if len(mem_buffer) < buffer_size:  # if the buffer is not full, keep filling the buffer
+                mem_buffer.append(x)
+            else:  # otherwise, pick an example from it
+                i = next(indices_iterator)
+                index_offset = (index_offset + 1) % buffer_size
+                if self._state_dict:
+                    self._state_dict['bit_generator_index_offset'] = index_offset
+                    if index_offset == 0:
+                        self._state_dict['bit_generator_state'] = rng.bit_generator.state
+                selected = mem_buffer[i]
+                mem_buffer[i] = x  # replace the picked example by a new one
+                yield selected
+        index_offset = self._state_dict['bit_generator_index_offset_shuffle'] if self._state_dict else 0
+        if self._state_dict:
+            rng.bit_generator.state = self._state_dict['bit_generator_state']
+        # when we run out of examples, we shuffle the remaining examples in the buffer and yield them
+        for i in rng.permutation(len(mem_buffer))[index_offset:].tolist():
+            index_offset = index_offset + 1
+            if self._state_dict:
+                self._state_dict['bit_generator_index_offset_shuffle'] = index_offset
+            yield mem_buffer[i]
+    def shuffle_data_sources(self, generator: np.random.Generator) -> BufferShuffledExamplesIterable:
+        """Shuffle the wrapped examples iterable as well as the shuffling buffer."""
+        return BufferShuffledExamplesIterable(
+            self.ex_iterable.shuffle_data_sources(generator), buffer_size=self.buffer_size, generator=generator
+        )
+    def shard_data_sources(self, num_shards: int, index: int, contiguous=True) -> BufferShuffledExamplesIterable:
+        """Keep only the requested shard."""
+        return BufferShuffledExamplesIterable(
+            self.ex_iterable.shard_data_sources(num_shards, index, contiguous=contiguous),
+            buffer_size=self.buffer_size,
+            generator=self.generator,
+        )
+    def load_state_dict(self, state_dict: dict) -> dict:
+        def _inner_load_state_dict(state, new_state):
+            if new_state is not None and isinstance(state, dict):
+                for key in new_state:
+                    state[key] = _inner_load_state_dict(state[key], new_state[key])
+                return state
+            elif new_state is not None and isinstance(state, list):
+                for i in range(len(state)):
+                    state[i] = _inner_load_state_dict(state[i], new_state[i])
+                return state
+            return new_state
+        return _inner_load_state_dict(self._state_dict, state_dict)
+def shuffle(
+    dataset: IterableDataset,
+    seed: int = 42,
+    generator: np.random.Generator = None,
+    buffer_size: int = 1024,
+):
+    generator = np.random.default_rng(seed) if generator is None else deepcopy(generator)
+    return IterableDataset(
+        ex_iterable=BufferShuffledExamplesIterable(dataset._ex_iterable, buffer_size=buffer_size, generator=generator),
+        info=dataset._info.copy(),
+        split=dataset._split,
+        formatting=dataset._formatting,
+        shuffling=ShufflingConfig(generator=generator, _original_seed=seed),
+        distributed=copy.deepcopy(dataset._distributed),
+        token_per_repo_id=dataset._token_per_repo_id,
+    )
+@dataclass
+class DataCollatorForLanguageModeling:
+    """
+    Data collator used for language modeling. Inputs are dynamically padded if `varlen=False`.
+    If `varlen=True`, sequences are expected to be concatenated, and labels match inputs.
+    Labels are an unshifted copy of `input_ids` in both modes.
+    Args:
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
+            The tokenizer used for encoding the data.
+        context_len (`int`, optional):
+            When `varlen=True`, sequences longer than this length within a document
+            (as determined by `cu_seqlens`) will be further chunked.
+            Only applied when `cu_seqlens` is derived from BOS/EOS positions, not when the
+            example already carries a `cu_seqlens` entry.
+        varlen (`bool`):
+            Whether to handle variable length concatenated sequences (`True`) or padded batches (`False`).
+    Returns:
+        A dictionary with the following keys:
+        - `input_ids`: Tensor of input IDs. Shape `[batch_size, seq_len]` if `varlen=False`, `[1, total_len]` if `varlen=True`.
+        - `labels`: Tensor of labels. Shape matches `input_ids`. Padding positions are masked with -100 if `varlen=False`.
+        - `attention_mask`: Tensor indicating non-padding tokens (only if `varlen=False`). Shape matches `input_ids`.
+        - `cu_seqlens`: int32 tensor of cumulative sequence lengths (only if `varlen=True`).
+          Shape `[1, num_sequences + 1]` when taken from the example's own `cu_seqlens`;
+          1-D of shape `[num_sequences + 1]` when derived from BOS/EOS positions.
+    Raises:
+        ValueError: if padding is needed but the tokenizer has no pad token, if `varlen=True` is
+            used with more than one example, if neither BOS nor EOS token id is available to find
+            boundaries, or if the derived `cu_seqlens` fails validation.
+    NOTE: When `varlen=True`, the `batch_size` must be 1.
+    """
+    tokenizer: PreTrainedTokenizer
+    context_len: Optional[int] = None
+    varlen: bool = False
+    def __call__(self, examples: List[Union[List[int], Dict[str, Any]]]) -> Dict[str, Any]:
+        # bare token-id sequences are wrapped so that every example is a dict with `input_ids`
+        if not isinstance(examples[0], Dict):
+            examples = [{'input_ids': example} for example in examples]
+        def tensorize(example: Dict[str, Any]) -> Dict[str, Any]:
+            # Keep only `input_ids` / `cu_seqlens` and convert lists and numpy arrays to tensors.
+            # Lists become torch.long; numpy arrays keep their dtype; other values pass through.
+            tensorized = {}
+            for key in ['input_ids', 'cu_seqlens']:
+                if key not in example:
+                    continue
+                if isinstance(example[key], List):
+                    tensorized[key] = torch.tensor(example[key], dtype=torch.long)
+                elif isinstance(example[key], np.ndarray):
+                    tensorized[key] = torch.from_numpy(example[key])
+                else:
+                    tensorized[key] = example[key]
+            return tensorized
+        examples = list(map(tensorize, examples))
+        if not self.varlen:
+            # --- Handling for varlen=False (Batch Padding) ---
+            length_of_first = examples[0]['input_ids'].size(0)
+            needs_padding = not all(example['input_ids'].size(0) == length_of_first for example in examples)
+            if needs_padding:
+                # Check for pad token if padding is actually required
+                if self.tokenizer.pad_token_id is None:
+                    raise ValueError(
+                        f'You are attempting to pad samples but the tokenizer you are using '
+                        f'({self.tokenizer.__class__.__name__}) does not have a pad token.'
+                    )
+                # Pad using the tokenizer, ensuring attention_mask is returned
+                batch = self.tokenizer.pad(examples, return_tensors='pt', return_attention_mask=True)
+            else:
+                # No padding needed, stack directly and create a full attention mask
+                input_ids = torch.stack([example['input_ids'] for example in examples], dim=0)
+                batch = {
+                    'input_ids': input_ids,
+                    # Create attention mask of all ones
+                    'attention_mask': torch.ones_like(input_ids),
+                }
+            # Create labels by cloning input_ids
+            labels = batch['input_ids'].clone()
+            # Mask labels only where attention_mask is 0 (padding positions)
+            if 'attention_mask' in batch:
+                labels[batch['attention_mask'] == 0] = -100
+            batch['labels'] = labels
+        else:
+            # --- Handling for varlen=True (Concatenated Sequences) ---
+            if len(examples) > 1:
+                raise ValueError('The batch size must be 1 for inputs with variable lengths (varlen=True).')
+            batch = {'input_ids': torch.cat([example['input_ids'] for example in examples], dim=0).unsqueeze(0)}
+            # Sequence boundaries: use the example's own cu_seqlens if present, otherwise derive them
+            if 'cu_seqlens' in examples[0]:
+                batch['cu_seqlens'] = (
+                    torch.cat([example['cu_seqlens'] for example in examples], dim=0).unsqueeze(0).to(dtype=torch.int32)
+                )  # Ensure int32
+            else:
+                # determine boundaries by bos/eos positions
+                # Check for bos_token_id first
+                if self.tokenizer.bos_token_id is not None:
+                    cu_seqlens = []
+                    # Handle case where the sequence doesn't start with BOS
+                    if batch['input_ids'][0, 0] != self.tokenizer.bos_token_id:
+                        cu_seqlens.append(torch.tensor([0], device=batch['input_ids'].device))  # Match device
+                    # Find all BOS token positions
+                    bos_positions = torch.where(batch['input_ids'].eq(self.tokenizer.bos_token_id))[1]
+                    # Ensure bos_positions is on the correct device if empty
+                    if bos_positions.numel() == 0 and len(cu_seqlens) > 0:
+                        cu_seqlens.append(bos_positions.to(cu_seqlens[0].device))
+                    elif bos_positions.numel() > 0:
+                        cu_seqlens.append(bos_positions)
+                    # Add the end of the entire batch
+                    cu_seqlens.append(
+                        torch.tensor([batch['input_ids'].size(1)], device=batch['input_ids'].device)
+                    )  # Match device and use size(1)
+                    # Filter out empty tensors before cat
+                    cu_seqlens = [t for t in cu_seqlens if t.numel() > 0]
+                    if not cu_seqlens:  # Handle case where input is empty or has no BOS
+                        batch['cu_seqlens'] = torch.tensor(
+                            [0, batch['input_ids'].size(1)], dtype=torch.int32, device=batch['input_ids'].device
+                        )
+                    else:
+                        batch['cu_seqlens'] = torch.cat(cu_seqlens, dim=0).to(dtype=torch.int32)
+                # Else, check for eos_token_id
+                elif self.tokenizer.eos_token_id is not None:
+                    cu_seqlens = [torch.tensor([0], device=batch['input_ids'].device)]  # Match device
+                    # Find positions *after* EOS tokens
+                    eos_positions = torch.where(batch['input_ids'].eq(self.tokenizer.eos_token_id))[1] + 1
+                    # Ensure eos_positions is on the correct device if empty
+                    if eos_positions.numel() > 0:
+                        cu_seqlens.append(eos_positions)
+                    # Handle case where the sequence doesn't end with EOS
+                    if batch['input_ids'][0, -1] != self.tokenizer.eos_token_id:
+                        # Only add the final length if the last found EOS wasn't already the end
+                        if eos_positions.numel() == 0 or eos_positions[-1] != batch['input_ids'].size(1):
+                            cu_seqlens.append(
+                                torch.tensor([batch['input_ids'].size(1)], device=batch['input_ids'].device)
+                            )  # Match device and use size(1)
+                    # Filter out empty tensors before cat
+                    cu_seqlens = [t for t in cu_seqlens if t.numel() > 0]
+                    if not cu_seqlens:  # Handle case where input is empty or has no EOS
+                        batch['cu_seqlens'] = torch.tensor(
+                            [0, batch['input_ids'].size(1)], dtype=torch.int32, device=batch['input_ids'].device
+                        )
+                    else:
+                        batch['cu_seqlens'] = torch.cat(cu_seqlens, dim=0).to(dtype=torch.int32)
+                # Else, neither BOS nor EOS is usable
+                else:
+                    raise ValueError(
+                        'For varlen=True without precomputed cu_seqlens, the tokenizer must have either a bos_token_id '
+                        'or an eos_token_id defined to act as sequence separators.'
+                    )
+                # Sanity checks on the derived boundaries: at least two entries, non-decreasing,
+                # starting at 0 and ending at the total length
+                if batch['cu_seqlens'].numel() < 2:
+                    raise ValueError(f'Calculated cu_seqlens must have at least start and end: {batch["cu_seqlens"]}')
+                if not torch.all(batch['cu_seqlens'][1:] >= batch['cu_seqlens'][:-1]):
+                    raise ValueError(f'Calculated cu_seqlens are not monotonically increasing: {batch["cu_seqlens"]}')
+                if batch['cu_seqlens'][0] != 0:
+                    raise ValueError(f'Calculated cu_seqlens do not start at 0: {batch["cu_seqlens"]}')
+                if batch['cu_seqlens'][-1] != batch['input_ids'].size(1):
+                    # Allow empty sequence case where cu_seqlens=[0, 0] and input_ids.size(1)=0
+                    if not (batch['cu_seqlens'].tolist() == [0, 0] and batch['input_ids'].size(1) == 0):
+                        raise ValueError(
+                            f'Calculated cu_seqlens do not end at total length {batch["input_ids"].size(1)}: '
+                            f'{batch["cu_seqlens"]}'
+                        )
+                # Optionally cut every sequence into pieces of at most `context_len` tokens
+                if self.context_len is not None:
+                    # This logic splits sequences based on context_len *after* initial boundaries are found
+                    bos = batch['cu_seqlens'][:-1].tolist()
+                    eos = batch['cu_seqlens'][1:].tolist()
+                    # Handle empty sequences between boundaries
+                    split_boundaries = []
+                    for i, j in zip(bos, eos):
+                        if i < j:  # Only process non-empty sequences
+                            # start offsets i, i + context_len, ... (< j) of the pieces of this sequence
+                            split_boundaries.append(torch.arange(i, j, self.context_len, device=batch['input_ids'].device))
+                    # Add the final end point if it wasn't included by arange
+                    final_end_point = torch.tensor([batch['input_ids'].size(1)], device=batch['input_ids'].device)
+                    # Concatenate all boundaries
+                    if not split_boundaries:  # Handle case of completely empty input
+                        batch['cu_seqlens'] = torch.tensor([0, 0], dtype=torch.int32, device=batch['input_ids'].device)
+                    else:
+                        batch['cu_seqlens'] = torch.cat(split_boundaries + [final_end_point]).to(dtype=torch.int32)
+                        # Ensure uniqueness and sort, as arange might duplicate the endpoint
+                        batch['cu_seqlens'] = torch.unique(batch['cu_seqlens'])
+            # Create labels directly from input_ids, NO padding mask needed for varlen
+            labels = batch['input_ids'].clone()
+            batch['labels'] = labels
+        return batch
+class ParallelAwareDataLoader(StatefulDataLoader, Stateful):
+    """
+    A wrapper around the StatefulDataLoader that ensures that the state is stored only once per DP rank.
+    """
+    def __init__(
+        self,
+        rank: int,
+        dataset: IterableDataset,
+        batch_size: int,
+        collate_fn: Callable,
+        num_workers: int = 0,
+        pin_memory: bool = False,
+        prefetch_factor: int = 2,
+        persistent_workers: bool = False,
+        snapshot_every_n_steps: Optional[int] = 1,
+    ):
+        super().__init__(
+            dataset=dataset,
+            batch_size=batch_size,
+            collate_fn=collate_fn,
+            num_workers=num_workers,
+            pin_memory=pin_memory,
+            prefetch_factor=prefetch_factor,
+            persistent_workers=persistent_workers,
+            snapshot_every_n_steps=snapshot_every_n_steps,
+        )
+        self.rank = rank
+    def state_dict(self) -> Dict[str, Any]:
+        # Store state only for dp rank to avoid replicating the same state across other dimensions
+        return {f'rank_{self.rank}': pickle.dumps(super().state_dict())}
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        # State being empty is valid
+        if not state_dict:
+            return
+        if f'rank_{self.rank}' not in state_dict:
+            logger.warning(f'DataLoader state is empty for dp rank {self.rank}, expected key rank_{self.rank}')
+            return
+        super().load_state_dict(pickle.loads(state_dict[f'rank_{self.rank}']))
+def build_dataset(
+    dataset: str,
+    dataset_name: str = None,
+    dataset_split: str = 'train',
+    data_dir: str = None,
+    data_files: str = None,
+    data_probs: List[float] = None,
+    streaming: bool = False,
+    dp_degree: Optional[int] = None,
+    num_workers: int = 32,
+    seed: Optional[int] = None,
+) -> IterableDataset:
+    color = utils.Color
+    min_num_shards = dp_degree * num_workers if dp_degree else None
+    if len(dataset.split(',')) == 1:
+        dataset = load_dataset(
+            path=dataset,
+            name=dataset_name,
+            split=dataset_split,
+            data_dir=data_dir,
+            data_files=data_files,
+            trust_remote_code=True,
+            streaming=streaming,
+            num_proc=num_workers if not streaming else None,
+        )
+        logger.info(f"Shuffling the dataset with seed {seed}")
+        if not streaming:
+            # the states of map-style dataset is recoverable after shuffling
+            if seed is not None:
+                dataset = dataset.shuffle(seed=seed)
+            if min_num_shards is not None:
+                dataset = dataset.to_iterable_dataset(num_shards=min_num_shards)
+        else:
+            if min_num_shards is not None and dataset.num_shards < min_num_shards:
+                logger.warning(
+                    f"{color.red}"
+                    f"Dataset {dataset} has insufficient shards ({dataset.num_shards}). "
+                    f"Need {min_num_shards} shards minimum for {dp_degree} data parallel workers × "
+                    f"{num_workers} dataloader workers. "
+                    f"Disabling the streaming mode and resharding dataset to {min_num_shards} shards."
+                    f"{color.reset}"
+                )
+                dataset = load_dataset(
+                    path=dataset,
+                    name=dataset_name,
+                    split=dataset_split,
+                    data_dir=data_dir,
+                    data_files=data_files,
+                    trust_remote_code=True,
+                    streaming=False,
+                    num_proc=num_workers,
+                )
+                if seed is not None:
+                    dataset = dataset.shuffle(seed=seed)
+                dataset = dataset.to_iterable_dataset(num_shards=min_num_shards)
+            else:
+                if seed is not None:
+                    dataset = shuffle(dataset, seed=seed)
+    else:
+        datasets = dataset.split(",")
+        if dataset_name is not None:
+            dataset_names = [
+                name or None for name in dataset_name.split(",")
+            ]
+            assert len(dataset_names) == len(datasets), (
+                "The number of dataset names must match the number of datasets"
+            )
+        else:
+            dataset_names = [None] * len(datasets)
+        if dataset_split is not None:
+            dataset_splits = [split or "train"for split in dataset_split.split(",")]
+            assert len(dataset_splits) == len(datasets), (
+                "The number of dataset splits must match the number of datasets"
+            )
+        else:
+            dataset_splits = ["train"] * len(datasets)
+        if data_dir is not None:
+            data_dirs = [
+                data_dir or None for data_dir in data_dir.split(",")
+            ]
+            assert len(data_dirs) == len(datasets), (
+                "The number of data dirs must match the number of datasets"
+            )
+        else:
+            data_dirs = [None] * len(datasets)
+        if data_files is not None:
+            data_files = data_files.split(",")
+            assert len(data_files) == len(datasets), (
+                "The number of data files must match the number of datasets"
+            )
+        else:
+            data_files = [None] * len(datasets)
+        if data_probs is not None:
+            data_probs = [float(p) for p in data_probs.split(",")]
+            assert len(data_probs) == len(datasets), (
+                "The number of data probabilities must match the number of datasets"
+            )
+        else:
+            raise ValueError(
+                "Data sampling probabilities are required if using multiple datasets"
+            )
+        subsets = []
+        for i, prob in enumerate(data_probs):
+            subset = load_dataset(
+                path=datasets[i],
+                name=dataset_names[i],
+                split=dataset_splits[i],
+                data_dir=data_dirs[i],
+                data_files=data_files[i],
+                trust_remote_code=True,
+                streaming=streaming,
+                num_proc=(
+                    num_workers
+                    if not streaming
+                    else None
+                ),
+            )
+            logger.info(
+                f"Subset {color.cyan}{datasets[i]}"
+                + (f":{dataset_names[i]} " if dataset_names[i] else " ")
+                + f"(p = {prob:.3f}){color.reset}:\n"
+                + f"{subset}"
+            )
+            logger.info(f"Shuffling the dataset with seed {seed}")
+            if not streaming:
+                # the states of map-style dataset is recoverable after shuffling
+                if seed is not None:
+                    subset = subset.shuffle(seed=seed)
+                if min_num_shards is not None:
+                    subset = subset.to_iterable_dataset(num_shards=min_num_shards)
+            else:
+                if min_num_shards is not None and subset.num_shards < min_num_shards:
+                    logger.warning(
+                        f"{color.red}"
+                        f"Dataset {datasets[i]} has insufficient shards ({subset.num_shards}). "
+                        f"Need {min_num_shards} shards minimum for desired data parallel workers × "
+                        f"{num_workers} dataloader workers. "
+                        f"Resharding dataset to {min_num_shards} shards and disabling streaming mode."
+                        f"{color.reset}"
+                    )
+                    # again, it's ok to directly shuffle the map-style dataset
+                    # we expect an error raised if the map-style dataset still has not enough data shards
+                    subset = load_dataset(
+                        path=datasets[i],
+                        name=dataset_names[i],
+                        split=dataset_splits[i],
+                        data_dir=data_dirs[i],
+                        data_files=data_files[i],
+                        trust_remote_code=True,
+                        streaming=False,
+                        num_proc=num_workers,
+                    )
+                    if seed is not None:
+                        subset = subset.shuffle(seed=seed)
+                    subset = subset.to_iterable_dataset(num_shards=min_num_shards)
+                else:
+                    # we set relatively small buffer size here as interleaving could provide some randomness
+                    if seed is not None:
+                        subset = shuffle(subset, seed=seed, buffer_size=max(128, 1024 // len(datasets)))
+            if "text" in subset.column_names:
+                subset = subset.select_columns("text")
+            elif "content" in subset.column_names:
+                subset = subset.select_columns("content")
+            else:
+                raise ValueError(
+                    f"Subset {datasets[i]} has no 'text' or 'content' column"
+                )
+            subsets.append(subset)
+        logger.info(
+            f"Interleaving {len(subsets)} datasets with probabilities {data_probs}"
+        )
+        dataset = interleave_datasets(
+            datasets=subsets,
+            probabilities=data_probs,
+            stopping_strategy="all_exhausted",
+            seed=seed,
+        )
+    logger.info(f"{dataset}")
+    return dataset
+def build_dataloader(
+    dataset: IterableDataset,
+    tokenizer: PreTrainedTokenizer,
+    rank: int,
+    world_size: int,
+    batch_size: int,
+    seq_len: int,
+    context_len: Optional[int] = None,
+    varlen: bool = False,
+    num_workers: int = 0,
+    pin_memory: bool = False,
+    persistent_workers: bool = False,
+    snapshot_every_n_steps: Optional[int] = 1,
+):
+    dataset = OnlineTokenizedIterableDataset(
+        dataset=dataset, tokenizer=tokenizer, seq_len=seq_len, rank=rank, world_size=world_size
+    )
+    return ParallelAwareDataLoader(
+        rank=rank,
+        dataset=dataset,
+        batch_size=batch_size,
+        collate_fn=DataCollatorForLanguageModeling(tokenizer=tokenizer, context_len=context_len, varlen=varlen),
+        num_workers=num_workers,
+        pin_memory=pin_memory,
+        persistent_workers=persistent_workers,
+        snapshot_every_n_steps=snapshot_every_n_steps,
+    )

flame/models/fla.toml ADDED Viewed

	@@ -0,0 +1,67 @@

+# Example job configuration. The keys below correspond to the
+# `job_config.<section>.<key>` attributes read by flame/train.py.
+[model]
+# Passed to AutoConfig.from_pretrained() to build the model config.
+config = "fla-hub/transformer-1.3B-100B"
+# Passed to AutoTokenizer.from_pretrained().
+tokenizer_path = "fla-hub/transformer-1.3B-100B"
+[job]
+dump_folder = "exp"
+# When true, the full job config is logged as JSON at startup.
+print_args = true
+[training]
+# Per-device batch size; the global batch size is
+# batch_size * data-parallel degree * gradient_accumulation_steps.
+batch_size = 32
+# Number of tokens per training sequence.
+seq_len = 2048
+# Forwarded to the data collator and to the parameter/FLOPs estimate.
+context_len = 2048
+# Number of forward/backward passes accumulated per optimizer step.
+gradient_accumulation_steps = 1
+# Total number of optimization steps.
+steps = 20480
+# Threshold passed to gradient-norm clipping.
+max_norm = 1.0
+# Skip the optimizer step when the gradient norm is NaN or Inf.
+skip_nan_inf = true
+data_parallel_replicate_degree = 1
+# NOTE(review): -1 presumably means "infer from the world size" - confirm.
+data_parallel_shard_degree = -1
+tensor_parallel_degree = 1
+compile = false
+# Dataset path and config name forwarded to build_dataset().
+dataset = "HuggingFaceFW/fineweb-edu"
+dataset_name = "default"
+# Used for both dataset loading and dataloader workers.
+num_workers = 32
+pin_memory = false
+persistent_workers = false
+prefetch_factor = 2
+# Seed for determinism setup and dataset shuffling.
+seed = 42
+# Forwarded to the data collator.
+varlen = false
+[optimizer]
+name = "AdamW"
+eps = 1e-15
+lr = 3e-4
+[lr_scheduler]
+warmup_steps = 1024
+decay_type = "cosine"
+# NOTE(review): with lr = 3e-4 this looks like a ratio of the peak LR rather
+# than an absolute value - confirm against the scheduler implementation.
+lr_min = 0.1
+[checkpoint]
+enable_checkpoint = true
+folder = "checkpoint"
+interval_type = "steps"
+# Also passed to the dataloader as `snapshot_every_n_steps`.
+interval = 2048
+model_weights_only = false
+export_dtype = "float32"
+async_mode = "disabled"    # ["disabled", "async", "async_with_pinned_mem"]
+[profiling]
+enable_profiling = true
+save_traces_folder = "profile_trace"
+profile_freq = 512
+[metrics]
+log_freq = 32
+enable_wandb = true
+[experimental]
+context_parallel_degree = 1
+# train.py raises NotImplementedError when pipeline parallelism is enabled.
+pipeline_parallel_degree = 1
+[float8]
+enable_fsdp_float8_all_gather = false
+precompute_float8_dynamic_scale_for_fsdp = false
+[activation_checkpoint]
+mode = "none"

flame/train.py ADDED Viewed

	@@ -0,0 +1,624 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import json
+import os
+import time
+from datetime import timedelta
+import fla  # noqa
+import fla.models.gsa
+import fla.models.routmem
+import torch
+from fla.modules.fused_linear_cross_entropy import FusedLinearCrossEntropyLoss
+from fla.ops.utils import prepare_position_ids
+from torch.distributed.elastic.multiprocessing.errors import record
+from torchtitan.components.checkpoint import CheckpointManager
+from torchtitan.components.ft import FTParallelDims, init_ft_manager
+from torchtitan.components.loss import build_cross_entropy_loss
+from torchtitan.components.lr_scheduler import build_lr_schedulers
+from torchtitan.components.metrics import build_device_memory_monitor, build_metrics_processor, ensure_pp_loss_visible
+from torchtitan.components.optimizer import build_optimizers
+from torchtitan.distributed import ParallelDims
+from torchtitan.distributed import utils as dist_utils
+from torchtitan.protocols.model_converter import build_model_converters
+from torchtitan.protocols.train_spec import TrainSpec, get_train_spec, register_train_spec
+from torchtitan.tools import utils
+from torchtitan.tools.logging import init_logger, logger
+from torchtitan.tools.profiling import maybe_enable_memory_snapshot, maybe_enable_profiling
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+import custom_models
+from flame.components.checkpoint import TrainState
+from flame.config_manager import JobConfig
+from flame.data import build_dataloader, build_dataset
+from flame.models.parallelize_fla import parallelize_fla
+from flame.models.pipeline_fla import pipeline_fla
+from flame.tools.utils import get_nparams_and_flops
+def build_tokenizer(job_config: JobConfig) -> AutoTokenizer:
+    return AutoTokenizer.from_pretrained(job_config.model.tokenizer_path)
+# Register the "fla" train spec at import time. It bundles the Hugging Face
+# auto classes (model + config) with the parallelize/pipelining functions and
+# the optimizer, LR-scheduler, dataloader, tokenizer and loss builders.
+register_train_spec(
+    TrainSpec(
+        name="fla",
+        cls=AutoModelForCausalLM,
+        config=AutoConfig,
+        parallelize_fn=parallelize_fla,
+        pipelining_fn=pipeline_fla,
+        build_optimizers_fn=build_optimizers,
+        build_lr_schedulers_fn=build_lr_schedulers,
+        build_dataloader_fn=build_dataloader,
+        build_tokenizer_fn=build_tokenizer,
+        build_loss_fn=build_cross_entropy_loss,
+    )
+)
+# Enable debug tracing on failure: https://pytorch.org/docs/stable/elastic/errors.html
+@record
+def main(job_config: JobConfig):
+    logger.info(f"Starting job: {job_config.job.description}")
+    if job_config.experimental.custom_model_path:
+        utils.import_module_from_path(job_config.experimental.custom_model_path)
+    # used for colorful printing
+    color = utils.NoColor if job_config.metrics.disable_color_printing else utils.Color
+    if job_config.job.print_args:
+        logger.info(
+            f"{color.green}{json.dumps(job_config.to_dict(), indent=2, sort_keys=True)}{color.reset}"
+        )
+    # take control of garbage collection to avoid stragglers
+    gc_handler = utils.GarbageCollection(gc_freq=job_config.training.gc_freq)
+    device_module, device_type = utils.device_module, utils.device_type
+    device = torch.device(f"{device_type}:{int(os.environ['LOCAL_RANK'])}")
+    # Device has to be set before creating TorchFT manager.
+    device_module.set_device(device)
+    ft_manager = init_ft_manager(job_config)
+    # init distributed
+    world_size = int(os.environ["WORLD_SIZE"])
+    if not ft_manager.enabled:
+        parallel_dims = ParallelDims(
+            dp_shard=job_config.training.data_parallel_shard_degree,
+            dp_replicate=job_config.training.data_parallel_replicate_degree,
+            cp=job_config.experimental.context_parallel_degree,
+            tp=job_config.training.tensor_parallel_degree,
+            pp=job_config.experimental.pipeline_parallel_degree,
+            world_size=world_size,
+            enable_loss_parallel=not job_config.training.disable_loss_parallel,
+        )
+    else:
+        parallel_dims = FTParallelDims(
+            dp_shard=job_config.training.data_parallel_shard_degree,
+            dp_replicate=job_config.training.data_parallel_replicate_degree,
+            cp=job_config.experimental.context_parallel_degree,
+            tp=job_config.training.tensor_parallel_degree,
+            pp=job_config.experimental.pipeline_parallel_degree,
+            world_size=world_size,
+            enable_loss_parallel=not job_config.training.disable_loss_parallel,
+            ft_manager=ft_manager,
+        )
+    dist_utils.init_distributed(job_config)
+    # initialize device memory monitor and get peak flops for MFU calculation
+    device_memory_monitor = build_device_memory_monitor()
+    gpu_peak_flops = utils.get_peak_flops(device_memory_monitor.device_name)
+    logger.info(f"Peak FLOPS used for computing MFU: {gpu_peak_flops:.3e}")
+    # build meshes
+    world_mesh = parallel_dims.build_mesh(device_type=device_type)
+    if parallel_dims.dp_enabled:
+        dp_mesh = world_mesh["dp"]
+        dp_degree, dp_rank = dp_mesh.size(), dp_mesh.get_local_rank()
+    else:
+        dp_degree, dp_rank = 1, 0
+    if parallel_dims.pp_enabled:
+        raise NotImplementedError(
+            "Pipeline parallelism is not supported in this version"
+        )
+        """
+        ! TODO[flame]: We need to fix the pipeline parallelism for flame
+        [x] Match the key of models' components with the actual naming
+        [ ] Fix the post-init and tie-embedding for pipeline parallelism, HF's transformer automatically
+            forces to tie if head is None, we need to handle this case
+        [ ]
+        """
+        pp_mesh = world_mesh["pp"]
+    # Set random seed, and maybe enable deterministic mode (mainly for debugging, expect perf loss)
+    dist_utils.set_determinism(
+        world_mesh, device, job_config.training.seed, job_config.training.deterministic
+    )
+    train_spec = get_train_spec(job_config.model.name)
+    logger.info("Loading tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(
+        job_config.model.tokenizer_path,
+        trust_remote_code=True,
+        model_max_length=int(1e10),
+    )
+    logger.info(f"{tokenizer}")
+    logger.info(
+        f"Loading dataset {job_config.training.dataset}"
+        f":{job_config.training.dataset_name}"
+        if job_config.training.dataset_name is not None
+        else ""
+    )
+    dataset = build_dataset(
+        dataset=job_config.training.dataset,
+        dataset_name=job_config.training.dataset_name,
+        dataset_split=job_config.training.dataset_split,
+        data_dir=job_config.training.data_dir,
+        data_files=job_config.training.data_files,
+        data_probs=job_config.training.data_probs,
+        streaming=job_config.training.streaming,
+        dp_degree=dp_degree,
+        num_workers=job_config.training.num_workers,
+        seed=job_config.training.seed,
+    )
+    logger.info("Building dataloader...")
+    dataloader = build_dataloader(
+        dataset=dataset,
+        tokenizer=tokenizer,
+        rank=dp_rank,
+        world_size=dp_degree,
+        batch_size=job_config.training.batch_size,
+        seq_len=job_config.training.seq_len,
+        context_len=job_config.training.context_len,
+        varlen=job_config.training.varlen,
+        num_workers=job_config.training.num_workers,
+        pin_memory=job_config.training.pin_memory,
+        persistent_workers=job_config.training.persistent_workers,
+        snapshot_every_n_steps=job_config.checkpoint.interval,
+    )
+    logger.info(f"Loading model config from {job_config.model.config}")
+    model_config = AutoConfig.from_pretrained(job_config.model.config)
+    # set the model configs from training inputs:
+    # 1. norm type to decide which norm layer to use
+    # 2. disable fused norm if TP is enabled
+    # 3. vocab size from tokenizer
+    # 4. context_len base on inputs
+    if parallel_dims.tp_enabled:
+        if model_config.fuse_norm:
+            logger.warning(
+                f"{color.red}"
+                f"Fused norm is not compatible with tensor parallelism. "
+                f"Disabling it for now."
+                f"{color.reset}"
+            )
+            model_config.fuse_norm = False
+    if parallel_dims.loss_parallel_enabled:
+        if model_config.fuse_linear_cross_entropy:
+            logger.warning(
+                f"{color.red}"
+                f"Loss parallel enabled. Disabling fused cross entropy for now."
+                f"{color.reset}"
+            )
+            model_config.fuse_linear_cross_entropy = False
+    model_config.vocab_size = max(tokenizer.vocab_size, model_config.vocab_size)
+    logger.info(
+        f"Building model from the config\n{color.green}{model_config}{color.reset}"
+    )
+    with torch.device("meta"):
+        model = AutoModelForCausalLM.from_config(model_config)
+        if (
+            getattr(model_config, "fuse_linear_cross_entropy", False)
+            and FusedLinearCrossEntropyLoss is not None
+        ):
+            model.criterion = FusedLinearCrossEntropyLoss(
+                num_chunks=8 // parallel_dims.tp
+            )
+        # defer weight initialization until after parallelisms are applied
+        model.apply(lambda m: setattr(m, "_is_hf_initialized", False))
+    logger.info(f"{color.blue}\n{model}{color.reset}\n")
+    # Build the collection of model converters. No-op if `model.converters` empty
+    model_converters = build_model_converters(job_config, parallel_dims)
+    model_converters.convert(model)
+    # calculate model size and flops per token
+    model_param_count, num_flops_per_token = get_nparams_and_flops(
+        model, model_config, job_config.training.context_len
+    )
+    # move sharded model to CPU/GPU and initialize weights via DTensor
+    if job_config.checkpoint.create_seed_checkpoint:
+        init_device = "cpu"
+    elif job_config.training.enable_cpu_offload:
+        init_device = "cpu"
+    else:
+        init_device = device_type
+    # apply parallelisms and initialization
+    if parallel_dims.pp_enabled:
+        # apply PT-D Pipeline Parallel
+        (
+            pp_schedule,
+            model_parts,
+            has_first_stage,
+            has_last_stage,
+        ) = train_spec.pipelining_fn(
+            model,
+            pp_mesh,
+            parallel_dims,
+            job_config,
+            device,
+            model_config,
+            train_spec.loss_fn,
+        )
+        # when PP is enabled, `model` obj is no longer used after this point, model_parts is used instead
+        del model
+        # For PP with looped schedules, each item in model_parts is one stage-model-chunk.
+        # We need to iterate through model_parts to apply SPMD parallelisms, compilation,
+        # optimizer, and checkpointing
+        for m in model_parts:
+            # apply SPMD-style PT-D techniques
+            train_spec.parallelize_fn(m, world_mesh, parallel_dims, job_config)
+            m.to_empty(device=init_device)
+            with torch.no_grad():
+                m.post_init()
+            m.train()
+        # confirm that user will be able to view loss metrics on the console
+        ensure_pp_loss_visible(parallel_dims, job_config, color)
+    else:
+        # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel
+        train_spec.parallelize_fn(model, world_mesh, parallel_dims, job_config)
+        model.to_empty(device=init_device)
+        with torch.no_grad():
+            model.post_init()
+        model.train()
+        model_parts = [model]
+    device_mem_stats = device_memory_monitor.get_peak_stats()
+    logger.info(
+        f"{device_type.upper()} memory usage for model: "
+        f"{device_mem_stats.max_reserved_gib:.2f}GiB"
+        f"({device_mem_stats.max_reserved_pct:.2f}%)"
+    )
+    # build optimizer after applying parallelisms to the model
+    optimizers = train_spec.build_optimizers_fn(model_parts, job_config, ft_manager)
+    lr_schedulers = train_spec.build_lr_schedulers_fn(optimizers, job_config)
+    # Post optimizer step model converters hook.
+    # e.g. calculate float8 dynamic amax/scale for all-parameter for FSDP2
+    # where it issues a single all-reduce for all parameters at once for better performance
+    optimizers.register_step_post_hook(
+        lambda *args, **kwargs: model_converters.post_optimizer_hook(model_parts)
+    )
+    train_state = TrainState()
+    # load initial checkpoint
+    checkpoint = CheckpointManager(
+        dataloader=dataloader,
+        model_parts=model_parts,
+        optimizers=optimizers,
+        lr_schedulers=lr_schedulers,
+        states={"train_state": train_state},
+        job_config=job_config,
+        ft_manager=ft_manager,
+    )
+    if job_config.checkpoint.create_seed_checkpoint:
+        assert world_size == 1, (
+            "Must create seed checkpoint using a single device, to disable sharding"
+        )
+        assert job_config.checkpoint.enable_checkpoint, (
+            "Must enable checkpointing when creating a seed checkpoint"
+        )
+        checkpoint.save(curr_step=0, force=True)
+        logger.info("Created seed checkpoint")
+        return
+    checkpoint.load(step=job_config.checkpoint.load_step)
+    metric_logger = build_metrics_processor(job_config, parallel_dims)
+    # Set dependent attributes for metric_logger
+    metric_logger.num_flops_per_token = num_flops_per_token
+    metric_logger.optimizers = optimizers  # Pass optimizers if needed by logger logic
+    metric_logger.lr_schedulers = (
+        lr_schedulers  # Pass schedulers if needed by logger logic
+    )
+    # plot losses loaded from checkpoint (if any) to TensorBoard
+    # NOTE: Loss info after the last log step before checkpoint saving will not be ploted.
+    #       This can be avoided by setting checkpoint.interval to be a multiple of metrics.log_freq
+    if train_state.step > 0 and len(metric_logger.data_loading_times) > 0:
+        for idx, step in enumerate(train_state.log_steps):
+            metric_logger.log(
+                step,
+                global_avg_loss=train_state.global_avg_losses[idx],
+                global_max_loss=train_state.global_max_losses[idx],
+            )
+    data_iterator = iter(dataloader)
+    train_context = dist_utils.get_train_context(
+        parallel_dims.loss_parallel_enabled,
+        job_config.experimental.enable_compiled_autograd,
+    )
+    maybe_enable_amp = dist_utils.maybe_enable_amp(
+        parallel_dims,
+        job_config.training.mixed_precision_param,
+        device_type,
+    )
+    # variables used to keep info for metrics logging
+    device_memory_monitor.reset_peak_stats()
+    global_batch_size = (
+        job_config.training.batch_size
+        * dp_degree
+        * job_config.training.gradient_accumulation_steps
+    )
+    num_tokens_per_step = global_batch_size * job_config.training.seq_len
+    # train loop
+    logger.info(f"{color.red}***** Running training *****{color.reset}")
+    logger.info(f"{color.green}  Training starts at step {train_state.step + 1}")
+    logger.info(
+        f"{color.green}  Number of tokens per sequence = {job_config.training.seq_len:,}"
+    )
+    logger.info(
+        f"{color.green}  Gradient Accumulation steps = {job_config.training.gradient_accumulation_steps}"
+    )
+    logger.info(
+        f"{color.green}  Instantaneous batch size (per device) = {job_config.training.batch_size:,}"
+    )
+    logger.info(
+        f"{color.green}  Global batch size (w. parallel, distributed & accumulation) = {global_batch_size:,}"
+        f" ({num_tokens_per_step:,} tokens)"
+    )
+    logger.info(
+        f"{color.green}  Total optimization steps = {job_config.training.steps:,} "
+        f"({job_config.training.steps * num_tokens_per_step:,} tokens)"
+    )
+    logger.info(
+        f"{color.green}  Warmup steps = {job_config.lr_scheduler.warmup_steps:,}"
+        f" ({job_config.lr_scheduler.warmup_steps * num_tokens_per_step:,} tokens)"
+    )
+    logger.info(
+        f"{color.green}  Number of parameters = {model_param_count:,} {color.reset}"
+    )
+    with (
+        maybe_enable_profiling(
+            job_config, global_step=train_state.step
+        ) as torch_profiler,
+        maybe_enable_memory_snapshot(
+            job_config, global_step=train_state.step
+        ) as memory_profiler,
+    ):
+        while train_state.step < job_config.training.steps:
+            train_state.step += 1
+            gc_handler.run(train_state.step)
+            optimizers.zero_grad()
+            losses = []
+            # do gradient accumulation if enabled
+            for _ in range(job_config.training.gradient_accumulation_steps):
+                # get batch
+                data_load_start = time.perf_counter()
+                batch = next(data_iterator)
+                input_ids, labels = batch["input_ids"], batch["labels"]
+                # Update metrics processor state before forward/backward
+                metric_logger.ntokens_since_last_log += labels.numel()
+                metric_logger.data_loading_times.append(
+                    time.perf_counter() - data_load_start
+                )
+                input_ids = input_ids.to(device_type)
+                """
+                TODO[flame]: We need to carefully handle the position_ids for TP/CP
+                Depending on the Models'PE, the position_ids might be different.
+                e.g. for TP
+                    For RoPE, all ranks have the same position_ids. [FOR HF model]
+                    For sinusoidal, each rank has the coresponding chunked  position_ids. [FOR HF model]
+                e.g. for CP, [optional_context_parallel_ctx shoudl automatically distbute the position_ids]
+                    Each rank has the coresponding chunked position_ids. [FOR All model]
+                """
+                labels = labels.to(device_type)
+                cu_seqlens = (
+                    batch["cu_seqlens"].to(device_type)
+                    if "cu_seqlens" in batch
+                    else None
+                )
+                if cu_seqlens is not None:
+                    position_ids = prepare_position_ids(cu_seqlens).to(torch.int32)
+                else:
+                    position_ids = (
+                        torch.arange(0, input_ids.shape[1], device=device_type)
+                        .repeat(input_ids.shape[0], 1)
+                        .to(torch.int32)
+                    )
+                # apply context parallelism if cp is enabled
+                # ensure CP handles the separate freqs_cis buffer for each pp stage
+                optional_context_parallel_ctx = (
+                    dist_utils.create_context_parallel_ctx(
+                        cp_mesh=world_mesh["cp"],
+                        cp_buffers=[input_ids, labels, position_ids],
+                        cp_seq_dims=[1, 1, 1],
+                        cp_no_restore_buffers={input_ids, labels, position_ids},
+                        cp_rotate_method=job_config.experimental.context_parallel_rotate_method,
+                    )
+                    if parallel_dims.cp_enabled
+                    else None
+                )
+                # #! TODO[flame], we should distribute the position_ids as well with CP
+                if parallel_dims.pp_enabled:
+                    raise NotImplementedError(
+                        "Pipeline parallelism is not supported in this version"
+                    )
+                    # Pipeline Parallel forward / backward inside step() call
+                    with train_context(optional_context_parallel_ctx):
+                        targets, losses = (
+                            (labels, []) if has_last_stage else (None, None)
+                        )
+                        if has_first_stage:
+                            pp_schedule.step(input_ids, target=targets, losses=losses)
+                        else:
+                            pp_schedule.step(target=targets, losses=losses)
+                    # accumulate losses across pipeline microbatches
+                    # TODO: PP+FSDP unexpectedly puts the loss back to the CPU
+                    loss = (
+                        torch.mean(torch.stack(losses)).to(device)
+                        if has_last_stage
+                        else torch.tensor([-1.0], device=device)
+                    )
+                else:
+                    # Non-PP forward / backward
+                    with train_context(optional_context_parallel_ctx):
+                        with maybe_enable_amp:
+                            output = model(
+                                input_ids=input_ids,
+                                labels=labels,
+                                position_ids=position_ids,
+                                cu_seqlens=cu_seqlens,
+                        )
+                        loss = (
+                            output.loss
+                            / job_config.training.gradient_accumulation_steps
+                        )
+                        loss.backward()
+                losses.append(loss)
+            loss = sum(losses)
+            # clip gradients
+            grad_norm = dist_utils.clip_grad_norm_(
+                [p for m in model_parts for p in m.parameters()],
+                job_config.training.max_norm,
+                foreach=True,
+                pp_mesh=pp_mesh if parallel_dims.pp_enabled else None,
+            )
+            # optimizer step
+            checkpoint.maybe_wait_for_staging()
+            if job_config.training.skip_nan_inf and (
+                grad_norm.isnan() or grad_norm.isinf()
+            ):
+                logger.warning(
+                    f"Skipping optimizer step - detected invalid gradient norm: {grad_norm:.4f}"
+                )
+                optimizers.zero_grad()
+                train_state.skipped_step += 1
+            else:
+                optimizers.step()
+            lr_schedulers.step()
+            # log metrics - Use MetricsProcessor
+            if metric_logger.should_log(train_state.step):
+                if (
+                    parallel_dims.dp_replicate_enabled
+                    or parallel_dims.dp_shard_enabled
+                    or parallel_dims.cp_enabled
+                ):
+                    loss = loss.detach()
+                    # Use dist_mean/max on the accumulated loss for the step
+                    global_avg_loss, global_max_loss = (
+                        dist_utils.dist_mean(
+                            loss,
+                            world_mesh["dp_cp"],
+                        ),
+                        dist_utils.dist_max(
+                            loss,
+                            world_mesh["dp_cp"],
+                        ),
+                    )
+                else:
+                    # Scale back the loss before logging
+                    global_avg_loss = global_max_loss = loss.item()
+                # Update train state tokens and elapsed time
+                time_now = time.perf_counter()
+                time_delta = (
+                    time_now - metric_logger.time_last_log
+                )  # Use metric_logger's time
+                train_state.token += (
+                    metric_logger.ntokens_since_last_log  # Use tokens tracked by metric_logger
+                    * parallel_dims.world_size
+                    / parallel_dims.non_data_parallel_size
+                )
+                train_state.elapsed += timedelta(seconds=time_delta)
+                train_state.log_steps.append(train_state.step)
+                train_state.global_avg_losses.append(global_avg_loss)
+                train_state.global_max_losses.append(global_max_loss)
+                # Log using the metric processor
+                last_lr = lr_schedulers.schedulers[0].get_last_lr()[0]
+                eta = (
+                    train_state.elapsed
+                    * (job_config.training.steps - train_state.step)
+                    / train_state.step
+                )
+                metric_logger.log(
+                    train_state.step,
+                    global_avg_loss,
+                    global_max_loss,
+                    extra_metrics={
+                        "optimizer/lr": last_lr,
+                        "optimizer/grad_norm": grad_norm.item(),
+                        "optimizer/skipped_step": train_state.skipped_step,
+                    },
+                )
+                logger.info(
+                    f"{color.blue}lr: {last_lr:.4e} gnorm: {grad_norm:5.2f} "
+                    f"{color.magenta}[{str(train_state.elapsed).split('.')[0]:>8}<{str(eta).split('.')[0]:>8}]{color.reset}"
+                )
+            checkpoint.save(
+                train_state.step, force=(train_state.step == job_config.training.steps)
+            )
+            # signal the profiler that the next profiling step has started
+            if torch_profiler:
+                torch_profiler.step()
+            if memory_profiler:
+                memory_profiler.step()
+            # reduce timeout after first train step for faster signal
+            # (assuming lazy init and compilation are finished)
+            if train_state.step == 1:
+                dist_utils.set_pg_timeouts(
+                    timeout=timedelta(seconds=job_config.comm.train_timeout_seconds),
+                    world_mesh=world_mesh,
+                )
+    if torch.distributed.get_rank() == 0:
+        logger.info("Sleeping 2 seconds for other ranks to complete")
+        time.sleep(2)
+    metric_logger.close()
+    logger.info("Training completed")
+if __name__ == "__main__":
+    # Script entry point: initialise the logger, build the job configuration
+    # from the command line, then hand it to main().
+    init_logger()
+    config = JobConfig()
+    config.parse_args()
+    main(config)
+    # Tear down the default process group once main() has returned normally.
+    # This line is not reached if main() raises.
+    torch.distributed.destroy_process_group()

flame/utils/__init__.py ADDED Viewed

File without changes

flame/utils/convert_dcp_to_hf.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+import argparse
+import io
+import os
+import tempfile
+from datetime import timedelta
+import fla  # noqa
+import fla.models.gsa
+import torch
+import torch.serialization
+from torch.distributed.checkpoint.format_utils import dcp_to_torch_save
+from torchtitan.tools.logging import init_logger, logger
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+import custom_models
+@torch.inference_mode()
+def save_pretrained(
+    path: str,
+    step: int,
+    config: str,
+    tokenizer: str
+):
+    logger.info(f"Loading the config from {config}")
+    config = AutoConfig.from_pretrained(config, trust_remote_code=True)
+    logger.info(f"Saving the config to {path}")
+    config.save_pretrained(path)
+    logger.info(f"Loading the tokenizer from {tokenizer}")
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer, trust_remote_code=True)
+    logger.info(f"Saving the tokenizer to {path}")
+    tokenizer.save_pretrained(path)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        checkpoint = os.path.join(path, f'checkpoint/step-{step}')
+        checkpoint_path = os.path.join(tmpdir, 'checkpoint.pt')
+        logger.info(f"Saving the distributed checkpoint to {checkpoint_path}")
+        dcp_to_torch_save(checkpoint, checkpoint_path)
+        logger.info(f"Initializing the model from config\n{config}")
+        model = AutoModelForCausalLM.from_config(config)
+        logger.info(model)
+        logger.info("Loading state dict from the checkpoint")
+        # Add datetime.timedelta and io.BytesIO to safe globals
+        torch.serialization.add_safe_globals([timedelta, io.BytesIO])
+        # torch.load now with default weights_only=True will work
+        model.load_state_dict(torch.load(checkpoint_path, map_location='cpu')['model'])
+        logger.info(f"Saving the model to {path}")
+        model.save_pretrained(path)
+if __name__ == "__main__":
+    init_logger()
+    parser = argparse.ArgumentParser("Convert DCP format model weights to huggingface-style.")
+    parser.add_argument("--path", type=str, required=True)
+    parser.add_argument("--step", type=int, required=True)
+    parser.add_argument("--config", type=str, required=True)
+    parser.add_argument("--tokenizer", type=str, required=True)
+    args = parser.parse_args()
+    save_pretrained(args.path, args.step, args.config, args.tokenizer)

flame/utils/convert_hf_to_dcp.py ADDED Viewed

	@@ -0,0 +1,36 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+import argparse
+from pathlib import Path
+import torch
+import torch.distributed.checkpoint as DCP
+from transformers import AutoModelForCausalLM
+import fla  # noqa
+import fla.models.gsa
+import fla.models.routmem
+from torchtitan.tools.logging import init_logger, logger
+@torch.inference_mode()
+def convert_hf_weights(model: str, checkpoint: str):
+    logger.info(f"Loading model from {model}")
+    model = AutoModelForCausalLM.from_pretrained(model)
+    state_dict = model.state_dict()
+    logger.info(f"Writing to DCP at '{checkpoint}'")
+    checkpoint.mkdir(parents=True, exist_ok=True)
+    storage_writer = DCP.filesystem.FileSystemWriter(checkpoint, thread_count=8)
+    DCP.save(state_dict, storage_writer=storage_writer)
+if __name__ == "__main__":
+    init_logger()
+    parser = argparse.ArgumentParser(description="Convert huggingface-style model weights to DCP format.")
+    parser.add_argument("--model", type=str, required=True)
+    parser.add_argument("--checkpoint", type=Path, required=True)
+    args = parser.parse_args()
+    convert_hf_weights(args.model, args.checkpoint)

flame/utils/preprocess.py ADDED Viewed

	@@ -0,0 +1,122 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+import argparse
+from typing import Any, Dict, List
+from transformers import AutoTokenizer, PreTrainedTokenizer
+from flame.data import build_dataset
+from torchtitan.tools.logging import init_logger, logger
+def tokenize(
+    examples: Dict[str, List[Any]],
+    tokenizer: PreTrainedTokenizer,
+) -> Dict:
+    if 'text' in examples:
+        samples = examples['text']
+    elif 'content' in examples:
+        samples = examples['content']
+    else:
+        raise ValueError(f'No "text" or "content" field found in examples:\n{examples}')
+    input_ids = tokenizer(samples)['input_ids']
+    bits_per_token = [len(sample.encode(encoding='utf-8')) * 8 / len(input_ids[i]) for i, sample in enumerate(samples)]
+    return {'input_ids': input_ids, 'bits_per_token': bits_per_token}
+if __name__ == '__main__':
+    # Script entry point: parse the command line, load the tokenizer, build the
+    # dataset, tokenize it in batches and save the result to disk.
+    init_logger()
+    parser = argparse.ArgumentParser(description='Preprocess the dataset.')
+    parser.add_argument(
+        '--dataset',
+        default='HuggingFaceFW/fineweb-edu',
+        help='Dataset to use, with comma separated values',
+    )
+    parser.add_argument(
+        '--dataset_name',
+        default='sample-100BT',
+        help='The name of the dataset config, with comma separated values if provided',
+    )
+    parser.add_argument(
+        '--dataset_split',
+        default='train',
+        help='Dataset split to use, with comma separated values if provided',
+    )
+    parser.add_argument(
+        '--data_dir',
+        default=None,
+        help='Data dirs to use, with comma separated values if provided',
+    )
+    parser.add_argument(
+        '--data_files',
+        default=None,
+        help='Data files to use, with comma separated values if provided',
+    )
+    parser.add_argument(
+        '--data_probs',
+        default=None,
+        help='Data sampling probabilities, with comma separated values if provided',
+    )
+    parser.add_argument(
+        '--streaming',
+        action='store_true',
+        help='Whether to use streaming mode',
+    )
+    parser.add_argument(
+        '--num_workers',
+        type=int,
+        default=64,
+        help='Number of workers to use for preprocessing',
+    )
+    parser.add_argument(
+        '--seed',
+        type=int,
+        default=42,
+        help='Random seed for preprocessing',
+    )
+    parser.add_argument(
+        '--path',
+        default='data',
+        help='Path to save the preprocessed dataset',
+    )
+    parser.add_argument(
+        '--tokenizer',
+        default='fla-hub/transformer-1.3B-100B',
+        help='Tokenizer to use',
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=2048,
+        help="Batch size for processing"
+    )
+    args = parser.parse_args()
+    logger.info(f'Loading tokenizer {args.tokenizer}')
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
+    logger.info(f'{tokenizer}')
+    logger.info(f'Loading dataset {args.dataset} {args.dataset_name} {args.dataset_split}')
+    # The comma separated CLI values are forwarded unchanged; any splitting is
+    # left to build_dataset.
+    dataset = build_dataset(
+        dataset=args.dataset,
+        dataset_name=args.dataset_name,
+        dataset_split=args.dataset_split,
+        data_dir=args.data_dir,
+        data_files=args.data_files,
+        data_probs=args.data_probs,
+        streaming=args.streaming,
+        num_workers=args.num_workers,
+        seed=args.seed,
+    )
+    logger.info(f'Tokenizing and processing the dataset with batch size {args.batch_size}')
+    # `remove_columns` lists every key of the first example, so only the
+    # columns returned by tokenize() remain after the map.
+    # NOTE(review): `num_proc`, `desc` and the `save_to_disk` call below look
+    # specific to non-streaming datasets - confirm this works with --streaming.
+    dataset = dataset.map(
+        lambda examples: tokenize(examples, tokenizer),
+        batched=True,
+        batch_size=args.batch_size,
+        remove_columns=list(next(iter(dataset)).keys()),
+        num_proc=args.num_workers,
+        desc="Running tokenizer on dataset"
+    )
+    logger.info(f'{dataset}')
+    logger.info(f'Saving tokenized dataset to {args.path}')
+    dataset.save_to_disk(args.path)

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "transformers_version": "4.57.3"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,410 @@

+{
+  "metadata": {
+    "total_parameters": 14409815040,
+    "total_size": 28819630080
+  },
+  "weight_map": {
+    "model.embeddings.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attn.f_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.attn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.0.mlp_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attn.f_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.attn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.1.mlp_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.10.attn.f_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.attn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.10.mlp_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.attn.f_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.attn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.11.mlp_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.attn.f_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.attn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.12.mlp_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.13.attn.f_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.attn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.13.mlp_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attn.f_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.attn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.14.mlp_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attn.f_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.attn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.15.mlp_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attn.f_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.attn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.16.mlp_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.attn.f_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.attn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.17.mlp_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.attn.f_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.attn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.18.mlp_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.attn.f_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.attn.k_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.attn.o_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.attn.q_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.attn.v_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.attn_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.19.mlp_norm.weight": "model-00003-of-00006.safetensors",
+    "model.layers.2.attn.f_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.attn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.2.mlp_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.20.attn.f_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.attn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.20.mlp_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attn.f_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.attn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.21.mlp_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attn.f_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.attn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.22.mlp_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.attn.f_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.attn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.23.mlp_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.attn.f_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.attn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.24.mlp_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.attn.f_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.attn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.25.mlp_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.attn.f_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.attn.k_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.attn.o_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.attn.q_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.attn.v_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.attn_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
+    "model.layers.26.mlp_norm.weight": "model-00004-of-00006.safetensors",
+    "model.layers.27.attn.f_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.attn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.27.mlp_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attn.f_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.attn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.28.mlp_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.attn.f_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.attn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.29.mlp_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.3.attn.f_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.attn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.3.mlp_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.30.attn.f_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.attn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.30.mlp_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.attn.f_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.attn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.31.mlp_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.attn.f_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.attn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.32.mlp_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.attn.f_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.attn.k_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.attn.o_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.attn.q_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.attn.v_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.attn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
+    "model.layers.33.mlp_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.34.attn.f_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.34.attn.k_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.34.attn.o_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.34.attn.q_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.34.attn.v_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.34.attn_norm.weight": "model-00005-of-00006.safetensors",
+    "model.layers.34.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.34.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.34.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.34.mlp_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.attn.f_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.attn.k_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.attn.o_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.attn.q_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.attn.v_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.attn_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.35.mlp_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.attn.f_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.attn.k_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.attn.o_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.attn.q_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.attn.v_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.attn_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.36.mlp_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.attn.f_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.attn.k_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.attn.o_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.attn.q_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.attn.v_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.attn_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.37.mlp_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.attn.f_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.attn.k_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.attn.o_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.attn.q_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.attn.v_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.attn_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.38.mlp_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.attn.f_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.attn.k_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.attn.o_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.attn.q_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.attn.v_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.attn_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
+    "model.layers.39.mlp_norm.weight": "model-00006-of-00006.safetensors",
+    "model.layers.4.attn.f_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attn.k_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attn.o_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attn.q_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attn.v_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.attn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
+    "model.layers.4.mlp_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.attn.f_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.attn_norm.weight": "model-00001-of-00006.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.5.mlp_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attn.f_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.attn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.6.mlp_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attn.f_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.attn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.7.mlp_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attn.f_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.attn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.8.mlp_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attn.f_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attn.k_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attn.o_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attn.q_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attn.v_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.attn_norm.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
+    "model.layers.9.mlp_norm.weight": "model-00002-of-00006.safetensors",
+    "model.norm.weight": "model-00006-of-00006.safetensors"
+  }
+}

pyproject.toml ADDED Viewed

	@@ -0,0 +1,55 @@

+[project]
+name = "flame"
+dynamic = ["version"]
+description = "A minimal training framework for scaling FLA models"
+readme = "README.md"
+authors = [
+    { name = "Songlin Yang", email = "yangsl66@mit.edu" },
+    { name = "Yu Zhang", email = "yzhang.cs@outlook.com" },
+]
+license = { file = "LICENSE" }
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+requires-python = ">=3.10"
+dependencies = [
+    'flash-linear-attention',
+    'torch>=2.5',
+    'torchdata',
+    'transformers>=4.45.0',
+    'triton>=3.0',
+    'datasets>=3.3.0',
+    'einops',
+    'ninja',
+    'wandb',
+    'tiktoken',
+    'tensorboard',
+    "tyro>=1.0.3",
+    "torchtitan",
+    "psutil>=7.2.1",
+    "cmake>=4.2.1",
+    "packaging>=25.0",
+    "setuptools>=80.9.0",
+    "wheel>=0.45.1",
+    "flash-attn>=2.8.3",
+    "ipython>=8.37.0",
+]
+[project.optional-dependencies]
+dev = ["pytest"]
+[project.urls]
+Homepage = "https://github.com/fla-org/flame"
+[build-system]
+requires = ["setuptools>=45", "wheel", "ninja", "torch"]
+[tool.isort]
+line_length = 127
+multi_line_output = 3
+[tool.uv.sources]
+torchtitan = { git = "https://github.com/pytorch/torchtitan.git", rev = "0b44d4c" }

setup.py ADDED Viewed

	@@ -0,0 +1,51 @@

+# -*- coding: utf-8 -*-
+import ast
+import os
+import re
+from pathlib import Path
+from setuptools import find_packages, setup
+with open('README.md') as f:
+    long_description = f.read()
+def get_package_version():
+    with open(Path(os.path.dirname(os.path.abspath(__file__))) / 'flame' / '__init__.py') as f:
+        version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
+    return ast.literal_eval(version_match.group(1))
+setup(
+    name='flame',
+    version=get_package_version(),
+    description='A minimal training framework for scaling FLA models',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author='Songlin Yang, Yu Zhang',
+    author_email='yangsl66@mit.edu, yzhang.cs@outlook.com',
+    url='https://github.com/fla-org/flame',
+    packages=find_packages(),
+    license='MIT',
+    classifiers=[
+        'Programming Language :: Python :: 3',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: OS Independent',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence'
+    ],
+    python_requires='>=3.10',
+    install_requires=[
+        'flash-linear-attention',
+        'torch>=2.5',
+        'torchdata',
+        'transformers>=4.45.0',
+        'triton>=3.0',
+        'datasets>=3.3.0',
+        'einops',
+        'ninja',
+        'wandb',
+        'tiktoken',
+        'tensorboard',
+    ],
+)

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

train.sh ADDED Viewed

	@@ -0,0 +1,122 @@

+#!/usr/bin/bash
+params=""
+if [ $# -ne 0 ]; then
+    params="$*"
+fi
+# use envs as local params for convenience
+# e.g.
+# NNODE=1 NGPU=8 LOG_RANK=0 ./train.sh
+NNODE=${NNODE:-"1"}
+NGPU=${NGPU:-"8"}
+LOG_RANK=${LOG_RANK:-0}
+if [[ -z "${MASTER_ADDR}" ]]; then
+  export MASTER_ADDR="localhost"
+fi
+if [[ -z "${MASTER_PORT}" ]]; then
+  export MASTER_PORT="0"
+fi
+: '
+Usage:
+bash train.sh -h
+Training a 340M model:
+NNODE=1 NGPU=8 LOG_RANK=0 bash train.sh \
+  --job.config_file flame/models/fla.toml \
+  --job.dump_folder exp/transformer-340M-10B/batch32.seqlen2048.warmup1024.update1.steps20480.lr3e-4 \
+  --model.config configs/transformer_340M.json \
+  --model.tokenizer_path fla-hub/transformer-1.3B-100B \
+  --optimizer.name AdamW \
+  --optimizer.eps 1e-15 \
+  --optimizer.lr 3e-4 \
+  --lr_scheduler.warmup_steps 1024 \
+  --lr_scheduler.lr_min 0.1 \
+  --lr_scheduler.decay_type cosine \
+  --training.batch_size 32 \
+  --training.seq_len 2048 \
+  --training.gradient_accumulation_steps 1 \
+  --training.steps 20480 \
+  --training.max_norm 1.0 \
+  --training.skip_nan_inf \
+  --training.dataset HuggingFaceFW/fineweb-edu \
+  --training.dataset_name default \
+  --training.dataset_split train \
+  --training.streaming \
+  --training.num_workers 32 \
+  --training.prefetch_factor 2 \
+  --training.seed 42 \
+  --training.compile \
+  --training.tensor_parallel_degree 1 \
+  --training.disable_loss_parallel \
+  --checkpoint.interval 2048 \
+  --checkpoint.load_step -1 \
+  --metrics.log_freq 1
+'
+echo "Launching training..."
+set -x
+path=$(grep -oP '(?<=--job.dump_folder )[^ ]+' <<< "$params")
+steps=$(grep -oP '(?<=--training.steps )[^ ]+' <<< "$params")
+config=$(grep -oP '(?<=--model.config )[^ ]+' <<< "$params")
+tokenizer=$(grep -oP '(?<=--model.tokenizer_path )[^ ]+' <<< "$params")
+echo "Using Python at: $(which python)"
+model=$(
+  python -c "import fla, sys; import fla.models.gsa; import fla.models.routmem; from transformers import AutoConfig; print(AutoConfig.from_pretrained(sys.argv[1]).to_json_string())" "$config" | jq -r '.model_type'
+)
+mkdir -p $path
+cp * $path
+cp -r configs $path
+cp -r flame   $path
+cp -r 3rdparty/flash-linear-attention/fla $path
+cp -r 3rdparty/torchtitan/torchtitan $path
+# for offline systems
+# export TRANSFORMERS_OFFLINE=1
+# export HF_DATASETS_OFFLINE=1
+# export HF_HUB_OFFLINE=1
+if [ "$date" == "" ]; then
+  date=$(date +%Y%m%d%H%M)
+fi
+RUN_NAME="$model-$(basename $path)"
+RUN_ID="$RUN_NAME-$date"
+export WANDB_RESUME=allow
+if [[ -z "${WANDB_PROJECT}" ]]; then
+  export WANDB_PROJECT="fla"
+fi
+if [[ -z "${WANDB_NAME}" ]]; then
+  export WANDB_NAME="$RUN_NAME"
+fi
+if [[ -z "${WANDB_RUN_ID}" ]]; then
+  export WANDB_RUN_ID="$RUN_ID"
+fi
+PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
+torchrun --nnodes=${NNODE} \
+  --nproc_per_node=${NGPU} \
+  --rdzv_backend c10d \
+  --rdzv_endpoint "${MASTER_ADDR}:${MASTER_PORT}" \
+  --local-ranks-filter ${LOG_RANK} \
+  --role rank \
+  --tee 3 \
+  --log-dir $path/logs \
+  -m flame.train \
+  $params
+echo "TRAINING DONE!"
+echo "Converting the DCP checkpoints to HF format..."
+python -m flame.utils.convert_dcp_to_hf \
+  --path $path \
+  --step $steps \
+  --config $config \
+  --tokenizer $tokenizer
+echo "RUNNING DONE!"

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff