File size: 12,905 Bytes
b386992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple

import lightning.pytorch as pl
import numpy as np
import torch
from lightning.pytorch.callbacks import Callback

from nemo.collections.llm.gpt.model.base import GPTConfig
from nemo.lightning.pytorch.callbacks import PEFT
from nemo.utils import flops_formulas, logging
from nemo.utils.hyena_flops_formulas import hyena


# Public API of this module.
__all__ = ["FLOPsMeasurementCallback", "MM_FLOPsMeasurementCallback"]

# Maps a lowercase model-family keyword to its analytical FLOPs formula.
# Keys are matched as *substrings* of the user-supplied model name
# (see FLOPsMeasurementCallback.eval_model_flops), so e.g. "llama3-8b"
# resolves to the "llama3" entry.
_model_flops_map = {
    "gpt3": flops_formulas.gpt3,
    "llama2": flops_formulas.llama2,
    "llama3": flops_formulas.llama3,
    "llama4": flops_formulas.llama3,  # TODO: add llama4 flops formulas
    "nemotron3": flops_formulas.nemotron,
    "nemotron4": flops_formulas.nemotron,
    "mixtral": flops_formulas.mixtral,
    "bert": flops_formulas.bert,
    "hyena": hyena,
    "deepseekv3": flops_formulas.deepseekv3,
    "transformer": flops_formulas.transformer,
    "qwen3": flops_formulas.qwen3,
    "nemotronh": flops_formulas.nemotronh,
}


class FLOPsMeasurementCallback(Callback):
    """
    Calculate and log FLOPs per second after every ``trainer.log_every_n_steps`` steps.

    Relies on a ``TimingCallback`` populating ``trainer.progress_bar_metrics``
    with ``'train_step_timing in s'``; without it no TFLOPS metric is logged.

    Args:
        model_config (GPTConfig): Model parameters.
        data_config (pl.LightningDataModule): Data module being used in the experiment.
        model_name (str): Name of the model being run. The following models are supported:
            gpt3, llama2, llama3, nemotron, mixtral, bert, hyena.
    """

    higher_is_better = True

    def __init__(
        self,
        model_config: GPTConfig,
        data_config: pl.LightningDataModule,
        model_name: str,
    ):
        self.model_cfg = model_config
        self.data_cfg = data_config

        # use config params only when NOT provided explicitly
        self.model = model_name

        gbs = self.data_cfg.global_batch_size
        enc_seq_len = self.model_cfg.seq_length
        hs = self.model_cfg.hidden_size
        layers = self.model_cfg.num_layers
        ffn_hs = self.model_cfg.ffn_hidden_size
        attention_heads = self.model_cfg.num_attention_heads
        moe_router_topk = self.model_cfg.moe_router_topk
        model_pattern = getattr(self.model_cfg, "hybrid_override_pattern", None)
        # vocab size is only known when the data module carries a tokenizer
        vocab_size = self.data_cfg.tokenizer.vocab_size if hasattr(self.data_cfg, "tokenizer") else None

        # this handles both- 1. key is present, value is None; 2. key is absent
        query_groups = self.model_cfg.num_query_groups
        if query_groups is None:
            query_groups = attention_heads

        config_kwargs = {
            "gbs": gbs,
            "enc_seq_len": enc_seq_len,
            "hs": hs,
            "layers": layers,
            "ffn_hs": ffn_hs,
            "attention_heads": attention_heads,
            "moe_router_topk": moe_router_topk,
            "query_groups": query_groups,
            "vocab_size": vocab_size,
            "model_pattern": model_pattern,
        }

        # local import: megatron is heavyweight and only needed for the MLA check
        from megatron.core.transformer.transformer_config import MLATransformerConfig

        # MLA (multi-latent attention) models expose extra head/rank dims
        if isinstance(self.model_cfg, MLATransformerConfig):
            config_kwargs["qk_head_dim"] = self.model_cfg.qk_head_dim
            config_kwargs["qk_pos_emb_head_dim"] = self.model_cfg.qk_pos_emb_head_dim
            config_kwargs["v_head_dim"] = self.model_cfg.v_head_dim
            config_kwargs["q_lora_rank"] = self.model_cfg.q_lora_rank
            config_kwargs["kv_lora_rank"] = self.model_cfg.kv_lora_rank
        config_kwargs["moe_layer_freq"] = self.model_cfg.moe_layer_freq
        config_kwargs["moe_shared_expert_intermediate_size"] = self.model_cfg.moe_shared_expert_intermediate_size
        config_kwargs["moe_ffn_hidden_size"] = self.model_cfg.moe_ffn_hidden_size
        config_kwargs["mtp_num_layers"] = self.model_cfg.mtp_num_layers

        # hybrid (attention + mamba) models need the layer pattern and mamba dims
        if self.model_cfg.is_hybrid_model:
            config_kwargs['is_hybrid_model'] = True
            config_kwargs['hybrid_override_pattern'] = self.model_cfg.hybrid_override_pattern
            config_kwargs['mamba_state_dim'] = self.model_cfg.mamba_state_dim
            config_kwargs['mamba_head_dim'] = self.model_cfg.mamba_head_dim
            config_kwargs['mamba_num_groups'] = self.model_cfg.mamba_num_groups
            config_kwargs['mamba_num_heads'] = self.model_cfg.mamba_num_heads

        self.flops_config = flops_formulas.FLOPSConfig(**config_kwargs)

        # model names in _model_flops_map are lowercase; normalize for matching
        self.model = self.model.lower() if self.model is not None else self.model

        # running sum of per-step times, reset every log window
        self.avg_train_step_time = 0

    def on_train_start(self, trainer, pl_module):
        """
        PyTorch Lightning callback hook. Ensures that user is not using PEFT
        as FLOPS callback does not support it.
        """
        for callback in trainer.callbacks:
            if isinstance(callback, PEFT):
                raise NotImplementedError("FLOPs measurement not supported for finetuning jobs")

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int):
        """
        PyTorch Lightning callback hook to calculate TFLOPs per sec per GPU after training
        """
        try:
            self.avg_train_step_time += trainer.progress_bar_metrics['train_step_timing in s']
        except KeyError:
            # use the module logger (not print) so the message respects NeMo's logging config
            logging.warning(
                "'train_step_timing in s' not found. Make sure to use TimingCallback with FLOPsMeasurementCallback."
            )
        n = trainer.strategy.current_epoch_step
        if n % trainer.log_every_n_steps == 0:
            # skip calculation if we haven't accumulated any timing data
            if self.avg_train_step_time == 0:
                return
            train_step_time = self.avg_train_step_time / trainer.log_every_n_steps
            tflops_per_gpu, flops = self.eval_tflops_per_sec_per_gpu(train_step_time)
            self.avg_train_step_time = 0
            pl_module.log(
                "TFLOPS_per_GPU",
                tflops_per_gpu,
                on_step=True,
                on_epoch=False,
                batch_size=1,
                prog_bar=True,
            )

            tflops = flops / (1e12 * train_step_time)
            pl_module.log(
                "TFLOPS",
                tflops,
            )

    def eval_tflops_per_sec_per_gpu(self, train_step_time: List | float | int) -> Tuple[float, float]:
        """
        Args:
            train_step_time (Any[List, float, int]): Train step time (in seconds).
            Step time will be less stable for initial steps (~10 steps)- less
            accurate measurement
            Use average step time over several steps for higher accuracy
        Returns:
            (Tuple[float, float]): Model TFLOPs per sec per gpu, and total model FLOPs
        """
        total_flops, flops_per_gpu = self.eval_model_flops()

        if not isinstance(train_step_time, list):
            train_step_time = [train_step_time]
        # drop the first (warm-up) half of the step times, then average the rest
        step_time_arr = np.array(train_step_time)
        train_step_time = np.mean(step_time_arr[len(step_time_arr) // 2 :])

        flops_per_sec_per_gpu = flops_per_gpu / (1e12 * train_step_time)

        return flops_per_sec_per_gpu, total_flops

    def eval_model_flops(self) -> Tuple[float, float]:
        """
        Calculate model FLOPs for a given model

        Returns:
            (Tuple[float, float]): total model FLOPs, and FLOPs per GPU
        """
        if self.model is not None:
            # substring match: e.g. "llama3-8b" resolves to the "llama3" formula
            model_matches = [model for model in _model_flops_map if model in self.model]
            self.model = model_matches[0] if len(model_matches) > 0 else self.model
        if self.model not in _model_flops_map:
            logging.info(f"FLOPs measurement supported for {list(_model_flops_map.keys())}")
            raise KeyError(f"Failed to extract valid model name from or missing FLOPs calculations for {self.model}")

        total_flops = _model_flops_map[self.model](self.flops_config)
        num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
        flops_per_gpu = total_flops / num_devices

        return total_flops, flops_per_gpu


class MM_FLOPsMeasurementCallback(FLOPsMeasurementCallback):
    """
    Calculate and log FLOPs per second after every ``trainer.log_every_n_steps`` steps for multi-modal models.
    The following models are supported:
            hf_clip_vit_l, neva_projection, gpt3, llama2, llama3, nemotron, mixtral, bert, hyena

    Args:
        model_name_config_dict (dict):
            Dictionary containing all the individual model configs that make up the multi-modal model.
        data_config (pl.LightningDataModule): Data module being used in the experiment.
    """

    higher_is_better = True

    def __init__(
        self,
        model_name_config_dict: dict,
        data_config: pl.LightningDataModule,
    ):
        # NOTE: deliberately does not call super().__init__(); the parent
        # constructor expects a single GPTConfig, while this callback builds
        # one FLOPSConfig per sub-model instead.
        self.data_cfg = data_config
        self.flops_config_dict = dict()

        for model_name, model_cfg in model_name_config_dict.items():
            kwargs = dict()
            kwargs["gbs"] = self.data_cfg.global_batch_size
            kwargs["hs"] = model_cfg.hidden_size
            if model_name in ["hf_clip_vit_l"]:
                # vision tower: HF CLIP ViT-L config fields
                kwargs["layers"] = model_cfg.num_hidden_layers
                kwargs["img_seq_len"] = model_cfg.num_image_embeddings_per_tile
                kwargs["img_h"] = model_cfg.image_size
                kwargs["img_w"] = model_cfg.image_size
                kwargs["patch_dim"] = model_cfg.patch_size
                kwargs["in_channels"] = model_cfg.num_channels
                kwargs["class_token_len"] = 1  # TODO: Add directly to HFCLIPVisionConfig
            elif model_name in ["neva_projection"]:
                # vision->language projector
                kwargs["projector_type"] = model_cfg.projector_type
                kwargs["ffn_hs"] = model_cfg.ffn_hidden_size
                kwargs["inp_s"] = model_cfg.input_size
                # TODO: Add img_seq_len directly to MultimodalProjectorConfig
                kwargs["img_seq_len"] = model_name_config_dict["hf_clip_vit_l"].num_image_embeddings_per_tile
            elif model_name in ["flux"]:
                # diffusion transformer: joint + single stream layer counts
                kwargs["layers"] = [model_cfg.num_joint_layers, model_cfg.num_single_layers]
                kwargs["hs"] = model_cfg.hidden_size
                kwargs["model_channels"] = model_cfg.model_channels
                kwargs["inp_s"] = model_cfg.context_dim
                kwargs["in_channels"] = model_cfg.in_channels
                kwargs["vec_in_dim"] = model_cfg.vec_in_dim
            else:
                # language model backbone (gpt3/llama/nemotron/mixtral/bert/...)
                kwargs["enc_seq_len"] = model_cfg.seq_length
                kwargs["layers"] = model_cfg.num_layers
                kwargs["ffn_hs"] = model_cfg.ffn_hidden_size
                kwargs["attention_heads"] = model_cfg.num_attention_heads
                kwargs["moe_router_topk"] = model_cfg.moe_router_topk

            try:
                query_groups = model_cfg.num_query_groups
                if query_groups is None:
                    query_groups = model_cfg.num_attention_heads
                kwargs["query_groups"] = query_groups
            except AttributeError:
                # Multi-modal models use HF model configs which may/may not define num_query_groups
                pass

            self.flops_config_dict[model_name] = flops_formulas.FLOPSConfig(**kwargs)

        # running sum of per-step times, reset every log window (see parent)
        self.avg_train_step_time = 0

    def eval_model_flops(self) -> Tuple[float, float]:
        """
        Calculate model FLOPs for a given model recursively when model has multiple sub-models

        Returns:
            (Tuple[float, float]): total model FLOPs summed over sub-models, and FLOPs per GPU
        """

        # Add Multimodal models supported only by MM_FLOPsMeasurementCallback
        mm_model_flops_map = {
            **_model_flops_map,
            "hf_clip_vit_l": flops_formulas.clip_vit_l,
            "neva_projection": flops_formulas.neva_projection,
            "flux": flops_formulas.flux,
        }

        total_flops = flops_per_gpu = 0
        for model_name, flops_cfg in self.flops_config_dict.items():
            if model_name not in mm_model_flops_map:
                logging.info(f"FLOPs measurement supported for {list(mm_model_flops_map.keys())}")
                raise KeyError(
                    f"Failed to extract valid model name from or missing FLOPs calculations for {model_name}"
                )
            total_flops += mm_model_flops_map[model_name](flops_cfg)
        num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
        flops_per_gpu = total_flops / num_devices

        return total_flops, flops_per_gpu