from dataclasses import dataclass
from typing import List, Optional, Union

from nemo.collections.common.parts.perf_metrics_utils import LLM_VOCAB_SIZE_MAP
|
|
@dataclass
class FLOPSConfig:
    """Contains the model hparams needed for FLOPS computations"""

    gbs: int
    enc_seq_len: Optional[int] = None
    hs: Optional[int] = None
    layers: Optional[int] = None
    ffn_hs: Optional[int] = None
    attention_heads: Optional[int] = None
    moe_router_topk: Optional[int] = None
    query_groups: Optional[int] = None
    img_seq_len: Optional[int] = None
    img_h: Optional[int] = None
    img_w: Optional[int] = None
    in_channels: Optional[int] = None
    patch_dim: Optional[int] = None
    class_token_len: Optional[int] = None
    projector_type: Optional[str] = None
    inp_s: Optional[int] = None
    model_pattern: Optional[str] = None
    vocab_size: Optional[int] = None
    model_channels: Optional[int] = None
    vec_in_dim: Optional[int] = None
    q_lora_rank: Optional[int] = None
    kv_lora_rank: Optional[int] = None
    qk_head_dim: Optional[int] = None
    qk_pos_emb_head_dim: Optional[int] = None
    v_head_dim: Optional[int] = None
    moe_layer_freq: Optional[Union[int, List[int]]] = None
    moe_shared_expert_intermediate_size: Optional[int] = None
    moe_ffn_hidden_size: Optional[int] = None
    mtp_num_layers: Optional[int] = None
    causal_self_attn: Optional[bool] = None
    is_hybrid_model: bool = False
    hybrid_override_pattern: Optional[str] = None
    mamba_state_dim: Optional[int] = None
    mamba_head_dim: Optional[int] = None
    mamba_num_groups: Optional[int] = None
    mamba_num_heads: Optional[int] = None
|
|
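# The formulas below follow the usual dense-transformer FLOPs accounting: a matmul
# with an m x n weight costs 2*m*n FLOPs per token (multiply + add), and the backward
# pass is counted as twice the forward pass. That yields the recurring factors of
# 6 (fwd + bwd per parameter per token) and 3 (fwd + bwd per forward matmul).
#
# Minimal usage sketch (field values are hypothetical, for illustration only):
#
#     config = FLOPSConfig(gbs=8, enc_seq_len=2048, hs=4096, layers=32,
#                          ffn_hs=11008, attention_heads=32, query_groups=32)
#     model_flops = llama2(config)
#
# For gpt3: 24*s*h^2 is the forward cost per layer of the QKV/output/MLP projections
# (this assumes ffn_hs = 4*h), 4*s^2*h covers the attention score and context matmuls,
# the factor 3 adds the backward pass, and 6*s*h*V is the LM head (fwd + bwd).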
def gpt3(config: FLOPSConfig):
    """Model FLOPs for GPT3 family"""

    vocab_size = LLM_VOCAB_SIZE_MAP["gpt3"]

    return (
        24 * config.gbs * config.enc_seq_len * config.hs * config.hs
        + 4 * config.gbs * config.enc_seq_len * config.enc_seq_len * config.hs
    ) * (3 * config.layers) + (6 * config.gbs * config.enc_seq_len * config.hs * vocab_size)
|
|
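# Term by term, with the shared factor 6*gbs*s*layers*h^2 pulled out (6 = fwd + bwd per MAC):
#   2                   -> query and output projections (h^2 each)
#   2 * (groups/heads)  -> K and V projections under grouped-query attention
#   3 * (ffn_hs/h)      -> gated MLP (up, gate, and down projections)
#   2 * (s/h)           -> attention score and context matmuls (not halved for causality)
#   V / (layers*h)      -> LM head, amortized over layers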
def llama2(config: FLOPSConfig):
    """Model FLOPs for llama2 family"""
    vocab_size = LLM_VOCAB_SIZE_MAP["llama2"]

    return (
        config.gbs
        * config.enc_seq_len
        * config.layers
        * config.hs
        * config.hs
        * (
            12
            + (12 * config.query_groups / config.attention_heads)
            + (18 * config.ffn_hs / config.hs)
            + (12 * config.enc_seq_len / config.hs)
            + (6 * vocab_size / (config.layers * config.hs))
        )
    )
|
|
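# Identical accounting to llama2 (GQA + gated MLP); only the vocabulary size differs.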
def llama3(config: FLOPSConfig):
    """Model FLOPs for llama3 family"""
    vocab_size = LLM_VOCAB_SIZE_MAP["llama3"]

    return (
        config.gbs
        * config.enc_seq_len
        * config.layers
        * config.hs
        * config.hs
        * (
            12
            + (12 * config.query_groups / config.attention_heads)
            + (18 * config.ffn_hs / config.hs)
            + (12 * config.enc_seq_len / config.hs)
            + (6 * vocab_size / (config.layers * config.hs))
        )
    )
|
|
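# Same shape as the llama formulas, but the MLP term is 2*(ffn_hs/h) rather than
# 3*(ffn_hs/h): the accounting assumes a non-gated, two-matrix MLP (e.g. the
# squared-ReLU MLP used by the Nemotron family) instead of a gated three-matrix one.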
def nemotron(config: FLOPSConfig):
    """Model FLOPs for nemotron family"""
    vocab_size = LLM_VOCAB_SIZE_MAP["nemotron"]

    return (
        config.gbs
        * config.enc_seq_len
        * config.layers
        * config.hs
        * config.hs
        * (
            12
            + (12 * config.query_groups / config.attention_heads)
            + (12 * config.ffn_hs / config.hs)
            + (12 * config.enc_seq_len / config.hs)
            + (6 * vocab_size / (config.layers * config.hs))
        )
    )
|
|
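# As llama2, except each token is routed to moe_router_topk experts, so the gated-MLP
# term is scaled by the number of active experts per token.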
def mixtral(config: FLOPSConfig):
    """Model FLOPs for mixtral family"""
    vocab_size = LLM_VOCAB_SIZE_MAP["mixtral"]

    return (
        config.gbs
        * config.enc_seq_len
        * config.layers
        * config.hs
        * config.hs
        * (
            12
            + (12 * config.query_groups / config.attention_heads)
            + (18 * config.moe_router_topk * config.ffn_hs / config.hs)
            + (12 * config.enc_seq_len / config.hs)
            + (6 * vocab_size / (config.layers * config.hs))
        )
    )
|
|
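# Unlike the closed-form families above, this builds the estimate component-wise.
# The leading 3 * 2 is fwd + bwd (3x a forward pass) times 2 FLOPs per multiply-add.
# Note: the MLP term assumes the MoE variant (moe_ffn_hidden_size and moe_router_topk
# must be set), and causal masking halves the score/context matmul work (0.5 factor).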
def qwen3(config: FLOPSConfig):
    """Model FLOPs for Qwen3 family"""
    causal_self_attn = True
    seq_len = config.enc_seq_len
    hidden_size = config.hs
    gated_linear_multiplier = 2

    attention_flops = (
        3
        * 2
        * config.gbs
        * config.layers
        * seq_len
        * hidden_size
        * hidden_size
        * (
            (config.query_groups / config.attention_heads * 2 + 1)
            + (seq_len / hidden_size * 2 * (0.5 if causal_self_attn else 1))
            + 1
        )
    )

    mlp_flops = (
        3
        * 2
        * config.gbs
        * config.layers
        * seq_len
        * hidden_size
        * (1 + gated_linear_multiplier)
        * (config.moe_ffn_hidden_size * config.moe_router_topk)
    )

    vocab_flops = 3 * 2 * config.gbs * seq_len * hidden_size * config.vocab_size

    return attention_flops + mlp_flops + vocab_flops
|
|
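# Megatron-style closed form, 72 = 24 * 3 (fwd + bwd). This matches the gpt3 formula
# above with ffn_hs = 4*h: the s/(6*h) term covers the attention score/context
# matmuls, and vocab_size/(12*h*layers) the output head amortized over layers.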
def bert(config: FLOPSConfig):
    """Model FLOPs for BERT family"""
    vocab_size = LLM_VOCAB_SIZE_MAP["bert"]

    return (
        72
        * config.gbs
        * config.layers
        * config.enc_seq_len
        * config.hs
        * config.hs
        * (1 + (config.enc_seq_len / (6 * config.hs)) + (vocab_size / (12 * config.hs * config.layers)))
    )
|
|
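# Generalized estimator for models without a per-family formula. Assumptions baked in:
# GQA-aware attention, optional causal halving of the score/context matmuls, a
# non-gated MLP (gated_linear_multiplier = 1), and, when moe_router_topk > 0, an MoE
# block in every other layer. expansion_factor = 3 * 2 * 2 is fwd + bwd (3) times
# 2 FLOPs per MAC times 2 for the paired matmuls behind each term (Q & output,
# K & V, score & context, MLP up & down); the single LM-head matmul divides by 2
# to undo that pairing.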
def transformer(config: FLOPSConfig):
    """Calculate FLOPs for a standard Transformer model.

    Note: This does not cover encoder-decoder models.
    """
    batch_size = config.gbs
    hidden_size = config.hs
    seq_length = config.enc_seq_len
    num_layers = config.layers
    num_attention_heads = config.attention_heads
    ffn_hidden_size = config.ffn_hs
    vocab_size = config.vocab_size

    if vocab_size is None:
        raise ValueError("vocab_size is required for transformer FLOPs calculation")

    query_groups = config.query_groups if config.query_groups is not None else num_attention_heads
    causal_self_attn = config.causal_self_attn if config.causal_self_attn is not None else False
    moe_router_topk = config.moe_router_topk if config.moe_router_topk is not None else 0
    kv_channels = hidden_size // num_attention_heads

    query_projection_size = kv_channels * num_attention_heads
    query_projection_to_hidden_size_ratio = query_projection_size / hidden_size

    if moe_router_topk == 0:
        num_dense_layers = num_layers
        num_moe_layers = 0
        num_experts_routed_to = 0
    else:
        num_moe_layers = num_layers // 2
        num_dense_layers = num_layers - num_moe_layers
        num_experts_routed_to = moe_router_topk

    gated_linear_multiplier = 1

    expansion_factor = 3 * 2 * 2

    if not causal_self_attn:
        attention_component = (
            1
            + (query_groups / num_attention_heads)
            + (seq_length / hidden_size)
        ) * query_projection_to_hidden_size_ratio
    else:
        attention_component = (
            1
            + (query_groups / num_attention_heads)
            + (seq_length / hidden_size / 2)
        ) * query_projection_to_hidden_size_ratio

    total_flops = (
        expansion_factor
        * batch_size
        * seq_length
        * num_layers
        * hidden_size
        * hidden_size
        * (
            attention_component
            + (
                ((ffn_hidden_size * num_dense_layers) + (ffn_hidden_size * num_experts_routed_to * num_moe_layers))
                * gated_linear_multiplier
                / (num_layers * hidden_size)
            )
            + (vocab_size / (2 * num_layers * hidden_size))
        )
    )

    return total_flops
|
|
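# ViT accounting: img_seq_len defaults to (img_h*img_w)/patch_dim^2 patches plus the
# class token. The 24 + 4*s/h factor mirrors the dense-transformer per-layer cost
# (ffn = 4*h), and the final term counts the patch-embedding projection at 2 FLOPs
# per MAC over the in_channels * img_h * img_w input elements.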
def clip_vit_l(config: FLOPSConfig):
    """Model FLOPs for CLIP ViT"""

    if config.img_seq_len is None:
        config.img_seq_len = (config.img_h * config.img_w) / (
            config.patch_dim * config.patch_dim
        ) + config.class_token_len

    return config.gbs * config.layers * config.hs * config.hs * config.img_seq_len * (
        24 + (4 * config.img_seq_len / config.hs)
    ) + (2 * config.gbs * config.hs * config.in_channels * config.img_h * config.img_w)
|
|
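# Projector cost only (the vision tower and LLM are counted separately). inp_s is the
# projector's input width; 6 = fwd + bwd per parameter per image token, and the MLP
# branch counts the two matrices ffn_hs x inp_s and ffn_hs x hs.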
def neva_projection(config: FLOPSConfig):
    """Model FLOPs for NeVA Projection"""

    if "mlp" in config.projector_type:
        return 6 * config.gbs * config.img_seq_len * config.ffn_hs * (config.inp_s + config.hs)
    elif config.projector_type == "affine":
        return 6 * config.gbs * config.img_seq_len * config.inp_s * config.hs
    else:
        raise ValueError(
            "NeVA Projections FLOPs calculator only supports 'mlp', 'mcore_mlp'"
            f" or 'affine' projector_type but found {config.projector_type}"
        )
|
|
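# FLUX transformer. Note: config.layers is indexed as a pair here,
# (num_joint_layers, num_single_layers), unlike the integer used elsewhere.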
def flux(config: FLOPSConfig):
    """Model FLOPs for FLUX"""

    hs = config.hs
    seq_len = config.model_channels + config.inp_s
    base_factor = 6 * config.gbs

    joint_layer_flops = (
        base_factor
        * config.layers[0]
        * (
            10 * hs * hs
            + 2 * hs * (config.model_channels + config.inp_s) * (1 + hs * 7)
            + 2 * (config.model_channels + config.inp_s) * hs
        )
    )

    single_layer_flops = (
        base_factor
        * config.layers[1]
        * seq_len
        * hs
        * (
            3
            + 1
            + 4 * hs
            + (3 * hs + 2 * seq_len)
            + 5 * hs
            + 1
        )
    )

    other_flops = base_factor * (
        config.inp_s * config.in_channels * hs
        + config.inp_s * hs * config.model_channels
        + config.vec_in_dim * hs
        + hs * hs
        + 2 * (config.model_channels * hs + hs * hs)
        + (config.inp_s * config.in_channels * hs) / config.gbs
    )

    return joint_layer_flops + single_layer_flops + other_flops
|
|
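# DeepSeek V3 splits the estimate into (a) attention score/context batched matmuls,
# counted exactly from the head dimensions (the 0.5 reflects causal masking), and
# (b) all linear layers, counted at 6 FLOPs per parameter per token from an explicit
# MLA + MoE parameter tally. MTP layers, when present, add their own attention, FFN,
# and output-head costs.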
def deepseekv3(config: FLOPSConfig):
    """Model FLOPs for DeepSeek V3"""

    bmm1_flops = (
        0.5 * (config.qk_head_dim + config.qk_pos_emb_head_dim) * config.attention_heads * (config.enc_seq_len**2)
    )
    bmm2_flops = 0.5 * config.v_head_dim * config.attention_heads * (config.enc_seq_len**2)
    per_input_attention_flops = 6 * (bmm1_flops + bmm2_flops) * config.layers
    if config.mtp_num_layers is not None:
        per_input_attention_flops += 6 * (bmm1_flops + bmm2_flops) * config.mtp_num_layers

    per_layer_mla_params = config.hs * config.q_lora_rank + config.q_lora_rank * (
        (config.qk_head_dim + config.qk_pos_emb_head_dim) * config.attention_heads
    )
    per_layer_mla_params += config.hs * config.qk_pos_emb_head_dim
    per_layer_mla_params += config.hs * config.kv_lora_rank + config.kv_lora_rank * (
        (config.qk_head_dim + config.v_head_dim) * config.attention_heads
    )
    per_layer_mla_params += config.v_head_dim * config.attention_heads * config.hs
    mla_params = per_layer_mla_params * config.layers
    if config.mtp_num_layers is not None:
        mla_params += per_layer_mla_params * config.mtp_num_layers

    dense_layer_ffn_params = config.hs * config.ffn_hs * 3
    per_shared_expert_params = config.hs * config.moe_shared_expert_intermediate_size * 3
    per_selected_expert_params = config.hs * config.moe_ffn_hidden_size * 3
    ffn_params = 0

    if isinstance(config.moe_layer_freq, int):
        moe_layer_pattern = [1 if (i % config.moe_layer_freq == 0) else 0 for i in range(config.layers)]
    else:
        moe_layer_pattern = config.moe_layer_freq
    for i in moe_layer_pattern:
        if i == 0:
            ffn_params += dense_layer_ffn_params
        else:
            ffn_params += per_shared_expert_params + (per_selected_expert_params * config.moe_router_topk)
    if config.mtp_num_layers is not None:
        for _ in range(config.mtp_num_layers):
            ffn_params += per_shared_expert_params + (per_selected_expert_params * config.moe_router_topk)
    per_input_params = mla_params + ffn_params
    per_input_linear_flops = 6 * per_input_params * config.enc_seq_len

    per_input_vocab_flops = 6 * config.vocab_size * config.hs * config.enc_seq_len
    if config.mtp_num_layers is not None:
        for _ in range(config.mtp_num_layers):
            per_input_vocab_flops += 6 * config.vocab_size * config.hs * config.enc_seq_len
            per_input_vocab_flops += 6 * config.hs * 2 * config.hs * config.enc_seq_len

    return (per_input_attention_flops + per_input_linear_flops + per_input_vocab_flops) * config.gbs
|
|
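# Helpers for hybrid (NemotronH-style) models: per-layer-type costs that
# _hybrid_model_flops combines according to the hybrid_override_pattern string.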
def _nemotronh_mlp_layer_flops(config: FLOPSConfig):
    """Model FLOPs for MLP layer. Assume gated linear unit."""
    return 6 * config.gbs * config.enc_seq_len * config.hs * config.ffn_hs * 3
|
|
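# Standard GQA attention layer: the bracketed terms are the Q projection (hs), the
# K and V projections scaled by query_groups/attention_heads, the causal score and
# context matmuls (s/2 each), and the output projection (hs).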
def _non_mla_attn_layer_flops(config: FLOPSConfig):
    """Model FLOPs for attention layer"""
    return (
        6
        * config.gbs
        * config.enc_seq_len
        * config.hs
        * (
            config.hs
            + config.query_groups / config.attention_heads * config.hs * 2
            + config.enc_seq_len / 2 * 2
            + config.hs
        )
    )
|
|
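# Mamba2-style layer: the first term covers in_proj (x and z streams, B and C at
# mamba_num_groups * mamba_state_dim each, plus per-head dt), the second the
# state-update matmuls of the scan, and the third out_proj. When mamba_num_heads is
# unset, the inner width d_in defaults to a 2x expansion of the hidden size.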
def _mamba_layer_flops(config: FLOPSConfig):
    """Model FLOPs for Mamba layer. Part of the scan FLOPs is ignored because the
    chunk size is not known from the model config."""
    assert config.mamba_state_dim is not None
    assert config.mamba_head_dim is not None
    assert config.mamba_num_groups is not None

    if config.mamba_num_heads:
        nheads = config.mamba_num_heads
    else:
        nheads = 2 * config.hs // config.mamba_head_dim
    d_in = nheads * config.mamba_head_dim

    return (
        (
            6
            * config.gbs
            * config.enc_seq_len
            * config.hs
            * (2 * d_in + 2 * config.mamba_num_groups * config.mamba_state_dim + nheads)
        )
        + (3 * 2 * config.gbs * config.enc_seq_len * d_in * config.mamba_state_dim)
        + (6 * config.gbs * config.enc_seq_len * d_in * config.hs)
    )
|
|
def _hybrid_model_flops(config: FLOPSConfig):
    """Model FLOPs for hybrid model"""
    assert config.is_hybrid_model
    assert config.hybrid_override_pattern is not None

    num_attn_layers, num_mamba_layers, num_mlp_layers = 0, 0, 0
    for c in config.hybrid_override_pattern:
        if c == 'M':
            num_mamba_layers += 1
        elif c == '-':
            num_mlp_layers += 1
        elif c == '*':
            num_attn_layers += 1

    return (
        num_attn_layers * _non_mla_attn_layer_flops(config)
        + num_mamba_layers * _mamba_layer_flops(config)
        + num_mlp_layers * _nemotronh_mlp_layer_flops(config)
        + 6 * config.gbs * config.enc_seq_len * config.hs * config.vocab_size
    )
|
|
def nemotronh(config: FLOPSConfig):
    """Model FLOPs for NemotronH"""
    return _hybrid_model_flops(config)