arithmetic-grpo / tests /utils /test_flops_counter.py

initial clean commit

1faccd4 29 days ago

20.4 kB

	# Copyright 2024 Bytedance Ltd. and/or its affiliates
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import math

	import pytest

	from verl.utils.flops_counter import FlopsCounter

	VALID_CONFIG_TYPE = {"llama", "qwen2", "qwen3", "qwen3_moe", "deepseek_v3", "mistral", "gemma3_text", "apertus"}


	class Config:
	def __init__(self, config_dict):
	for key, value in config_dict.items():
	if isinstance(value, dict):
	value = Config(value)
	setattr(self, key, value)


	CONFIG = {
	"llama": {
	"config": { # llama2-7B
	"model_type": "llama",
	"vocab_size": 32000,
	"hidden_size": 4096,
	"intermediate_size": 11008,
	"num_hidden_layers": 32,
	"num_attention_heads": 32,
	"num_key_value_heads": 32,
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# 6(vocabhidden2+layer(hidden(q+k+v+headhead_dim)+ hiddeninter3))*token_sum +
	# 6sum(seqlen^2)layerheadhead_dim
	# 6(3200040962+32(409640964+4096110083))*(512+1024+2048) +
	# 6(512512+10241024+20482048)324096
	# 6(3200040962+32(409640964+4096110083))*(4096+4096+4096) +
	# 6(40964096+40964096+40964096)324096
	"expected_flops_tuple": (149226491215872 / 1e12, 536372695793664 / 1e12),
	},
	"qwen2": {
	"config": { # Qwen/Qwen2.5-7B-Instruct
	"model_type": "qwen2",
	"vocab_size": 152064,
	"hidden_size": 3584,
	"intermediate_size": 18944,
	"num_hidden_layers": 28,
	"num_attention_heads": 28,
	"num_key_value_heads": 4,
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# 6(vocabhidden2+layer(hidden(q+k+v+headhead_dim)+ hiddeninter3))*token_sum +
	# 6sum(seqlen^2)layerheadhead_dim
	# 6(15206435842+28(3584(3584+512+512+3584)+3584189443))(512+1024+2048) +
	# 6(512512+10241024+20482048)283584
	# 6(15206435842+28(3584(3584+512+512+3584)+3584189443))(4096+4096+4096) +
	# 6(40964096+40964096+40964096)283584
	"expected_flops_tuple": (167073690943488 / 1e12, 591764889010176 / 1e12),
	},
	"qwen3": {
	"config": { # Qwen/Qwen3-8B
	"model_type": "qwen3",
	"vocab_size": 151936,
	"hidden_size": 4096,
	"intermediate_size": 12288,
	"num_hidden_layers": 36,
	"num_attention_heads": 32,
	"num_key_value_heads": 8,
	"head_dim": 128,
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# 6(vocabhidden2+layer(hidden(q+k+v+headhead_dim)+ hiddeninter3))*token_sum +
	# 6sum(seqlen^2)layerheadhead_dim
	# 6(15193640962+36(4096(12832+12882+12832)+4096122883))(512+1024+2048) +
	# 6(512512+10241024+20482048)36128*32
	# 6(15193640962+36(4096(12832+12882+12832)+4096122883))(4096+4096+4096) +
	# 6(40964096+40964096+40964096)36128*32
	"expected_flops_tuple": (180997438046208 / 1e12, 648394032807936 / 1e12),
	},
	"qwen3_moe": {
	"config": { # Qwen/Qwen3-30B-A3B-Base
	"model_type": "qwen3_moe",
	"hidden_size": 2048,
	"vocab_size": 151936,
	"num_hidden_layers": 48,
	"num_key_value_heads": 4,
	"num_attention_heads": 32,
	"head_dim": 128,
	"moe_intermediate_size": 768,
	"num_experts_per_tok": 8,
	"num_experts": 128,
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# 6(vocabhidden2+layer(hidden(q+k+v+headhead_dim)+hiddenintertop_k_exp*3 +
	# hiddennum_experts))token_sum + 6sum(seqlen^2)layerheadhead_dim
	# 6(15193620482+48(2048(12832+12842+12832)+204876883+2048128))(512+1024+2048) +
	# 6(512512+10241024+20482048)48128*32
	# 6(15193620482+48(2048(12832+12842+12832)+204876883+2048128))(4096+4096+4096) +
	# 6(40964096+40964096+40964096)48128*32
	"expected_flops_tuple": (78593069678592 / 1e12, 306570470621184 / 1e12),
	},
	"deepseek_v3": {
	"config": { # deepseek-ai/DeepSeek-Prover-V2-671B
	"model_type": "deepseek_v3",
	"hidden_size": 7168,
	"vocab_size": 129280,
	"moe_intermediate_size": 2048,
	"num_hidden_layers": 61,
	"first_k_dense_replace": 3,
	"num_attention_heads": 128,
	"n_routed_experts": 256,
	"num_experts_per_tok": 8,
	"n_shared_experts": 1,
	"kv_lora_rank": 512,
	"qk_rope_head_dim": 64,
	"v_head_dim": 128,
	"intermediate_size": 18432,
	"qk_nope_head_dim": 128,
	"q_lora_rank": 1536,
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# (15367168+1281921536+7168(512+64)+128(128+128)512+1281287168) = 187105280
	# 6(12928071682+ 3(7168184323+187105280)+ 58(187105280+7168256+7168204893))(512+1024+2048) +
	# 3(512512+10241024+20482048)61(192+128)*128
	# 6(12928071682+ 3(7168184323+187105280)+ 58(187105280+7168256+7168204893))(4096+4096+4096) +
	# 3(40964096+40964096+40964096)61(192+128)*128
	"expected_flops_tuple": (848766538088448 / 1e12, 3145850406567936 / 1e12),
	},
	"mistral": {
	"config": { # mistralai/Mistral-Small-24B-Instruct-2501
	"model_type": "mistral",
	"vocab_size": 131072,
	"hidden_size": 5120,
	"intermediate_size": 32768,
	"num_hidden_layers": 40,
	"num_attention_heads": 32,
	"num_key_value_heads": 8,
	"head_dim": 128,
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# Mistral uses same architecture as Llama, with GQA
	# 6(vocabhidden2+layer(hidden(q+k+v+headhead_dim)+ hiddeninter3))*token_sum +
	# 12sum(seqlen^2)layerheadhead_dim
	# vocab part: 13107251202 = 1342177280
	# attn part per layer: 5120(12832+1288+1288+12832) = 512010240 = 52428800
	# mlp part per layer: 5120327683 = 503316480
	# total per layer: 52428800 + 503316480 = 555745280
	# all layers: 1342177280 + 40*555745280 = 23571988480
	# For batch [512, 1024, 2048], tokens_sum = 3584:
	# dense flops: 6 * 23571988480 * 3584 = 506892040273920
	# attn flops: 6 * 5505024 * 40 * 128 * 32 = 10823317585920
	# total: 517715357859840 / 1e12 = 517.71535785984
	# For batch [4096, 4096, 4096], tokens_sum = 12288:
	# dense flops: 6 * 23571988480 * 12288 = 1737915566653440
	# attn flops: 6 * 50331648 * 40 * 128 * 32 = 98956046499840
	# total: 1836871613153280 / 1e12 = 1836.87161315328
	"expected_flops_tuple": (512303699066880 / 1e12, 1787393589903360 / 1e12),
	},
	"gemma3_text": {
	"config": { # Gemma3-12B-IT-TextOnly
	"model_type": "gemma3_text",
	"vocab_size": 262208,
	"hidden_size": 3840,
	"intermediate_size": 15360,
	"num_hidden_layers": 48,
	"num_attention_heads": 16,
	"num_key_value_heads": 8,
	"head_dim": 256,
	"sliding_window": 1024,
	"layer_types": None,
	# Will be auto-generated based on sliding_window_pattern
	"sliding_window_pattern": 6,
	# Every 6th layer is full attention
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# Gemma3 has alternating sliding window attention
	# With sliding_window_pattern=6: layers 5,11,17,23,29,35,41,47 use full attention (8 layers)
	# Other 40 layers use sliding window attention with window_size=1024
	#
	# Non-attention FLOPs:
	# vocab part: 26220838402 = 2013757440
	# attn part per layer: 3840(25616+2568+2568+25616) = 384012288 = 47185920
	# mlp part per layer: 3840153603 = 176947200
	# total per layer: 47185920 + 176947200 = 224133120
	# all layers: 2013757440 + 48*224133120 = 12772147200
	#
	# For batch [512, 1024, 2048], tokens_sum = 3584:
	# dense flops: 6 * 12772147200 * 3584 = 274652253388800
	# seqlen_square_sum: 180355072 (calculated with sliding window logic)
	# attn flops: 6 * 180355072 * 256 * 16 = 8864812498944
	# total: 283517065887744 / 1e12 = 283.517065887744
	#
	# For batch [4096, 4096, 4096], tokens_sum = 12288:
	# dense flops: 6 * 12772147200 * 12288 = 941664868761600
	# seqlen_square_sum: 905969664 (calculated with sliding window logic)
	# attn flops: 6 * 905969664 * 256 * 16 = 44530220924928
	# total: 986195089686528 / 1e12 = 986.195089686528
	"expected_flops_tuple": (279084659638272 / 1e12, 963929979224064 / 1e12),
	},
	"gpt_oss": {
	"config": {
	"model_type": "gpt_oss",
	"vocab_size": 201088,
	"hidden_size": 2880,
	"num_hidden_layers": 24,
	"num_attention_heads": 64,
	"num_key_value_heads": 8,
	"head_dim": 64,
	"intermediate_size": 2880,
	"num_local_experts": 32,
	"num_experts_per_tok": 4,
	"sliding_window": 128,
	"layer_types": [
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	"sliding_attention",
	"full_attention",
	],
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# GPT-OSS has alternating sliding / full attention
	# Even layers (12 layers) use sliding window attention with window_size = 128
	# Odd layers (12 layers) use full attention
	#
	# Non-attention FLOPs:
	# vocab part: 201088 * 2880 * 2 = 1158266880
	# attn linear part per layer:
	# Q: 2880 * (64 * 64) = 11796480
	# K: 2880 * (8 * 64) = 1474560
	# V: 2880 * (8 * 64) = 1474560
	# O: (64 * 64) * 2880 = 11796480
	# attn linear total = 26542080
	# mlp (MoE, SwiGLU) part per layer:
	# gate: 2880 * 32 = 92160
	# active experts: 3 * 2880 * 2880 * 4 = 99532800
	# mlp total = 99624960
	# total per layer: 26542080 + 99624960 = 126167040
	# all layers:
	# 126167040 * 24 = 3028008960
	# total dense params:
	# 3028008960 + 1158266880 = 4186275840
	#
	# For batch [512, 1024, 2048], tokens_sum = 3584:
	# dense flops: 6 * 4186275840 * 3584 = 90021675663360
	# seqlen_square_sum: 71565312 (calculated with sliding window logic)
	# attn flops: 6 * 71565312 * 64 * 64 = 3517578215424
	# total: 93539253878784 / 1e12 = 93.539253878784
	#
	# For batch [4096, 4096, 4096], tokens_sum = 12288:
	# dense flops: 6 * 4186275840 * 12288 = 308646629068800
	# seqlen_square_sum: 622854144 (calculated with sliding window logic)
	# attn flops: 6 * 622854144 * 64 * 64 = 30613642948608
	# total: 339260272017408 / 1e12 = 339.260272017408
	"expected_flops_tuple": (91780464771072 / 1e12, 323953008574464 / 1e12),
	},
	"apertus": {
	"config": { # swiss-ai/Apertus-8B
	"model_type": "apertus",
	"vocab_size": 131072,
	"hidden_size": 4096,
	"intermediate_size": 21504,
	"num_hidden_layers": 32,
	"num_attention_heads": 32,
	"num_key_value_heads": 32,
	"hidden_act": "xielu",
	# head_dim will be derived as 4096 / 32 = 128
	},
	"batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# Calculation for Apertus (hidden_act="xielu" -> MLP uses [k_mlp=2]HI params; qk_norm=True -> [k_qkn=2]*H):
	# V=131072, H=4096, I=21504, L=32, k_mlp=2 (XIELU), k_qkn=2 (QK norm), S=6
	# S(2VH + L(4H2 + k_mlpHI + k_qknH)) * (SUM[seqlen]) + 6SUM[seqlen2]L*H
	"expected_flops_tuple": (194825353691136 / 1e12, 692711652851712 / 1e12),
	},
	"qwen3_vl": {
	"config": { # Qwen/Qwen3-VL-8B
	"model_type": "qwen3_vl",
	# -------- Text config --------
	"text_config": {
	"vocab_size": 151936,
	"hidden_size": 4096,
	"intermediate_size": 12288,
	"num_hidden_layers": 36,
	"num_attention_heads": 32,
	"num_key_value_heads": 8,
	"head_dim": 128,
	},
	# -------- Vision config (ViT) --------
	"vision_config": {
	"deepstack_visual_indexes": [8, 16, 24],
	"num_heads": 16,
	"depth": 27,
	"hidden_size": 1152,
	"intermediate_size": 4304,
	"out_hidden_size": 4096,
	"spatial_merge_size": 2,
	"temporal_patch_size": 2,
	"in_channels": 3,
	"patch_size": 16,
	},
	},
	"batch_seqlens_tuple": (
	[512, 1024, 2048],
	[4096, 4096, 4096],
	),
	"images_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# -----Text-----
	# 6(vocabhidden*2
	# + layer(hidden(q+k+v+o) + hiddeninter3)
	# )*token_sum
	# + 6sum(seqlen^2)layer*hidden
	#
	# -----ViT-----
	# patch_embed_N =hiddentemporal_patch_sizein_channels* patch_size^2
	# attn_linear_N =hidden(4hidden)
	# mlp_N =hiddeninter2
	# merger_N =((o+hiddenspatial_merge_size^2) (hidden*spatial_merge_size^2))
	# deepstack_merger_N =merger_N * 3
	# dense_N =patch_embed_N + (attn_linear_N + mlp_N) * 27 + deepstack_merger_N + merger_N
	#
	# 6(1519364096*2
	# + 36(4096(4096+1024+1024+4096) + 4096122883)
	# )*(512+1024+2048)
	# + 12(512512+10241024+20482048)364096
	# + 6 * dense_N * (512 + 1024 + 2048)
	# + 12 * (5122 + 10242 + 2048*2) 27 * 16 * 72
	#
	# 6(1519364096*2
	# + 36(4096(4096+1024+1024+4096) + 4096122883)
	# )*(4096+4096+4096)
	# + 12(40964096+40964096+40964096)364096
	# + 6 * dense_N * (4096 + 4096 + 2048)
	# + 12 * (40962 + 40962 + 4096*2) 27 * 16 * 72
	"expected_flops_tuple": (
	195379819708416 / 1e12,
	709446422495232 / 1e12,
	),
	},
	"qwen3_vl_moe": {
	"config": { # Qwen/Qwen3-VL-30B-A3B
	"model_type": "qwen3_vl_moe",
	# -------- Text config --------
	"text_config": {
	"vocab_size": 151936,
	"hidden_size": 2048,
	"num_hidden_layers": 48,
	"num_attention_heads": 32,
	"num_key_value_heads": 4,
	"head_dim": 128,
	"moe_intermediate_size": 768,
	"num_experts": 128,
	"num_experts_per_tok": 8,
	},
	# -------- Vision config (ViT) --------
	"vision_config": {
	"deepstack_visual_indexes": [8, 16, 24],
	"num_heads": 16,
	"depth": 27,
	"hidden_size": 1152,
	"intermediate_size": 4304,
	"out_hidden_size": 4096,
	"spatial_merge_size": 2,
	"temporal_patch_size": 2,
	"in_channels": 3,
	"patch_size": 16,
	},
	},
	"batch_seqlens_tuple": (
	[512, 1024, 2048],
	[4096, 4096, 4096],
	),
	"images_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
	# -----Text-----
	# 6(vocabhidden*2
	# + layer(hidden(q+k+v+headhead_dim)+hiddenintertop_k_exp3+hidden*num_experts)
	# )*token_sum
	# + 6sum(seqlen^2)layer*hidden
	#
	# -----ViT-----
	# patch_embed_N =hiddentemporal_patch_sizein_channels* patch_size^2
	# attn_linear_N =hidden(4hidden)
	# mlp_N =hiddeninter2
	# merger_N =((o+hiddenspatial_merge_size^2) (hidden*spatial_merge_size^2))
	# deepstack_merger_N =merger_N * 3
	# dense_N =patch_embed_N + (attn_linear_N + mlp_N) * 27 + deepstack_merger_N + merger_N
	#
	# 6(1519362048*2
	# + 48(2048(12832+12842+12832)+204876883+2048128)
	# )*(512+1024+2048)
	# + 12(512512+10241024+20482048)484096
	# + 6 * dense_N * (512 + 1024 + 2048)
	# + 12 * (5122 + 10242 + 2048*2) 27 * 16 * 72
	#
	# 6(1519362048*2
	# 48(2048(12832+12842+12832)+204876883+2048128)
	# )*(4096+4096+4096)
	# + 12(40964096+40964096+40964096)484096
	# + 6 * dense_N * (4096 + 4096 + 2048)
	# + 12 * (40962 + 40962 + 4096*2) 27 * 16 * 72
	"expected_flops_tuple": (
	92975451340800 / 1e12,
	367622860308480 / 1e12,
	),
	},
	}


	@pytest.mark.parametrize(
	"config_type",
	[
	"llama",
	"qwen2",
	"qwen3",
	"qwen3_moe",
	"deepseek_v3",
	"mistral",
	"gemma3_text",
	"apertus",
	"gpt_oss",
	"qwen3_vl",
	"qwen3_vl_moe",
	],
	)
	def test_flops_counter(config_type: str):
	test_config = CONFIG[config_type]
	config = Config(test_config["config"])
	flops_counter = FlopsCounter(config)
	if "images_seqlens_tuple" in test_config:
	for batch_seqlens, images_seqlens, expected_flops in zip(
	test_config["batch_seqlens_tuple"],
	test_config["images_seqlens_tuple"],
	test_config["expected_flops_tuple"],
	strict=True,
	):
	# set delta time to 1 to get the flops
	counted_flops, _ = flops_counter.estimate_flops(batch_seqlens, 1, images_seqlens=images_seqlens)
	print(f"Expect flops for {test_config['config']} is {expected_flops}, but get {counted_flops}")
	assert math.isclose(counted_flops, expected_flops), (
	f"Expect flops for {test_config['config']} is {expected_flops}, but get {counted_flops}"
	)
	else:
	for batch_seqlens, expected_flops in zip(
	test_config["batch_seqlens_tuple"], test_config["expected_flops_tuple"], strict=True
	):
	# set delta time to 1 to get the flops
	counted_flops, _ = flops_counter.estimate_flops(batch_seqlens, 1)
	print(f"Expect flops for {test_config['config']} is {expected_flops}, but get {counted_flops}")
	assert math.isclose(counted_flops, expected_flops), (
	f"Expect flops for {test_config['config']} is {expected_flops}, but get {counted_flops}"
	)