# verl/models/mcore/weight_converter.py
# Copyright 2025 Bytedance Ltd. and/or its affiliates
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# online convert mcore weight to pure huggingface weight, no any fusion
# including format conversion and name mapping
# not including resharding
import torch
from megatron.core.transformer import TransformerConfig
from transformers import PretrainedConfig
class McoreToHFWeightConverterBase:
    """Base class for online Megatron-core -> HuggingFace weight conversion.

    Per the file header: performs format conversion and name mapping only —
    no weight fusion and no resharding.
    """

    def __init__(self, hf_config: PretrainedConfig, mcore_config: TransformerConfig):
        # HuggingFace model config (target naming scheme).
        self.hf_config = hf_config
        # Megatron-core transformer config (source side of the mapping).
        self.mcore_config = mcore_config

    def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
        """Convert one mcore parameter (name + its tensor group) to HF names/tensors.

        Subclasses return a tuple of (hf_names, tensors); this base raises.
        """
        raise NotImplementedError
class McoreToHFWeightConverterDense(McoreToHFWeightConverterBase):
    """Converter for dense (non-MoE) decoder-only models (Llama/Qwen-style HF naming).

    Each converter receives the mcore parameter name and the list of tensors
    already split for that name (e.g. fused qkv arrives as [q, k, v]) and
    returns the matching HF parameter names plus the tensors unchanged.
    """

    def _convert_attention_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
        """Map one self-attention parameter to its HF counterpart(s).

        Handled mcore names, e.g.:
          'decoder.layers.0.self_attention.linear_proj.weight'
          'decoder.layers.0.self_attention.linear_qkv.layer_norm_weight'
          'decoder.layers.0.self_attention.linear_qkv.weight'
          'decoder.layers.0.self_attention.linear_qkv.bias'
        """
        # mcore layer index sits at position 2: decoder.layers.<idx>....
        layer_number = name.split(".")[2]
        convert_names = []
        if "self_attention.linear_qkv.bias" in name or "self_attention.linear_qkv.weight" in name:
            param_type = name.split(".")[-1]
            assert param_type in ("bias", "weight")
            # fused qkv is delivered pre-split as [q, k, v]
            convert_names.append(f"model.layers.{layer_number}.self_attn.q_proj.{param_type}")
            convert_names.append(f"model.layers.{layer_number}.self_attn.k_proj.{param_type}")
            convert_names.append(f"model.layers.{layer_number}.self_attn.v_proj.{param_type}")
            assert len(params) == 3
        elif "self_attention.linear_proj.weight" in name:
            convert_names.append(f"model.layers.{layer_number}.self_attn.o_proj.weight")
            assert len(params) == 1
        elif "self_attention.linear_qkv.layer_norm_weight" in name:
            # mcore fuses the pre-attention norm into linear_qkv; HF keeps it separate
            convert_names.append(f"model.layers.{layer_number}.input_layernorm.weight")
            assert len(params) == 1
        elif "self_attention.q_layernorm.weight" in name:
            convert_names.append(f"model.layers.{layer_number}.self_attn.q_norm.weight")
            assert len(params) == 1
        elif "self_attention.k_layernorm.weight" in name:
            convert_names.append(f"model.layers.{layer_number}.self_attn.k_norm.weight")
            assert len(params) == 1
        else:
            raise NotImplementedError(f"Unsupported parameter name: {name}")
        return convert_names, params

    def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
        """Map one MLP parameter to its HF counterpart(s).

        Handled mcore names, e.g.:
          'decoder.layers.0.mlp.linear_fc1.layer_norm_weight'
          'decoder.layers.0.mlp.linear_fc1.weight'
          'decoder.layers.0.mlp.linear_fc2.weight'
        """
        layer_number = name.split(".")[2]
        convert_names = []
        if "mlp.linear_fc1.weight" in name:
            # fc1 fuses gate_proj and up_proj; it arrives pre-split as two tensors
            convert_names.append(f"model.layers.{layer_number}.mlp.gate_proj.weight")
            convert_names.append(f"model.layers.{layer_number}.mlp.up_proj.weight")
            assert len(params) == 2
        elif "mlp.linear_fc1.layer_norm_weight" in name:
            # norm fused into fc1 on the mcore side maps to HF post_attention_layernorm
            convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight")
            assert len(params) == 1
        elif "mlp.linear_fc2.weight" in name:
            convert_names.append(f"model.layers.{layer_number}.mlp.down_proj.weight")
            assert len(params) == 1
        else:
            raise NotImplementedError(f"Unsupported parameter name: {name}")
        return convert_names, params

    def convert_param(self, name: str, params_one_group: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
        """Dispatch one mcore parameter to the proper converter.

        Returns (hf_names, tensors). Raises NotImplementedError for names
        that no converter recognizes.
        """
        direct_name_mapping = {
            "embedding.word_embeddings.weight": "model.embed_tokens.weight",
            "decoder.final_layernorm.weight": "model.norm.weight",
            "output_layer.weight": "lm_head.weight",
        }
        if name in direct_name_mapping:
            # 1:1 rename with no tensor transformation; validate the group size
            # like every other branch does (was previously unchecked).
            assert len(params_one_group) == 1
            return [direct_name_mapping[name]], [params_one_group[0]]
        if "self_attention" in name:
            return self._convert_attention_param(name, params_one_group)
        elif "mlp" in name:
            return self._convert_mlp_param(name, params_one_group)
        else:
            raise NotImplementedError(f"Unsupported parameter name: {name}")
class McoreToHFWeightConverterQwen2Moe(McoreToHFWeightConverterDense):
    def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
        """Map one Qwen2-MoE MLP parameter name to its HF counterpart(s).

        Covers the pre-MLP norm, the router, the shared expert
        (gate / fc1 / fc2), and the routed experts' fc1/fc2 weights, e.g.:
          'decoder.layers.0.pre_mlp_layernorm.weight'
          'decoder.layers.0.mlp.router.weight'
          'decoder.layers.0.mlp.shared_experts.gate_weight'
          'decoder.layers.0.mlp.shared_experts.linear_fc1.weight'
          'decoder.layers.0.mlp.shared_experts.linear_fc2.weight'
          'decoder.layers.0.mlp.experts.linear_fc1.weight0', ...
          'decoder.layers.0.mlp.experts.linear_fc2.weight0', ...
        """
        # Layer index is the third dotted component: decoder.layers.<idx>...
        layer_idx = name.split(".")[2]
        hf_prefix = f"model.layers.{layer_idx}"
        if "pre_mlp_layernorm" in name:
            assert len(params) == 1
            target_names = [f"{hf_prefix}.post_attention_layernorm.weight"]
        elif "mlp.router.weight" in name:
            assert len(params) == 1
            target_names = [f"{hf_prefix}.mlp.gate.weight"]
        elif "shared_experts.gate_weight" in name:
            assert len(params) == 1
            target_names = [f"{hf_prefix}.mlp.shared_expert_gate.weight"]
        elif "shared_experts.linear_fc1.weight" in name:
            # fc1 fuses gate_proj/up_proj; delivered pre-split as two tensors
            assert len(params) == 2
            target_names = [
                f"{hf_prefix}.mlp.shared_expert.gate_proj.weight",
                f"{hf_prefix}.mlp.shared_expert.up_proj.weight",
            ]
        elif "shared_experts.linear_fc2.weight" in name:
            assert len(params) == 1
            target_names = [f"{hf_prefix}.mlp.shared_expert.down_proj.weight"]
        elif "mlp.experts.linear_fc1" in name:
            # per-expert fused gate/up; the expert id is the suffix after 'weight'
            assert len(params) == 2
            expert_idx = name.split("weight")[-1]
            target_names = [
                f"{hf_prefix}.mlp.experts.{expert_idx}.gate_proj.weight",
                f"{hf_prefix}.mlp.experts.{expert_idx}.up_proj.weight",
            ]
        elif "mlp.experts.linear_fc2" in name:
            assert len(params) == 1
            expert_idx = name.split("weight")[-1]
            target_names = [f"{hf_prefix}.mlp.experts.{expert_idx}.down_proj.weight"]
        else:
            raise NotImplementedError(f"Unsupported parameter name: {name}")
        return target_names, params
class McoreToHFWeightConverterMixtral(McoreToHFWeightConverterDense):
    def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
        """Map one Mixtral MoE MLP parameter name to its HF counterpart(s).

        Handled mcore names:
          decoder.layers.0.pre_mlp_layernorm.weight
          decoder.layers.0.mlp.router.weight
          decoder.layers.0.mlp.experts.linear_fc1.weight0 - weight7
          decoder.layers.0.mlp.experts.linear_fc2.weight0 - weight7
        """
        # mcore layer index sits at position 2: decoder.layers.<idx>....
        layer_number = name.split(".")[2]
        convert_names = []
        if "pre_mlp_layernorm" in name:
            convert_names.append(f"model.layers.{layer_number}.post_attention_layernorm.weight")
            assert len(params) == 1
        elif "mlp.router.weight" in name:
            convert_names.append(f"model.layers.{layer_number}.block_sparse_moe.gate.weight")
            assert len(params) == 1
        elif "mlp.experts.linear_fc1.weight" in name:
            # fused gate/up (HF w1/w3) arrives pre-split as two tensors;
            # the expert id is the suffix after 'weight'
            expert_id = name.split("weight")[-1]
            convert_names.append(f"model.layers.{layer_number}.block_sparse_moe.experts.{expert_id}.w1.weight")
            convert_names.append(f"model.layers.{layer_number}.block_sparse_moe.experts.{expert_id}.w3.weight")
            assert len(params) == 2
        elif "mlp.experts.linear_fc2.weight" in name:
            expert_id = name.split("weight")[-1]
            convert_names.append(f"model.layers.{layer_number}.block_sparse_moe.experts.{expert_id}.w2.weight")
            assert len(params) == 1
        else:
            raise NotImplementedError(f"Unsupported parameter name: {name}")
        # Length asserts above match the validation every sibling converter
        # performs; previously a mis-grouped param list was silently accepted.
        return convert_names, params
class McoreToHFWeightConverterQwen3Moe(McoreToHFWeightConverterDense):
    def _convert_mlp_param(self, name: str, params: list[torch.Tensor]) -> tuple[list[str], list[torch.Tensor]]:
        """Map one Qwen3-MoE MLP parameter name to its HF counterpart(s).

        Qwen3-MoE has no shared expert; only the pre-MLP norm, the router,
        and the routed experts' fc1/fc2 weights occur, e.g.:
          'decoder.layers.0.pre_mlp_layernorm.weight'
          'decoder.layers.0.mlp.router.weight'
          'decoder.layers.0.mlp.experts.linear_fc1.weight0', ...
          'decoder.layers.0.mlp.experts.linear_fc2.weight0', ...
        """
        # Layer index is the third dotted component: decoder.layers.<idx>...
        layer_idx = name.split(".")[2]
        hf_prefix = f"model.layers.{layer_idx}"
        if "pre_mlp_layernorm" in name:
            assert len(params) == 1
            target_names = [f"{hf_prefix}.post_attention_layernorm.weight"]
        elif "mlp.router.weight" in name:
            assert len(params) == 1
            target_names = [f"{hf_prefix}.mlp.gate.weight"]
        elif "mlp.experts.linear_fc1" in name:
            # per-expert fused gate/up; the expert id is the suffix after 'weight'
            assert len(params) == 2
            expert_idx = name.split("weight")[-1]
            target_names = [
                f"{hf_prefix}.mlp.experts.{expert_idx}.gate_proj.weight",
                f"{hf_prefix}.mlp.experts.{expert_idx}.up_proj.weight",
            ]
        elif "mlp.experts.linear_fc2" in name:
            assert len(params) == 1
            expert_idx = name.split("weight")[-1]
            target_names = [f"{hf_prefix}.mlp.experts.{expert_idx}.down_proj.weight"]
        else:
            raise NotImplementedError(f"Unsupported parameter name: {name}")
        return target_names, params