| """ |
| This script generates a NeMo-Megatron compatible `.nemo` file for a Huggingface T5-v1_1 model. |
| |
| List of Huggingface models that this script can covert: |
| |
| 1. google/t5-v1_1-small |
| 2. google/t5-v1_1-base |
| 3. google/t5-v1_1-large |
| 4. google/t5-v1_1-xl |
| 5. google/t5-v1_1-xxl |
| 6. google/mt5-small |
| 7. google/mt5-base |
| 8. google/mt5-large |
| 9. google/mt5-xl |
| 10. google/mt5-xxl |
| 11. google/ul2 |
| 13. bigscience/T0pp |
| 14. google/t5-small-lm-adapt |
| 15. google/t5-base-lm-adapt |
| 16. google/t5-large-lm-adapt |
| 17. google/t5-xl-lm-adapt |
| 18. google/t5-xxl-lm-adapt |
| 19. google/flan-t5-small |
| 20. google/flan-t5-base |
| 21. google/flan-t5-large |
| 22. google/flan-t5-xl |
| 23. google/flan-t5-xxl |
| |
| Use instructions: |
| |
| python hf_t5-v1_1_to_nemo.py \ |
| --hf_model_name bigscience/T0pp \ |
| --nemo_state_dict /path/to/nemo_state_dict.pt \ |
| --nemo_file_path /path/to/nemo_file.nemo |
| """ |
import collections
import os
import tempfile
from argparse import ArgumentParser

import torch
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from transformers import AutoTokenizer, T5ForConditionalGeneration

from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector

# `accelerate` is required by `from_pretrained(..., low_cpu_mem_usage=True)` below.
try:
    import accelerate
except ImportError:
    raise ImportError("Please install the accelerate package via `pip install accelerate` to use this script.")


def convert_weights(hf_model, nemo_state_dict_path):
    # google/ul2 only released half-precision (bfloat16) weights, so load it in bfloat16;
    # everything else is loaded in fp32.
    if hf_model == 'google/ul2':
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float32
    hf_model = T5ForConditionalGeneration.from_pretrained(hf_model, low_cpu_mem_usage=True, torch_dtype=torch_dtype)
    hf_model_config = hf_model.config
    # Materialize the Huggingface state dict on disk and reload it as a plain dict of tensors.
    with tempfile.TemporaryDirectory() as tmp:
        torch.save(hf_model.state_dict(), os.path.join(tmp, 'model.pt'))
        hf_weights = torch.load(os.path.join(tmp, 'model.pt'))

    nemo_weights = collections.OrderedDict()

    print(f'Found {len(hf_weights.keys())} keys in the checkpoint')

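    # Huggingface T5 parameter names look like:
    #   (encoder|decoder).block.<block_idx>.layer.<layer_idx>.<module>.<param>
    # where <layer_idx> indexes the sub-layer inside a block (self-attention, optional
    # cross-attention, feed-forward). The helper below extracts the model side, the block
    # index, and the sub-layer index from such a key.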
    def _get_model_type_block_layer(k):
        if k.startswith('encoder'):
            model_type = 'encoder'
        elif k.startswith('decoder'):
            model_type = 'decoder'
        else:
            raise ValueError(f"Unknown model type for {k}")

        return model_type, int(k.split('.')[2]), int(k.split('.')[4])

    for k, v in hf_weights.items():
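        # Embeddings and LM head.
        # `shared.weight` is the tied embedding table in Huggingface T5; it is skipped here
        # because the same tensor shows up again as `encoder.embed_tokens.weight` and
        # `decoder.embed_tokens.weight`, which are mapped to the NeMo encoder/decoder
        # embeddings below. `lm_head.weight` becomes the NeMo token (output) head.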
        if k == 'shared.weight':
            pass

        elif k == 'lm_head.weight':
            nemo_weights['enc_dec_model.tokens_head.weight'] = v
            print(f'Mapped {k} to enc_dec_model.tokens_head.weight')

        elif k == 'decoder.embed_tokens.weight':
            nemo_weights['enc_dec_model.decoder_embedding.word_embeddings.weight'] = v
            print(f'Mapped {k} to enc_dec_model.decoder_embedding.word_embeddings.weight')

        elif k == 'encoder.embed_tokens.weight':
            nemo_weights['enc_dec_model.encoder_embedding.word_embeddings.weight'] = v
            print(f'Mapped {k} to enc_dec_model.encoder_embedding.word_embeddings.weight')

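        # Relative position bias.
        # T5 computes the relative attention bias only in the self-attention of the first block
        # (block 0) of the encoder and of the decoder; those two tables map onto NeMo's
        # encoder/decoder relative position embeddings.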
        elif k == 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight':
            nemo_weights['enc_dec_model.encoder_relative_position_embedding.relative_position_embedding.weight'] = v
            print(
                f'Mapped {k} to enc_dec_model.encoder_relative_position_embedding.relative_position_embedding.weight'
            )

        elif k == 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight':
            nemo_weights['enc_dec_model.decoder_relative_position_embedding.relative_position_embedding.weight'] = v
            print(
                f'Mapped {k} to enc_dec_model.decoder_relative_position_embedding.relative_position_embedding.weight'
            )

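        # Self-attention.
        # NeMo/Megatron stores the self-attention query, key, and value projections as a single
        # fused `query_key_value` matrix, so the three separate Huggingface matrices are
        # concatenated along the output dimension when the `q` weight is seen; the corresponding
        # `k` and `v` keys are then skipped when they come up on their own.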
        elif 'SelfAttention.q.weight' in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            k_weight = hf_weights[k.replace('q.weight', 'k.weight')]
            v_weight = hf_weights[k.replace('q.weight', 'v.weight')]
            concat_weights = torch.cat([v, k_weight, v_weight], dim=0)
            nemo_weights[
                f'enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.query_key_value.weight'
            ] = concat_weights
            print(
                f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.query_key_value.weight'
            )

        elif 'SelfAttention.k.weight' in k or 'SelfAttention.v.weight' in k:
            pass

        elif 'SelfAttention.o.weight' in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f'enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.dense.weight'
            ] = v
            print(
                f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.dense.weight'
            )

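        # Cross-attention (encoder-decoder attention, decoder only).
        # NeMo keeps the cross-attention query as its own `query` matrix and fuses key and value
        # into a single `key_value` matrix, so the Huggingface `k` and `v` weights are
        # concatenated along the output dimension when the `k` weight is seen, and the separate
        # `v` key is skipped.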
        elif 'EncDecAttention.k.weight' in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            v_weight = hf_weights[k.replace('k.weight', 'v.weight')]
            concat_weights = torch.cat([v, v_weight], dim=0)
            nemo_weights[
                f'enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.key_value.weight'
            ] = concat_weights
            print(
                f'Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.key_value.weight'
            )

        elif 'EncDecAttention.v.weight' in k:
            pass

        elif 'EncDecAttention.q.weight' in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f'enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.query.weight'
            ] = v
            print(
                f'Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.query.weight'
            )

        elif 'EncDecAttention.o.weight' in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f'enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.dense.weight'
            ] = v
            print(
                f'Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.dense.weight'
            )

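        # Feed-forward (gated) MLP.
        # The convertible models use a gated feed-forward, so the Huggingface block has two input
        # projections, `wi_0` and `wi_1` (one is passed through the activation and multiplied with
        # the other). They map to NeMo's `dense_h_to_4h` and `dense_h_to_4h_2` respectively, and
        # the output projection `wo` maps to `dense_4h_to_h`.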
        elif 'DenseReluDense.wi_0.weight' in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f'enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h.weight'
            ] = v
            print(
                f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h.weight'
            )

        elif 'DenseReluDense.wi_1.weight' in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f'enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h_2.weight'
            ] = v
            print(
                f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h_2.weight'
            )

        elif 'DenseReluDense.wo.weight' in k:
            model_type, block_number, layer_number = _get_model_type_block_layer(k)
            nemo_weights[
                f'enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_4h_to_h.weight'
            ] = v
            print(
                f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_4h_to_h.weight'
            )

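        # Layer norms.
        # The sub-layer index in the Huggingface key identifies which layernorm this is:
        # index 0 is the norm before self-attention (NeMo `input_layernorm`), index 1 is the norm
        # after self-attention (NeMo `post_attention_layernorm`), and, for the decoder only,
        # index 2 is the norm after cross-attention (NeMo `post_inter_attention_layernorm`).
        # The `final_layer_norm` of each stack maps to NeMo's `final_layernorm`.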
        elif 'layer_norm' in k:
            if 'final' in k:
                model_type = 'encoder' if k.startswith('encoder') else 'decoder'
                nemo_weights[f'enc_dec_model.enc_dec_model.{model_type}.model.final_layernorm.weight'] = v
                print(f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.final_layernorm.weight')
            else:
                model_type, block_number, layer_number = _get_model_type_block_layer(k)
                if layer_number == 0:
                    nemo_weights[
                        f'enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.input_layernorm.weight'
                    ] = v
                    print(
                        f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.input_layernorm.weight'
                    )
                elif layer_number == 1:
                    nemo_weights[
                        f'enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_attention_layernorm.weight'
                    ] = v
                    print(
                        f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_attention_layernorm.weight'
                    )
                elif layer_number == 2 and model_type == 'decoder':
                    nemo_weights[
                        f'enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_inter_attention_layernorm.weight'
                    ] = v
                    print(
                        f'Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_inter_attention_layernorm.weight'
                    )
                else:
                    raise ValueError(f"Unknown layer_norm key: {k}")
        else:
            raise ValueError(f"Unknown key: {k}")

    torch.save(nemo_weights, nemo_state_dict_path)
    print(f"Saved weights to {nemo_state_dict_path}")
    return hf_model_config


def package_into_nemo_file(
    state_dict_path, base_yaml_config, hf_model_config, nemo_file_path, hf_model_name, megatron_amp_O2
):
    """
    Packages the state dict, config file and tokenizer into a `.nemo` file.
    """
    trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=32)
    base_cfg = OmegaConf.load(base_yaml_config)
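    # Translate the Huggingface activation settings into the NeMo activation name:
    # "silu" is treated as the gated "swiglu", and "gelu_new" (or "gelu" together with
    # `is_gated_act`) as the gated "geglu". Anything else is rejected.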
    if hf_model_config.dense_act_fn == "silu":
        act_fn = "swiglu"
    elif hf_model_config.dense_act_fn == "gelu_new":
        act_fn = "geglu"
    # Some checkpoints report "gelu" together with an explicit gated-act flag instead of "gelu_new".
    elif hf_model_config.dense_act_fn == "gelu" and hf_model_config.is_gated_act:
        act_fn = "geglu"
    else:
        raise ValueError(f"Unknown dense_act_fn: {hf_model_config.dense_act_fn}")

    with open_dict(base_cfg):
        base_cfg.encoder.num_layers = hf_model_config.num_layers
        base_cfg.encoder.hidden_size = hf_model_config.d_model
        base_cfg.encoder.ffn_hidden_size = hf_model_config.d_ff
        base_cfg.encoder.kv_channels = hf_model_config.d_kv
        base_cfg.encoder.num_attention_heads = hf_model_config.num_heads
        base_cfg.encoder.activation = act_fn
        base_cfg.encoder.relative_attention_num_buckets = hf_model_config.relative_attention_num_buckets

        base_cfg.decoder.num_layers = hf_model_config.num_decoder_layers
        base_cfg.decoder.hidden_size = hf_model_config.d_model
        base_cfg.decoder.ffn_hidden_size = hf_model_config.d_ff
        base_cfg.decoder.kv_channels = hf_model_config.d_kv
        base_cfg.decoder.num_attention_heads = hf_model_config.num_heads
        base_cfg.decoder.activation = act_fn
        base_cfg.decoder.relative_attention_num_buckets = hf_model_config.relative_attention_num_buckets

        base_cfg.megatron_amp_O2 = megatron_amp_O2

    with tempfile.TemporaryDirectory() as tmp:
        tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
        tokenizer_path = tokenizer.save_vocabulary(tmp)[0]
        base_cfg.tokenizer.model = tokenizer_path
        model = MegatronT5Model(base_cfg, trainer).to('cpu')
        model._save_restore_connector = NLPSaveRestoreConnector()
        state_dict = torch.load(state_dict_path)
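        # With megatron_amp_O2, NeMo wraps the model in a half-precision module wrapper, so every
        # parameter name gains an extra `module.` component; rewrite the converted keys to match
        # before loading. (Only the first occurrence of 'model.' in each key is rewritten.)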
        if megatron_amp_O2:
            new_state_dict = {}
            for key in state_dict.keys():
                new_key = key.replace('model.', 'model.module.', 1)
                new_state_dict[new_key] = state_dict[key]
            state_dict = new_state_dict
        model.load_state_dict(state_dict)
        model.save_to(nemo_file_path)


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument(
        "--hf_model_name",
        type=str,
        required=True,
        help="Valid Huggingface T5-v1_1 model name, e.g. google/t5-v1_1-large or google/ul2, i.e. anything that can be loaded with T5ForConditionalGeneration.from_pretrained().",
    )
    parser.add_argument(
        "--nemo_state_dict_path",
        type=str,
        required=True,
        help="Path to write the intermediate nemo state dict file, e.g. /path/to/nemo_state_dict.pt",
    )
    parser.add_argument(
        "--nemo_file_path",
        type=str,
        required=True,
        help="Path to write the converted .nemo file, e.g. /path/to/t5_base_converted_to_nemo.nemo",
    )
    parser.add_argument(
        "--base_yaml_config",
        type=str,
        default="hf_t5v1_1_base_config.yaml",
        help="Path to a base yaml config that we edit based on the provided model.",
    )
    parser.add_argument(
        "--megatron_amp_O2",
        action="store_true",
        help="Whether to store O2 weights. This may be useful for models like ul2 where only pre-trained half precision weights were released.",
    )
    args = parser.parse_args()
    if not os.path.exists(args.base_yaml_config):
        raise FileNotFoundError(f"Base yaml config file {args.base_yaml_config} does not exist.")
    hf_model_config = convert_weights(args.hf_model_name, args.nemo_state_dict_path)
    package_into_nemo_file(
        state_dict_path=args.nemo_state_dict_path,
        base_yaml_config=args.base_yaml_config,
        hf_model_config=hf_model_config,
        nemo_file_path=args.nemo_file_path,
        hf_model_name=args.hf_model_name,
        megatron_amp_O2=args.megatron_amp_O2,
    )