| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| from argparse import ArgumentParser |
|
|
| import torch |
| from pytorch_lightning import Trainer |
|
|
| from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector |
| from nemo.utils import logging, model_utils |
| from nemo.utils.app_state import AppState |
|
|
|
|
| """ |
| Usage: |
| python megatron_change_num_partitions.py \ |
| --model_file=PATH_TO_SRC_FILE \ |
| --target_file=PATH_TO_TGT_FILE \ |
| --tensor_model_parallel_size=2 \ |
| --target_tensor_model_parallel_size=1 |
| """ |
|
|
|
|
def merge_partition(model, partitions, write_path=None):
    """Merge tensor-parallel partitions into a single (TP=1) model in place.

    Args:
        model: Target model whose parameters are overwritten with the merged
            tensors. Assumes ``model.named_parameters()`` enumerates parameters
            in the same order as each partition's tensor list.
        partitions: List of per-rank parameter lists (one list of tensors per
            tensor-parallel rank).
        write_path: Optional path; when given, the merged model is saved via
            ``model.save_to(write_path)``.

    Raises:
        RuntimeError: if a merged tensor cannot be narrowed to the target
            parameter's shape.
    """
    for idx, (name, param) in enumerate(model.named_parameters()):
        reference = partitions[0][idx]
        if reference.shape == param.shape:
            # Parameter is replicated across ranks (e.g. biases/layernorms):
            # any single copy suffices, take rank 0's.
            merged = reference.data
        else:
            # Column- vs row-parallel split: if dim 0 already matches, shards
            # were split along the last dim; otherwise along dim 0.
            axis = -1 if param.shape[0] == reference.shape[0] else 0
            merged = torch.cat([part[idx].data for part in partitions], dim=axis)
        if merged.shape != param.shape:
            logging.info(
                f"Warning: Shape mismatch for parameter {name} required shape: {param.shape}, merged shape: {merged.shape}. Narrowing to match required size."
            )
            # Trim the oversized axis (e.g. padded vocab embeddings) so the
            # merged tensor fits the target parameter exactly.
            if merged.shape[1:] == param.shape[1:]:
                merged = torch.narrow(merged, 0, 0, param.shape[0])
            elif merged.shape[:-1] == param.shape[:-1]:
                merged = torch.narrow(merged, -1, 0, param.shape[-1])
            else:
                raise RuntimeError(
                    f"Can not handle parameter {name}, required shape: {param.shape}, merged shape: {merged.shape}."
                )
        param.data = merged

    if write_path is not None:
        model.save_to(write_path)
|
|
|
|
def split_partition(model, partitions, tp_size, write_path=None, megatron_legacy=False):
    """Split a TP=1 parameter set into ``tp_size`` tensor-parallel partitions.

    Iterates over tensor-parallel ranks from last to first, loading each rank's
    shard into ``model``'s parameters in turn, and (optionally) saving the model
    once per rank via ``model.save_to`` — the save path's rank directory is
    derived from the AppState's ``tensor_model_parallel_rank``.

    Args:
        model: Target model whose parameters are overwritten with each rank's
            shard. Assumes ``model.named_parameters()`` enumerates parameters in
            the same order as the source tensor list — TODO confirm for custom
            model classes.
        partitions: List containing exactly one list of source parameter
            tensors (the flattened parameters of a TP=1 model).
        tp_size: Number of tensor-parallel ranks to split into; must be >= 1.
        write_path: Optional path passed to ``model.save_to`` for each rank.
        megatron_legacy: If True, handle the legacy megatron interleaved
            layout of fused Q/K/V (and K/V) attention weights when splitting.

    Raises:
        ValueError: if more than one source partition is given, if
            ``tp_size < 1``, or if a fused QKV/KV weight is not divisible by
            the required split factor.
        RuntimeError: if a shard cannot be padded to the target parameter shape.
    """
    if len(partitions) != 1:
        raise ValueError(
            "Can only split partitions of model with TP=1. For partitions of models with TP>1, merge first."
        )

    if tp_size < 1:
        # Fixed typo in the error message ("must to be" -> "must be").
        raise ValueError("TP size must be >= 1.")

    # Configure the global AppState for the target parallel layout so that
    # model saving resolves the correct per-rank checkpoint locations.
    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.pipeline_model_parallel_size = 1
    app_state.tensor_model_parallel_size = tp_size
    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size

    app_state.tensor_model_parallel_rank = tp_size - 1

    # First pass: compute, for every parameter, the list of per-rank shards.
    idx = 0
    splits = []
    for param_name, param in model.named_parameters():
        if param.shape == partitions[0][idx].shape:
            # Replicated parameter: every rank gets the same tensor.
            split = [partitions[0][idx].data] * tp_size
        elif param.shape[0] == partitions[0][idx].shape[0]:
            # Row-parallel weight: shard along the last dimension.
            split = torch.split(partitions[0][idx].data, param.shape[-1], dim=-1)
        else:
            # Column-parallel weight: shard along dim 0, with special handling
            # for legacy fused attention weights whose Q/K/V (or K/V) blocks
            # are interleaved and must be regrouped per rank.
            if 'query_key_value.weight' in param_name and megatron_legacy:
                split_dim = partitions[0][idx].data.shape[0]
                if split_dim % (tp_size * 3) != 0:
                    raise ValueError(
                        f"Can not split Q,K,V parameter {param_name} with shape {param.shape} into tensor parallel size {tp_size}. Not divisible by {tp_size * 3}."
                    )
                # Chunk into tp_size*3 pieces, then gather every tp_size-th
                # chunk starting at rank i to rebuild that rank's Q,K,V block.
                tp_qkv_splits = torch.chunk(partitions[0][idx].data, tp_size * 3, dim=0)
                split = []
                for i in range(tp_size):
                    tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 3, tp_size)])
                    split.append(tp_qkv)
            elif 'key_value.weight' in param_name and megatron_legacy:
                split_dim = partitions[0][idx].data.shape[0]
                if split_dim % (tp_size * 2) != 0:
                    raise ValueError(
                        f"Can not split K,V parameter {param_name} with shape {param.shape} into tensor parallel size {tp_size}. Not divisible by {tp_size * 2}."
                    )
                tp_qkv_splits = torch.chunk(partitions[0][idx].data, tp_size * 2, dim=0)
                split = []
                for i in range(tp_size):
                    tp_qkv = torch.cat([tp_qkv_splits[item] for item in range(i, tp_size * 2, tp_size)])
                    split.append(tp_qkv)
            else:
                split = torch.split(partitions[0][idx].data, param.shape[0], dim=0)
        splits.append(split)
        idx += 1

    # Second pass: for each rank (last to first), load that rank's shards into
    # the model and save it under the rank's checkpoint directory.
    for i in range(tp_size - 1, -1, -1):
        app_state.tensor_model_parallel_rank = i

        idx = 0
        for name, param in model.named_parameters():
            split_val = splits[idx][i].clone()

            if param.shape != split_val.shape:
                logging.info(
                    f"Warning: Shape mismatch for parameter {name} required shape: {param.shape}, split shape: {split_val.shape}. Padding to match required size."
                )

                # Zero-pad the undersized axis so the shard matches the target
                # parameter (e.g. vocab padding). torch.nn.functional.pad
                # orders pad pairs from the last dim backwards, so pad[-1] is
                # the trailing pad of dim 0.
                if split_val.shape[1:] == param.shape[1:]:
                    pad = [0, 0] * len(split_val.shape)
                    pad[-1] = param.shape[0] - split_val.shape[0]
                    split_val = torch.nn.functional.pad(split_val, pad, 'constant')
                elif split_val.shape[:-1] == param.shape[:-1]:
                    pad = [0, param.shape[-1] - split_val.shape[-1]]
                    split_val = torch.nn.functional.pad(split_val, pad, 'constant')
                else:
                    raise RuntimeError(
                        f"Can not handle parameter {name}, required shape: {param.shape}, split shape: {split_val.shape}."
                    )

            param.data = split_val
            idx += 1

        if write_path is not None:
            model.save_to(write_path)
|
|
|
|
def main():
    """Entry point: change the tensor-parallel (TP) partitioning of a .nemo file.

    Two directions are supported, driven by the source and target TP sizes:
      * TP>1 -> TP=1: restore each rank's partition, merge into one model, and
        save (merge only, no save, when the target TP is still > 1 — the
        docstring's two-step workflow).
      * TP=1 -> TP>1: restore the single model and split it into target-TP
        partitions, saving each rank.
    """
    parser = ArgumentParser()
    parser.add_argument("--model_file", type=str, required=True, help="Path to source .nemo file")
    parser.add_argument("--target_file", type=str, required=True, help="Path to write target .nemo file")
    parser.add_argument("--tensor_model_parallel_size", type=int, required=True, help="TP size of source model")
    parser.add_argument("--target_tensor_model_parallel_size", type=int, required=True, help="TP size of target model")
    parser.add_argument(
        "--model_class",
        type=str,
        default="nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel",
        help="NeMo model class. This script should support all NeMo megatron models that use Tensor Parallel",
    )
    parser.add_argument("--precision", default=16, help="PyTorch Lightning Trainer precision flag")
    parser.add_argument(
        "--megatron_legacy",
        action="store_true",
        # Fixed typo: "modles" -> "models".
        help="Converter for legacy megatron models that have different q,k,v weight splits",
    )
    parser.add_argument(
        "--tokenizer_model_path",
        type=str,
        required=False,
        default=None,
        help="Path to the tokenizer model path if your model uses a tokenizer model as an artifact. This is needed if your model uses a sentencepiece tokenizer.",
    )

    args = parser.parse_args()

    # argparse leaves --precision as a string; convert plain "32"/"16" to int
    # for the Trainer while passing other values (e.g. "bf16") through as-is.
    precision = args.precision
    if args.precision in ["32", "16"]:
        precision = int(float(args.precision))
    tp_size = args.tensor_model_parallel_size
    tgt_tp_size = args.target_tensor_model_parallel_size
    cls = model_utils.import_class_by_path(args.model_class)

    # CPU-only trainer: repartitioning is pure tensor surgery, no GPU needed.
    trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision)
    app_state = AppState()
    app_state.data_parallel_rank = 0
    app_state.pipeline_model_parallel_size = 1
    app_state.tensor_model_parallel_size = tp_size
    app_state.model_parallel_size = app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size

    if tp_size > 1:
        # Source model is partitioned: restore each TP rank's checkpoint and
        # collect its flattened parameter list.
        partitions = []
        for i in range(tp_size):
            app_state.tensor_model_parallel_rank = i
            model = cls.restore_from(restore_path=args.model_file, trainer=trainer, map_location=torch.device("cpu"))
            params = [p for _, p in model.named_parameters()]
            partitions.append(params)
            # Reset the AppState after each restore; restore_from may have
            # modified it — NOTE(review): presumably, verify against connector.
            app_state.data_parallel_rank = 0
            app_state.pipeline_model_parallel_size = 1
            app_state.tensor_model_parallel_size = tp_size
            app_state.model_parallel_size = (
                app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size
            )

        # Rebuild a fresh TP=1 model (from the last restored config) to hold
        # the merged parameters.
        model.cfg.tensor_model_parallel_size = 1
        app_state.model_parallel_size = 1
        trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision)
        if args.tokenizer_model_path is not None:
            model.cfg.tokenizer.model = args.tokenizer_model_path
        model = cls(model.cfg, trainer).to('cpu')
        model._save_restore_connector = NLPSaveRestoreConnector()

        if tgt_tp_size > 1:
            # Merge without saving: re-splitting to TP>1 is a separate run.
            merge_partition(model, partitions)
        else:
            merge_partition(model, partitions, args.target_file)
    else:
        # Source model is unpartitioned (TP=1): restore it once.
        app_state.model_parallel_size = 1
        model = cls.restore_from(restore_path=args.model_file, trainer=trainer, map_location=torch.device("cpu"))

        if tgt_tp_size > 1:
            partitions = []
            params = [p for _, p in model.named_parameters()]
            partitions.append(params)

            # Rebuild the model with the target TP size, then split into it.
            model.cfg.tensor_model_parallel_size = tgt_tp_size
            app_state.model_parallel_size = tgt_tp_size
            trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision)
            if args.tokenizer_model_path is not None:
                model.cfg.tokenizer.model = args.tokenizer_model_path
            model = cls(model.cfg, trainer).to('cpu')
            model._save_restore_connector = NLPSaveRestoreConnector()
            split_partition(model, partitions, tgt_tp_size, args.target_file, args.megatron_legacy)

    logging.info("Successfully finished changing partitions!")
|
|
|
|
# Standard script entry point: run the conversion only when executed directly.
if __name__ == '__main__':
    main()
|
|