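"""Run the NeMo AutoConfigurator over a small pretraining recipe.

The script first generates candidate configurations (here the search grid only varies
the micro batch size), then either trains the configuration selected by --run_number
(1-based) for a small number of steps, or, when --get_results is passed, collects the
results from the logs written to --log_dir.

Example invocations (the script filename below is an assumption):
    python auto_config.py --model_type llama --run_number 1 --log_dir /path/to/logs
    python auto_config.py --model_type llama --log_dir /path/to/logs --get_results
"""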
import argparse
import os
from dataclasses import dataclass
from functools import partial

import fiddle as fdl
import nemo_run as run

from nemo.collections import llm
from nemo.collections.llm.gpt.model.llama import Llama3Config, LlamaModel
from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs, get_results


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", type=str, choices=["llama", "bert", "t5"], help="Model type to run")
    parser.add_argument("--run_number", type=int, help="Number of config to run")
    parser.add_argument("--log_dir", type=str, help="Path where to save training logs")
    parser.add_argument("--get_results", action="store_true", help="Collect results from finished runs instead of training")
    parser.add_argument("--extra_metrics", action="store_true", help="Add extra monitoring callbacks to the training run")

    return parser.parse_args()


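# A reduced Llama 3 configuration (~145M parameters, per the class name) used as the
# search target for the "llama" case below.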
@dataclass
class Llama3Config145M(Llama3Config):
    num_layers: int = 12
    hidden_size: int = 768
    num_attention_heads: int = 16
    num_query_groups: int = 8
    ffn_hidden_size: int = 2688


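# Recipe factory for the 145M model: reuse the trainer, data, logging, and optimizer
# settings from the llama3_8b recipe, but swap in the small model config above.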
@run.cli.factory(target=llm.pretrain, name="llama3_145m")
def llama3_145m(num_nodes=1, num_gpus_per_node=1):
    recipe = partial(llm.llama3_8b.pretrain_recipe, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node)()
    recipe = run.Partial(
        llm.pretrain,
        model=run.Config(LlamaModel, config=run.Config(Llama3Config145M)),
        trainer=recipe.trainer,
        data=recipe.data,
        log=recipe.log,
        optim=recipe.optim,
        resume=None,
    )

    return recipe


def train_config(args):
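    # Build a small base pretraining recipe for the requested model type.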
    calculate_model_size = False
    if args.model_type == "llama":
        recipe = partial(llama3_145m)()
        recipe.data.seq_length = recipe.model.config.seq_length = 2048
    elif args.model_type == "bert":
        recipe = partial(llm.bert_110m.pretrain_recipe, num_nodes=1, num_gpus_per_node=1)()
    elif args.model_type == "t5":
        recipe = partial(llm.t5_220m.pretrain_recipe, num_nodes=1, num_gpus_per_node=1)()
        calculate_model_size = True
    else:
        raise ValueError(f"Unsupported model type for this script: {args.model_type}")
    recipe.data.global_batch_size = 16

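    # Describe the search space and constraints for AutoConfigurator. With single-value
    # tensor/pipeline parallel lists and three micro batch sizes, this grid should yield
    # three candidate configurations, one per micro batch size.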
    runner = AutoConfigurator(
        recipe=recipe,
        gpu_memory_gb=40,
        tensor_parallel_sizes=[1],
        pipeline_parallel_sizes=[1],
        micro_batch_sizes=[1, 2, 4],
        max_training_days=1,
        max_steps_per_run=10,
        num_tokens_in_b=10,
        vocab_size=32000,
        path_to_logs=args.log_dir,
        calculate_model_size=calculate_model_size,
    )

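    # generate_configs returns the base config plus a name -> config mapping of candidates.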
    base_cfg, configs = generate_configs(runner)
    if not args.get_results:
        partials = list(configs.values())
        names = list(configs.keys())

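        # --run_number is 1-based: run N trains the N-th generated config.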
        pretrain_cfg = partials[args.run_number - 1]
        if args.extra_metrics:
            from nemo.lightning.pytorch.callbacks import (
                MemoryMonitor,
                OptimizerMonitor,
                RuntimeEstimator,
                SpeedMonitor,
            )

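            # Attach extra monitoring callbacks (speed, estimated runtime, optimizer
            # state, and memory usage) to the selected training config.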
            pretrain_cfg.trainer.callbacks.append(run.Config(SpeedMonitor, window_size=5))
            pretrain_cfg.trainer.callbacks.append(run.Config(RuntimeEstimator))
            pretrain_cfg.trainer.callbacks.append(run.Config(OptimizerMonitor))
            pretrain_cfg.trainer.callbacks.append(run.Config(MemoryMonitor))

        pretrain = fdl.build(pretrain_cfg)
        pretrain()
    else:
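        # Results pass: collect the outcomes of the completed runs from the logs under --log_dir.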
        get_results(base_cfg, runner, args.log_dir, log_file_prefix="nemo_error")
        print(f"The results were successfully saved to {args.log_dir}.")


def main():
    args = get_args()
    train_config(args)


if __name__ == '__main__':
    main()
|
|