|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Optional |
|
|
|
|
|
import nemo_run as run |
|
|
|
|
|
from nemo.collections import llm |
|
|
|
|
|
|
|
|
def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor:
    """Build a LocalExecutor that launches ``devices`` tasks per node via torchrun.

    Args:
        devices: Number of tasks (one per GPU) to launch on the local node.

    Returns:
        A configured ``run.LocalExecutor`` using the ``torchrun`` launcher.
    """
    # Environment knobs passed through to every launched task:
    # offline HF hub access plus NCCL settings — presumably for
    # stability/perf on NVLink systems (TODO confirm against NeMo docs).
    nccl_env = dict(
        TRANSFORMERS_OFFLINE="1",
        TORCH_NCCL_AVOID_RECORD_STREAMS="1",
        NCCL_NVLS_ENABLE="0",
    )
    return run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=nccl_env)
|
|
|
|
|
|
|
|
def slurm_executor(
    user: str,
    host: str,
    remote_job_dir: str,
    account: str,
    partition: str,
    nodes: int,
    devices: int,
    time: str = "01:00:00",
    custom_mounts: Optional[list[str]] = None,
    custom_env_vars: Optional[dict[str, str]] = None,
    container_image: str = "nvcr.io/nvidia/nemo:dev",
    retries: int = 0,
) -> run.SlurmExecutor:
    """Build a SlurmExecutor that submits jobs to a remote cluster over SSH.

    Args:
        user: SSH user name on the cluster login node.
        host: Cluster login-node hostname.
        remote_job_dir: Remote directory where job artifacts are staged.
        account: Slurm account to charge.
        partition: Slurm partition to submit to.
        nodes: Number of nodes to allocate.
        devices: Tasks (and GPUs) per node.
        time: Slurm walltime limit (``HH:MM:SS``).
        custom_mounts: Extra container mount specs to add.
        custom_env_vars: Extra env vars; override the defaults on key collision.
        container_image: Container image to run the job in.
        retries: Number of automatic retries on job failure.

    Returns:
        A fully configured ``run.SlurmExecutor``.

    Raises:
        RuntimeError: If any required argument is empty/zero.
    """
    # Truthiness check deliberately rejects empty strings and zero nodes/devices.
    if not (user and host and remote_job_dir and account and partition and nodes and devices):
        raise RuntimeError(
            "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function."
        )

    mounts = []
    if custom_mounts:
        mounts.extend(custom_mounts)

    # Default env vars; offline HF hub access plus NCCL knobs.
    env_vars = {
        "TRANSFORMERS_OFFLINE": "1",
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
        "NCCL_NVLS_ENABLE": "0",
    }
    if custom_env_vars:
        env_vars |= custom_env_vars  # caller-supplied values win

    executor = run.SlurmExecutor(
        account=account,
        partition=partition,
        tunnel=run.SSHTunnel(
            user=user,
            host=host,
            job_dir=remote_job_dir,
        ),
        nodes=nodes,
        ntasks_per_node=devices,
        gpus_per_node=devices,
        mem="0",
        exclusive=True,
        # Request as many GPUs as tasks per node. Previously hard-coded to
        # "gpu:8", which contradicted `devices` whenever devices != 8.
        gres=f"gpu:{devices}",
        packager=run.GitArchivePackager(subpath="examples/llm/run"),
    )

    executor.container_image = container_image
    executor.container_mounts = mounts
    executor.env_vars = env_vars
    executor.retries = retries
    executor.time = time

    return executor
|
|
|
|
|
|
|
|
def my_slurm_executor():
    """Template executor factory: fill in your cluster details before use.

    With the placeholder values left empty, ``slurm_executor`` raises
    RuntimeError, so this must be edited to point at a real cluster.
    """
    cluster = {
        "user": "",
        "host": "",
        "remote_job_dir": "",
        "account": "",
        "partition": "",
        "nodes": 1,
        "devices": 2,
    }
    return slurm_executor(**cluster)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Expose `llm.pretrain` through the nemo_run CLI; unless the user selects
    # another executor factory on the command line, jobs launch with the
    # local torchrun executor defined above.
    run.cli.main(llm.pretrain, default_executor=local_executor_torchrun)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|