|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Optional |
|
|
|
|
|
import nemo_run as run |
|
|
|
|
|
from nemo.collections import llm |
|
|
|
|
|
|
|
|
def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor:
    """Build a LocalExecutor that launches ``devices`` tasks per node via torchrun.

    Args:
        devices: Number of tasks (one per GPU) to launch on the local node.

    Returns:
        A configured ``run.LocalExecutor`` using the ``torchrun`` launcher.
    """
    # Environment knobs passed through to every launched task:
    # offline HF hub access plus NCCL settings — presumably for
    # stability/perf on NVLink systems (TODO confirm against NeMo docs).
    nccl_env = dict(
        TRANSFORMERS_OFFLINE="1",
        TORCH_NCCL_AVOID_RECORD_STREAMS="1",
        NCCL_NVLS_ENABLE="0",
    )
    return run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=nccl_env)
|
|
|
|
|
|
|
|
def slurm_executor(
    user: str,
    host: str,
    remote_job_dir: str,
    account: str,
    partition: str,
    nodes: int,
    devices: int,
    time: str = "01:00:00",
    custom_mounts: Optional[list[str]] = None,
    custom_env_vars: Optional[dict[str, str]] = None,
    container_image: str = "nvcr.io/nvidia/nemo:dev",
    retries: int = 0,
) -> run.SlurmExecutor:
    """Build a SlurmExecutor that submits jobs to a remote cluster over SSH.

    Args:
        user: SSH user name on the cluster login node.
        host: Cluster login-node hostname.
        remote_job_dir: Remote directory where job artifacts are staged.
        account: Slurm account to charge.
        partition: Slurm partition to submit to.
        nodes: Number of nodes to allocate.
        devices: Tasks (and GPUs) per node.
        time: Slurm walltime limit (``HH:MM:SS``).
        custom_mounts: Extra container mount specs to add.
        custom_env_vars: Extra env vars; override the defaults on key collision.
        container_image: Container image to run the job in.
        retries: Number of automatic retries on job failure.

    Returns:
        A fully configured ``run.SlurmExecutor``.

    Raises:
        RuntimeError: If any required argument is empty/zero.
    """
    # Truthiness check deliberately rejects empty strings and zero nodes/devices.
    if not (user and host and remote_job_dir and account and partition and nodes and devices):
        raise RuntimeError(
            "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function."
        )

    mounts = []
    if custom_mounts:
        mounts.extend(custom_mounts)

    # Default env vars; offline HF hub access plus NCCL knobs.
    env_vars = {
        "TRANSFORMERS_OFFLINE": "1",
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
        "NCCL_NVLS_ENABLE": "0",
    }
    if custom_env_vars:
        env_vars |= custom_env_vars  # caller-supplied values win

    executor = run.SlurmExecutor(
        account=account,
        partition=partition,
        tunnel=run.SSHTunnel(
            user=user,
            host=host,
            job_dir=remote_job_dir,
        ),
        nodes=nodes,
        ntasks_per_node=devices,
        gpus_per_node=devices,
        mem="0",
        exclusive=True,
        # Request as many GPUs as tasks per node. Previously hard-coded to
        # "gpu:8", which contradicted `devices` whenever devices != 8.
        gres=f"gpu:{devices}",
        packager=run.GitArchivePackager(subpath="examples/llm/run"),
    )

    executor.container_image = container_image
    executor.container_mounts = mounts
    executor.env_vars = env_vars
    executor.retries = retries
    executor.time = time

    return executor
|
|
|
|
|
|
|
|
def my_slurm_executor():
    """Template executor factory: fill in your cluster details before use.

    With the placeholder values left empty, ``slurm_executor`` raises
    RuntimeError, so this must be edited to point at a real cluster.
    """
    cluster = {
        "user": "",
        "host": "",
        "remote_job_dir": "",
        "account": "",
        "partition": "",
        "nodes": 1,
        "devices": 2,
    }
    return slurm_executor(**cluster)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Expose `llm.pretrain` through the nemo_run CLI; unless the user selects
    # another executor factory on the command line, jobs launch with the
    # local torchrun executor defined above.
    run.cli.main(llm.pretrain, default_executor=local_executor_torchrun)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|