| | """ distributed launcher adopted from torch.distributed.launch
|
| | usage example: https://github.com/facebookresearch/maskrcnn-benchmark
|
| | This enables using multiprocessing for each spawned process (as they are treated as main processes)
|
| | """

import sys
import subprocess
from argparse import ArgumentParser, REMAINDER

from utils import str2bool, int2str


def parse_args():
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn "
                                        "multiple distributed processes")

    parser.add_argument('--n_GPUs', type=int, default=1,
                        help='the number of GPUs for training')

    parser.add_argument("training_script", type=str,
                        help="The full path to the single GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")

    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


def main():
    args = parse_args()

    # Spawn one copy of the training script per GPU rank.
    processes = []
    for rank in range(args.n_GPUs):
        cmd = [sys.executable]

        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        # Forward the distributed bookkeeping flags to the training script.
        cmd += ['--distributed', 'True']
        cmd += ['--launched', 'True']
        cmd += ['--n_GPUs', str(args.n_GPUs)]
        cmd += ['--rank', str(rank)]

        process = subprocess.Popen(cmd)
        processes.append(process)

    # Wait for every worker and raise if any of them exited with an error.
    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode,
                                                cmd=process.args)


if __name__ == "__main__":
    main()