| # Smoke test for torch.distributed with the NCCL backend: pdsh runs the quoted script on every -w target. | |
| # NOTE(review): four targets are listed, yet the script sets WORLD_SIZE=2 and assigns RANK=1 to every | |
| # host whose hostname is not "node0", so ranks repeat across targets -- confirm the intended host list. | |
| pdsh -w 51006,51007,51008,51009 ' | |
| # Global rank: 0 on the host whose hostname is exactly node0, 1 on every other host. | |
| if [[ $(hostname) == "node0" ]]; then | |
| export RANK=0 | |
| else | |
| export RANK=1 | |
| fi | |
| # NOTE(review): presumably one process per host, hence a fixed local rank -- confirm. | |
| export LOCAL_RANK=0 | |
| # Rendezvous endpoint and world size read by the env:// init method (the default when none is given). | |
| export MASTER_ADDR=10.0.0.1 | |
| export MASTER_PORT=29500 | |
| export WORLD_SIZE=2 | |
| # NCCL settings: INFO-level debug logging, and InfiniBand transport disabled. | |
| export NCCL_DEBUG=INFO | |
| export NCCL_IB_DISABLE=1 | |
| # Feed the check program to python on stdin; it prints OK, the rank and the world size once the group is up. | |
| python - << EOF | |
| import torch.distributed as dist | |
| dist.init_process_group("nccl") | |
| print("OK", dist.get_rank(), dist.get_world_size(), flush=True) | |
| EOF | |
| ' | |