#!/bin/bash
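
# On each node, let local rank 0 clear stale shared-memory segments and log
# GPU status with rocm-smi; the remaining ranks sleep briefly so the cleanup
# finishes before they start.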
if [ "$SLURM_LOCALID" -eq 0 ]; then
    rm -rf /dev/shm/*
    rocm-smi || true
else
    sleep 2
fi
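
# Fabric and threading environment: pin NCCL/RCCL traffic to the four
# Slingshot interfaces, size OpenMP to the Slurm CPU allocation, and enlarge
# the libfabric CXI completion queue so large collectives do not overflow it.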
export NCCL_SOCKET_IFNAME=hsn0,hsn1,hsn2,hsn3
export OMP_NUM_THREADS="$SLURM_CPUS_PER_TASK"
export FI_CXI_DEFAULT_CQ_SIZE=131072
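
# Load the Cray programming environment and GPU toolchain for MI250X (gfx90a).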
module --quiet purge
module load CrayEnv
module load PrgEnv-cray/8.3.3
module load craype-accel-amd-gfx90a
module load cray-python
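
# Site-specific modules providing a matched ROCm / RCCL / aws-ofi-rccl stack;
# the OFI plugin is what lets RCCL run over the Slingshot interconnect.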
module use /pfs/lustrep2/projappl/project_462000125/samantao-public/mymodules
module load suse-repo-deps/sam-default
module load rocm/sam-5.2.3.lua
module load rccl/sam-develop.lua
module load aws-ofi-rccl/sam-default.lua
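
# Activate the Python virtual environment (expected to already exist,
# relative to the working directory).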
source venv/bin/activate
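
# Rendezvous endpoint: the first node in the allocation serves as master.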
MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=9999
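# NOTE: a fixed port can collide when two jobs share a node; one common
# alternative (an assumption, not in the original script) is to derive it
# from the job ID, e.g. MASTER_PORT=$((10000 + SLURM_JOB_ID % 50000)).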
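
# Per-task launch banner; torch.cuda.is_available() is also the correct API
# for ROCm builds of PyTorch, so it reports AMD GPU visibility here.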
| echo "Launching on $SLURMD_NODENAME ($SLURM_PROCID/$SLURM_JOB_NUM_NODES)," \ |
| "master $MASTER_NODE port $MASTER_PORT," \ |
| "GPUs $SLURM_GPUS_ON_NODE," \ |
| "CUDA: $(python -c 'import torch; print(torch.cuda.is_available())')" |
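
# Start one torch.distributed worker per GPU on every node; any arguments
# passed to this script are forwarded to torch.distributed.run (typically
# the training script and its flags).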
python -u -m torch.distributed.run \
    --nnodes "$SLURM_JOB_NUM_NODES" \
    --nproc_per_node "$SLURM_GPUS_ON_NODE" \
    --node_rank "$SLURM_PROCID" \
    --master_addr "$MASTER_NODE" \
    --master_port "$MASTER_PORT" \
    "$@"