#!/usr/bin/env bash
# Propagate a failure of the training command through the tee pipeline,
# instead of reporting tee's (always-zero) exit status.
set -o pipefail

T=$(date +%m%d%H%M)  # timestamp used to name the log file

# -------------------------------------------------- #
# Usually you only need to customize these variables #
CFG=$1             # path to the config file         #
GPUS=$2            # total number of GPUs            #
# RESUME_FROM=$3                                     #
# -------------------------------------------------- #

GPUS_PER_NODE=$(( GPUS < 8 ? GPUS : 8 ))
NNODES=$(( GPUS / GPUS_PER_NODE ))
MASTER_PORT=${MASTER_PORT:-28567}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
RANK=${RANK:-0}
# export MASTER_ADDR=${MASTER_ADDR}
# export MASTER_PORT=${MASTER_PORT}

# Derive the work dir from the config path,
# e.g. configs/foo/bar.py -> work_dirs/foo/bar/
WORK_DIR=$(echo "${CFG%.*}" | sed -e "s/configs/work_dirs/g")/

# Intermediate files and logs will be saved to work_dirs/
if [ ! -d "${WORK_DIR}logs" ]; then
    mkdir -p "${WORK_DIR}logs"
fi

# PYTHONPATH must be exported, or the launched Python processes won't see it.
export PYTHONPATH="/cpfs04/user/litianyu/projects/paradrive/external:${PYTHONPATH}"
export PYTHONPATH="${PYTHONPATH}:/cpfs04/user/litianyu/projects/paradrive/external/toolbox"
export PYTHONPATH="$(dirname "$0")/..:${PYTHONPATH}"
export PYTHONPATH="${PYTHONPATH}:/cpfs01/shared/opendrivelab/sii/wangcaojun/repo-wcj/AlgEngine/navsim"

# RESUME_FROM= #"/cpfs04/user/liuhaochen/AlgEngine_nuplan/work_dirs/paradrive/navsim_openscenes_nuplan/base_e2e_hydramdp_bevformer_batch_split_scale/epoch_8.pth"

echo "WORK_DIR: ${WORK_DIR}"
echo "GPUS_PER_NODE: ${GPUS_PER_NODE}"
echo "NNODES: ${NNODES}"
echo "RANK: ${RANK}"
# echo "RESUME_FROM: ${RESUME_FROM}"

# NCCL debugging / tuning knobs; uncomment as needed.
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export NCCL_IB_DISABLE=1
# export NCCL_SOCKET_IFNAME=eth0
# export NCCL_NET_GDR_LEVEL=0
# export NCCL_TIMEOUT=300
# export NCCL_P2P_DISABLE=1

# Note: torch.distributed.launch is deprecated in recent PyTorch releases
# in favor of torchrun, but is kept here to match train.py's
# --launcher pytorch handling.
# --load-from ${RESUME_FROM} \
python -m torch.distributed.launch \
    --nproc_per_node=${GPUS_PER_NODE} \
    --master_addr=${MASTER_ADDR} \
    --master_port=${MASTER_PORT} \
    --nnodes=${NNODES} \
    --node_rank=${RANK} \
    "$(dirname "$0")/train.py" \
    "$CFG" \
    --launcher pytorch \
    --deterministic \
    --work-dir "${WORK_DIR}" \
    --cfg-options "${@:3}" \
    2>&1 | tee "${WORK_DIR}logs/train.${T}"
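
# Example invocation (a sketch; the script filename, config path, and
# cfg-option key below are hypothetical, not taken from this repo):
#
#   bash tools/dist_train.sh configs/exp/base.py 16 optimizer.lr=1e-4
#
# With GPUS=16 this resolves to NNODES=2 nodes x GPUS_PER_NODE=8 GPUs;
# on the second node, rerun the same command with RANK=1 and MASTER_ADDR
# set to the first node's IP. Note that --cfg-options is always passed
# to train.py, so supply at least one key=value override after the GPU
# count.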