#!/usr/bin/env bash

T=$(date +%m%d%H%M)

# -------------------------------------------------- #
# Usually you only need to customize these variables #
CFG=$1                                               #
GPUS=$2                                              #
# RESUME_FROM=$3                                     #
# -------------------------------------------------- #
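
# Optional usage guard (a minimal sketch; the usage string is an assumption
# about the calling convention: config path, GPU count, extra cfg-options):
if [ $# -lt 2 ]; then
    echo "Usage: $0 CONFIG GPUS [CFG_OPTIONS ...]" >&2
    exit 1
fi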
GPUS_PER_NODE=$(( GPUS < 8 ? GPUS : 8 ))
NNODES=$(( GPUS / GPUS_PER_NODE ))
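
# Integer division truncates: e.g. GPUS=12 gives NNODES=1 and silently drops
# 4 GPUs. An optional guard (sketch):
if [ $(( GPUS % GPUS_PER_NODE )) -ne 0 ]; then
    echo "GPUS (${GPUS}) must be a multiple of GPUS_PER_NODE (${GPUS_PER_NODE})" >&2
    exit 1
fi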

MASTER_PORT=${MASTER_PORT:-28567}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
RANK=${RANK:-0}
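
# Multi-node usage: run this script once per node, pointing MASTER_ADDR at
# the rank-0 host and setting RANK to the node index. A hypothetical
# two-node example (the address is a placeholder):
#   MASTER_ADDR=10.0.0.1 RANK=0 bash <this script> CONFIG 16   # on node 0
#   MASTER_ADDR=10.0.0.1 RANK=1 bash <this script> CONFIG 16   # on node 1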

# export MASTER_ADDR=${MASTER_ADDR}
# export MASTER_PORT=${MASTER_PORT}

WORK_DIR=$(echo "${CFG%.*}" | sed -e "s/configs/work_dirs/g")/
# Intermediate files and logs will be saved to work_dirs/
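# e.g. CFG=configs/exp/foo.py -> WORK_DIR=work_dirs/exp/foo/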

mkdir -p "${WORK_DIR}logs"  # mkdir -p is a no-op if the directory already exists
PYTHONPATH="/cpfs04/user/litianyu/projects/paradrive/external":$PYTHONPATH
PYTHONPATH=$PYTHONPATH:"/cpfs04/user/litianyu/projects/paradrive/external/toolbox"
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH
PYTHONPATH=$PYTHONPATH:"/cpfs01/shared/opendrivelab/sii/wangcaojun/repo-wcj/AlgEngine/navsim"

# RESUME_FROM= #"/cpfs04/user/liuhaochen/AlgEngine_nuplan/work_dirs/paradrive/navsim_openscenes_nuplan/base_e2e_hydramdp_bevformer_batch_split_scale/epoch_8.pth"

echo "WORK_DIR: ${WORK_DIR}"
echo "GPUS_PER_NODE: ${GPUS_PER_NODE}"
echo "NNODES: ${NNODES}"
echo "RANK: ${RANK}"
# echo "RESUME_FROM: ${RESUME_FROM}"

# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL

# export NCCL_IB_DISABLE=1
# export NCCL_SOCKET_IFNAME=eth0
# export NCCL_NET_GDR_LEVEL=0
# export NCCL_TIMEOUT=300
# export NCCL_P2P_DISABLE=1
#   --load-from ${RESUME_FROM} \

python -m torch.distributed.launch \
    --nproc_per_node=${GPUS_PER_NODE} \
    --master_addr=${MASTER_ADDR} \
    --master_port=${MASTER_PORT} \
    --nnodes=${NNODES} \
    --node_rank=${RANK} \
    "$(dirname "$0")/train.py" \
    "${CFG}" \
    --launcher pytorch \
    --deterministic \
    --work-dir "${WORK_DIR}" \
    --cfg-options "${@:3}" \
    2>&1 | tee "${WORK_DIR}logs/train.${T}"
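
# Note: torch.distributed.launch is deprecated in recent PyTorch releases in
# favor of torchrun. A rough equivalent is sketched below (untested; it
# assumes train.py reads LOCAL_RANK from the environment rather than from a
# --local_rank argument):
# torchrun \
#     --nproc_per_node=${GPUS_PER_NODE} \
#     --nnodes=${NNODES} \
#     --node_rank=${RANK} \
#     --master_addr=${MASTER_ADDR} \
#     --master_port=${MASTER_PORT} \
#     "$(dirname "$0")/train.py" "${CFG}" \
#     --launcher pytorch --deterministic --work-dir "${WORK_DIR}"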