#!/usr/bin/env bash
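# Usage: bash <this script> <config> <num_gpus> [resume_checkpoint] [extra train.py args]
# Example invocation (paths are illustrative):
#   bash tools/dist_train.sh configs/exp.py 8 work_dirs/exp/latest.pth
# Timestamp used to tag this run's log file.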
T=$(date +%m%d%H%M)
# -------------------------------------------------- #
# Usually you only need to customize these variables #
CFG=$1
GPUS=$2
RESUME_FROM=${3:-None} # Optional checkpoint to resume from; defaults to "None" if not given
# -------------------------------------------------- #
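# Use at most 8 GPUs per node; derive the node count unless WORLD_SIZE is already set (e.g. by a scheduler).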
GPUS_PER_NODE=$((GPUS < 8 ? GPUS : 8))
NNODES=${WORLD_SIZE:-$((GPUS / GPUS_PER_NODE))}
MASTER_PORT=${MASTER_PORT:-28567}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
export MASTER_ADDR=${MASTER_ADDR}
export MASTER_PORT=${MASTER_PORT}
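# RANK is this node's index (0 for single-node runs); multi-node launchers set it in the environment.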
RANK=${RANK:-0}
WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/
# Intermediate files and logs will be saved to work_dirs/
mkdir -p "${WORK_DIR}logs"
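# Make the repo root and its navsim subdirectory importable by train.py.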
export PYTHONPATH="$(realpath "$(dirname $0)/..")":"$(realpath "$(dirname $0)/../navsim")":$PYTHONPATH
export OMP_NUM_THREADS=1
echo "WORK_DIR: ${WORK_DIR}"
echo "GPUS_PER_NODE: ${GPUS_PER_NODE}"
echo "NNODES: ${NNODES}"
echo "RANK: ${RANK}"
echo "PYTHONPATH: ${PYTHONPATH}"
if [[ "$RESUME_FROM" != "None" && -n "$RESUME_FROM" ]]; then
RESUME_ARG="--resume-from $RESUME_FROM"
else
RESUME_ARG=""
fi
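# Launch one training process per GPU with torchrun; extra CLI args (from $4 on) are forwarded to train.py.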
torchrun \
    --nnodes=${NNODES} \
    --nproc_per_node=${GPUS_PER_NODE} \
    --node_rank=${RANK} \
    --master_addr=${MASTER_ADDR} \
    --master_port=${MASTER_PORT} \
    "$(dirname "$0")"/train.py \
    "$CFG" \
    --launcher pytorch $RESUME_ARG "${@:4}" \
    --work-dir ${WORK_DIR} \
    2>&1 | tee ${WORK_DIR}logs/train.$T