moebius / run /run_ddp_1node.sh
Mike0021's picture
Implement Moebius Gradio Space
166ab04 verified
Raw
History Blame Contribute Delete
1.89 kB
#! /bin/bash
# Working directory for the run: keep the caller's WORK_DIR when it is set and
# non-empty, otherwise fall back to the current directory.
: "${WORK_DIR:=.}"

############## set runtime args ###############
printf '%s\n' "- set runtime args ----------------"

# Job topology. Both values may be supplied through the environment; each
# defaults to 1 when unset or empty. One process is launched per GPU, so the
# total process count is machines x GPUs-per-machine.
: "${NUM_MACHINES:=1}"
: "${NUM_GPUS_PER_MACHINE:=1}"
NUM_PROCESSES=$(( NUM_MACHINES * NUM_GPUS_PER_MACHINE ))

# Disabled multi-node settings and process cleanup, kept for reference:
# MAIN_PROC_IP=${MAIN_PROC_IP:-""}
# MAIN_PROC_PORT=${MAIN_PROC_PORT:-8008}
# main_ID=${main_ID:-0}
# end=$((NUM_MACHINES - 1))
# echo "doing clean proc..."
# pkill python
# pkill accelerate

# check runtime env
# echo "main node ID: $main_ID"
# echo "main node IP: $MAIN_PROC_IP"
printf 'num machines: %s\nnum gpus/machine: %s\nnum gpu procs: %s\n' \
  "$NUM_MACHINES" "$NUM_GPUS_PER_MACHINE" "$NUM_PROCESSES"
# Short pause before the experiment setup starts, leaving time to read the
# runtime settings printed just before and abort with Ctrl-C if they are wrong.
sleep 5s

############## set experiment args ###############
echo "- set experiment args ----------------"

# The experiment config is a shell file given as the first positional
# argument. Fail fast when it is missing or not a file: otherwise `source`
# errors out, the output variables stay empty, and the log directory created
# below would resolve to "/log".
CONFIG_FILE=${1:-}
if [[ -z "$CONFIG_FILE" ]]; then
  echo "usage: $0 <config-file>" >&2
  exit 2
fi
if [[ ! -f "$CONFIG_FILE" ]]; then
  echo "error: config file not found: $CONFIG_FILE" >&2
  exit 1
fi

# Experiment name = config path minus an optional leading "./", the "config/"
# prefix and the ".sh" suffix. The dot is escaped so that only a literal "./"
# is stripped, not an arbitrary one-character directory such as "a/".
EXP_NAME=$(printf '%s\n' "$CONFIG_FILE" | sed -E 's|^(\./)?config/||; s|\.sh$||')

# check exp name
echo "exp name: $EXP_NAME"
export EXP_NAME # exported so child processes inherit it

echo "exp config: $CONFIG_FILE"
# shellcheck source=/dev/null
source "$CONFIG_FILE" # get TRAIN_ARGS,OUTPUT_DIR,OUTPUT_DIR_EXP_NAME from $CONFIG_FILE

# OUTPUT_DIR_EXP_NAME holds the log directory, so it must be set by the config.
if [[ -z "${OUTPUT_DIR_EXP_NAME:-}" ]]; then
  echo "error: $CONFIG_FILE did not set OUTPUT_DIR_EXP_NAME" >&2
  exit 1
fi

if [[ -n "${OUTPUT_DIR:-}" ]]; then
  mkdir -p -- "$OUTPUT_DIR" || exit 1
else
  echo "warning: $CONFIG_FILE did not set OUTPUT_DIR; not creating it" >&2
fi
# `mkdir -p` creates missing parents, so this also creates OUTPUT_DIR_EXP_NAME.
mkdir -p -- "$OUTPUT_DIR_EXP_NAME/log" || exit 1
sleep 5s

# check exp config
PY_TRAINER=${PY_TRAINER:-"train_ddp.py"}
echo "training python script: $PY_TRAINER"
printf "TRAIN_ARGS =\n"
# TRAIN_ARGS is a single whitespace-separated string; it is expanded unquoted
# on purpose so that each argument is printed on its own line.
# shellcheck disable=SC2086
for arg in ${TRAIN_ARGS:-}; do
  printf " %s\n" "$arg"
done
sleep 5s
############## set cmdline and launch ###############
echo "- set cmdline and launch ----------------"

# Launcher part of the command line. The backslash-newlines inside the double
# quotes are line continuations, so ACC_CMD is a single-line string.
ACC_CMD="accelerate launch \
  --num_cpu_threads_per_process 16 \
  --num_processes ${NUM_PROCESSES} \
  --num_machines ${NUM_MACHINES}"

# Trainer script followed by its arguments, as one whitespace-separated string.
TRAIN_CMD="${PY_TRAINER} ${TRAIN_ARGS}"

# Per-run log file: <OUTPUT_DIR_EXP_NAME>/log/<exp basename>_<timestamp>.log
LOG_FILE="${OUTPUT_DIR_EXP_NAME}/log/$(basename -- "$EXP_NAME")_$(date "+%Y.%m.%d-%H:%M:%S").log"

# ACC_CMD and TRAIN_CMD are expanded unquoted on purpose: they are flat strings
# that must be word-split into individual arguments. stdout and stderr of the
# run are shown on the terminal and copied to LOG_FILE.
# shellcheck disable=SC2086
$ACC_CMD --machine_rank 0 $TRAIN_CMD 2>&1 | tee -- "$LOG_FILE"

# A pipeline returns the status of its last command (tee), which would hide a
# failed training run. Propagate the launcher's own exit status instead.
exit "${PIPESTATUS[0]}"