#!/bin/bash
#
# Launch a training run through `accelerate launch` and tee its combined
# stdout/stderr into a timestamped log file.
#
# Usage:
#   <this script> CONFIG_FILE
#
# CONFIG_FILE is a shell fragment that is sourced into this script after
# EXP_NAME has been exported. By the time it returns, OUTPUT_DIR and
# OUTPUT_DIR_EXP_NAME must be non-empty. It may also set TRAIN_ARGS (a
# whitespace-separated argument string) and PY_TRAINER.
#
# Environment (all optional):
#   WORK_DIR              default "."
#   NUM_MACHINES          default 1
#   NUM_GPUS_PER_MACHINE  default 1
#   PY_TRAINER            default "train_ddp.py"

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Abort unless $2 (the value of the variable named $1) is a non-negative
# decimal integer without leading zeros, so it is safe inside $(( )).
require_uint() {
  local -r name=$1 value=$2
  local -r uint_re='^(0|[1-9][0-9]*)$'
  [[ "$value" =~ $uint_re ]] \
    || die "$name must be a non-negative integer, got '$value'"
}

# Fail fast instead of sourcing an empty file name further down.
if (( $# < 1 )); then
  printf 'usage: %s CONFIG_FILE\n' "${0##*/}" >&2
  exit 2
fi

# Not referenced again in this script.
# NOTE(review): presumably consumed by the sourced config - confirm.
WORK_DIR="${WORK_DIR:-.}"

echo "- set runtime args ----------------"
NUM_MACHINES=${NUM_MACHINES:-1}
NUM_GPUS_PER_MACHINE=${NUM_GPUS_PER_MACHINE:-1}
require_uint NUM_MACHINES "$NUM_MACHINES"
require_uint NUM_GPUS_PER_MACHINE "$NUM_GPUS_PER_MACHINE"
NUM_PROCESSES=$((NUM_MACHINES * NUM_GPUS_PER_MACHINE))

echo "num machines: $NUM_MACHINES"
echo "num gpus/machine: $NUM_GPUS_PER_MACHINE"
echo "num gpu procs: $NUM_PROCESSES"
sleep 5s

echo "- set experiment args ----------------"
CONFIG_FILE=$1
[[ -r "$CONFIG_FILE" ]] || die "config file is not readable: $CONFIG_FILE"

# Experiment name = config path without a leading "config/" or "./config/"
# and without a trailing ".sh".
EXP_NAME=$(printf '%s\n' "$CONFIG_FILE" | sed -E 's|^(\./)?config/||; s|\.sh$||')

echo "exp name: $EXP_NAME"
# Exported before sourcing so the config can refer to it.
export EXP_NAME
echo "exp config: $CONFIG_FILE"
# shellcheck source=/dev/null
source "$CONFIG_FILE"

# Without these guards an empty value would make the mkdir below target
# "/log" and the run log would be written under the filesystem root.
[[ -n "${OUTPUT_DIR:-}" ]] \
  || die "OUTPUT_DIR is empty after sourcing $CONFIG_FILE"
[[ -n "${OUTPUT_DIR_EXP_NAME:-}" ]] \
  || die "OUTPUT_DIR_EXP_NAME is empty after sourcing $CONFIG_FILE"

mkdir -p -- "$OUTPUT_DIR" "$OUTPUT_DIR_EXP_NAME" "$OUTPUT_DIR_EXP_NAME/log" \
  || die "cannot create output directories"
sleep 5s

PY_TRAINER=${PY_TRAINER:-"train_ddp.py"}
echo "training python script: $PY_TRAINER"

# TRAIN_ARGS is a plain string; word-splitting it here is intentional so that
# each whitespace-separated token becomes one argument of the trainer.
# shellcheck disable=SC2206
TRAIN_ARGS_ARRAY=($TRAIN_ARGS)
printf "TRAIN_ARGS =\n"
for arg in "${TRAIN_ARGS_ARRAY[@]}"; do
  printf " %s\n" "$arg"
done
sleep 5s

echo "- set cmdline and launch ----------------"
ACC_CMD=(
  accelerate launch
  --num_cpu_threads_per_process 16
  --num_processes "$NUM_PROCESSES"
  --num_machines "$NUM_MACHINES"
)
TRAIN_CMD=("$PY_TRAINER" "${TRAIN_ARGS_ARRAY[@]}")

LOG_FILE="${OUTPUT_DIR_EXP_NAME}/log/$(basename -- "$EXP_NAME")_$(date '+%Y.%m.%d-%H:%M:%S').log"

# With pipefail the script's exit status reflects a failing training run
# instead of always being tee's status.
set -o pipefail
"${ACC_CMD[@]}" --machine_rank 0 "${TRAIN_CMD[@]}" 2>&1 | tee -- "$LOG_FILE"