#!/bin/bash
#
# Slurm batch script that launches train_controlnet.py through
# `accelerate launch` under srun.
#
# Usage: sbatch <this-script> <config_file>
#   config_file  forwarded to train_controlnet.py as --config.
#
# NOTE(review): the lines directly after the shebang are blank in this copy;
# any `#SBATCH` directives that belong there are not visible -- confirm
# against the original job script.

# -x traces every command into the job log; -e aborts on the first
# unhandled command failure.
set -x -e

# The config file is mandatory: refuse to start without it.
if [[ -z "$1" ]]; then
  echo "No config file passed, quitting" >&2
  exit 1
fi
config_file=$1

# Bring up the Python environment and move into the training directory.
# NOTE(review): assumes ~/.bashrc makes `conda` available in this
# non-interactive shell -- confirm.
source ~/.bashrc
conda activate gencam
cd /datasets/sai/gencam/cogvideox/training

echo "START TIME: $(date)"

# NCCL transport settings exported to every launched process.
# NOTE(review): per the NCCL docs this disables the InfiniBand transport and
# restricts sockets to interfaces whose name starts with "ens" -- confirm
# this matches the cluster's network.
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

# Job topology, derived from the Slurm allocation.
GPUS_PER_NODE=4

# Fail early with a clear message when run outside a Slurm allocation,
# instead of a cryptic arithmetic error further down.
: "${SLURM_JOB_NODELIST:?SLURM_JOB_NODELIST is not set; run this script through sbatch}"
: "${SLURM_NNODES:?SLURM_NNODES is not set; run this script through sbatch}"

# The first hostname of the allocation is used as the rendezvous address.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# The pipeline's status is head's, so a failing scontrol would go unnoticed
# without this explicit check.
if [[ -z "$MASTER_ADDR" ]]; then
  echo "Could not resolve a master address from SLURM_JOB_NODELIST" >&2
  exit 1
fi
MASTER_PORT=6000
NNODES=$SLURM_NNODES
# NOTE(review): NODE_RANK is not referenced by the launcher command visible
# in this file -- confirm whether it is still needed.
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))

#######################################
# Print the comma-separated GPU ids 0..N-1.
# Arguments: $1 - number of GPUs on a node
# Outputs:   the id list on stdout, e.g. "0,1,2,3" for 4 (empty for 0)
#######################################
gpu_id_list() {
  local count=$1
  local ids=""
  local i
  for ((i = 0; i < count; i++)); do
    # Prefix a comma only once the list already has an entry.
    ids+="${ids:+,}${i}"
  done
  printf '%s\n' "$ids"
}

# Training entry point and its arguments.
CMD="train_controlnet.py --config $config_file"

# accelerate launcher command line. Kept as a flat string because the srun
# invocation expands it unquoted (word-split). --gpu_ids is derived from
# GPUS_PER_NODE so the two cannot drift apart; 4 is the fallback when
# GPUS_PER_NODE is unset, matching the previously hard-coded "0,1,2,3".
LAUNCHER="accelerate launch \
  --multi_gpu \
  --gpu_ids $(gpu_id_list "${GPUS_PER_NODE:-4}") \
  --num_processes $WORLD_SIZE \
  --num_machines $NNODES \
  --main_process_ip $MASTER_ADDR \
  --main_process_port $MASTER_PORT \
  --rdzv_backend=c10d \
  --max_restarts 0 \
  --tee 3 \
  "

# Extra srun options.
# NOTE(review): SRUN_ARGS is not passed to the srun call visible in this
# file -- confirm whether it was meant to be.
SRUN_ARGS=" \
  --wait=60 \
  --kill-on-bad-exit=1 \
  "

#######################################
# Signal handler: log the event, wait two minutes, then resubmit this
# script to Slurm with the same config file.
# Globals:   config_file (read)
# Outputs:   a timestamped message on stdout
# Returns:   the exit status of sbatch
#######################################
handler() {
  echo "Signal handler triggered at $(date)"

  # NOTE(review): the 120 s pause presumably gives the running training
  # time to react to the signal before the job is resubmitted -- confirm.
  sleep 120
  # Quoted so a script path or config path containing whitespace is passed
  # to sbatch as a single argument each.
  sbatch "${BASH_SOURCE[0]}" "$config_file"
}

# Resubmit the job when SIGUSR1 is delivered to this batch shell.
trap handler SIGUSR1

# `clear` is cosmetic only. It can fail when no usable TERM is set, and
# under `set -e` that failure would abort the job before training starts.
clear || true

# Run srun in the background and wait on it: bash only runs a trap promptly
# while blocked in the `wait` builtin, not while a foreground command runs.
# LAUNCHER and CMD are flat strings and are deliberately left unquoted so
# they word-split into separate arguments.
# shellcheck disable=SC2086
srun --cpu-bind=none --jobid "$SLURM_JOB_ID" $LAUNCHER $CMD &
srun_pid=$!

# Wait on the specific PID so the step's exit status is captured; a bare
# `wait` returns 0 regardless of how the background job ended. The status is
# also >128 when the wait is interrupted by the trapped signal. `|| ...`
# keeps `set -e` from exiting before the end time is logged.
srun_status=0
wait "$srun_pid" || srun_status=$?

echo "END TIME: $(date)"

# Propagate the training step's outcome as the job's exit status.
exit "$srun_status"