#!/bin/bash

# --- Environment setup -------------------------------------------------------
# Load the user's shell configuration before activating the conda environment
# (presumably this is what makes the `conda` command available in a batch
# shell -- confirm against the cluster setup).
source ~/.bashrc
conda activate gencam

# The launch command uses paths relative to the training directory, so abort
# rather than start training from the wrong working directory.
cd /datasets/sai/gencam/cogvideox/training || exit 1

# Expose GPUs 0-3 to the job.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# PyTorch CUDA caching-allocator option; see the PyTorch memory-management
# notes for the exact semantics of expandable_segments.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# --- Argument handling -------------------------------------------------------
# The training config path is the single required positional argument.
# ${1:-} keeps the emptiness test well-defined even when `set -u` is active.
if [[ -z "${1:-}" ]]; then
  # Diagnostics belong on stderr so they are not mixed into regular output.
  echo "No config file passed, quitting" >&2
  exit 1
fi
config_file=$1

#######################################
# SIGUSR1 handler: forward SIGUSR1 to the children of the backgrounded
# `accelerate launch` process, pause so they can react, then resubmit this
# script to the scheduler with the same config file.
# Globals:
#   accelerate_pid            (read) PID of the backgrounded launcher; may be
#                             unset if the signal arrives before launch
#   config_file               (read) config path passed to the resubmitted job
#   CHECKPOINT_GRACE_SECONDS  (read, optional) seconds to pause after
#                             signalling the children; defaults to 300
# Arguments:
#   None
# Outputs:
#   Progress messages on stdout; a warning on stderr when the launcher has no
#   child processes to signal.
#######################################
handler() {
  local -a child_pids=()
  local pid
  local grace_seconds=${CHECKPOINT_GRACE_SECONDS:-300}

  echo "function handler called at $(date)"

  if [[ -n "${accelerate_pid:-}" ]]; then
    echo "Sending SIGUSR1 to accelerate PID: $accelerate_pid"

    # The launcher may have several children (one per worker), so collect
    # every PID; `read` also strips the padding that `ps` puts around them.
    while read -r pid; do
      if [[ -n "$pid" ]]; then
        child_pids+=("$pid")
      fi
    done < <(ps --ppid "$accelerate_pid" -o pid=)

    if (( ${#child_pids[@]} > 0 )); then
      kill -USR1 "${child_pids[@]}"
      # Give the signalled processes time to react before resubmitting.
      sleep "$grace_seconds"
    else
      # Calling kill with no PIDs is an error, and with nothing signalled
      # there is nothing to wait for.
      echo "No child processes found for accelerate PID: $accelerate_pid" >&2
    fi
  else
    echo "No accelerate PID found"
  fi

  echo "Resubmitting job with config file: $config_file"
  # Quoted so that paths containing spaces reach sbatch as single arguments.
  sbatch "${BASH_SOURCE[0]}" "$config_file"
}

# --- Launch ------------------------------------------------------------------
# Route SIGUSR1 to handler. NOTE(review): presumably the scheduler is
# configured to deliver SIGUSR1 to this batch shell ahead of the time limit --
# confirm the job's submission options.
trap handler SIGUSR1

echo "Starting job at $(date)"

# Run training in the background: while the shell sits in `wait`, a trapped
# signal interrupts the wait and the handler runs right away instead of being
# deferred until the foreground command finishes.
accelerate launch \
  --config_file accelerator_configs/accelerator_train_config.yaml \
  --multi_gpu \
  train_controlnet.py --config "$config_file" &
accelerate_pid=$!

# Wait on the launcher specifically so its exit status becomes the status of
# this command; a bare `wait` reports success even when training failed.
wait "$accelerate_pid"