#SBATCH --job-name=rff_mevis_swint                    # Job name
#SBATCH --partition=a6000                             # Partition name
#SBATCH --gres=gpu:4                                  # Request 4 GPUs
#SBATCH --time=14-04:30:00                            # Time limit (14 days, 4 hours, 30 minutes)
#SBATCH --mem=80G                                     # Memory allocation
#SBATCH --cpus-per-task=16                            # Number of CPU cores per task
#SBATCH --output=logs/mevis_swin_tiny_trainlog.txt    # Path to output log file
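# Note: --gres=gpu:4 must stay in sync with --nproc_per_node in the launch
# command below; the script assumes one training process per allocated GPU.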
# Load necessary modules
ml purge
ml load cuda/11.8

# Activate conda environment
eval "$(conda shell.bash hook)"
conda activate videonemo
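# Optional sanity check (not part of the original script): fail fast if the
# environment is missing or broken, so the error shows up in the SLURM log
# before any GPU time is consumed.
if ! python3 -c "import torch" >/dev/null 2>&1; then
    echo "ERROR: 'videonemo' environment is not usable (torch import failed)" >&2
    exit 1
fi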
cd /data2/projects/chaeyun/NeMo/ReferFormer/

# Arguments passed to the script
OUTPUT_DIR=$1
PRETRAINED_WEIGHTS=$2
PY_ARGS=${@:3}          # Capture any additional arguments
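# Example invocation (script name and paths are placeholders, not from the
# original script); everything after the first two arguments is forwarded
# verbatim to main.py:
#   sbatch train_mevis_swint.sh /path/to/output_dir /path/to/pretrained.pth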
# Log file path in the output directory
LOG_FILE=${OUTPUT_DIR}/training_log_$(date +"%Y%m%d_%H%M%S").log

echo "Loading pretrained weights from: ${PRETRAINED_WEIGHTS}"

# Ensure the output directory exists
mkdir -p ${OUTPUT_DIR}
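# Optional guard (assumption, not in the original script): stop early if the
# checkpoint path does not exist rather than failing inside the training code.
if [ -n "${PRETRAINED_WEIGHTS}" ] && [ ! -f "${PRETRAINED_WEIGHTS}" ]; then
    echo "ERROR: pretrained weights not found at ${PRETRAINED_WEIGHTS}" >&2
    exit 1
fi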

# Launch training using torch.distributed.launch with SLURM's GPU allocation
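# Note: recent PyTorch releases deprecate torch.distributed.launch in favor of
# torchrun, which always passes the rank via environment variables (the role of
# --use_env here); an equivalent launch would look roughly like
#   torchrun --nproc_per_node=4 main.py ...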
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python3 -m torch.distributed.launch --nproc_per_node=4 --use_env \
    main.py --with_box_refine --binary --freeze_text_encoder \
    --epochs 6 --lr_drop 3 5 \
    --dataset_file 'mevis' --batch_size 2 \
    --output_dir=${OUTPUT_DIR} --pretrained_weights=${PRETRAINED_WEIGHTS} ${PY_ARGS} \
    2>&1 | tee ${LOG_FILE}    # Redirect stdout and stderr to the log file
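# Note: because output is piped through tee, the script's exit status reflects
# tee rather than python3; adding 'set -o pipefail' near the top (or checking
# ${PIPESTATUS[0]}) would make the SLURM job report a failed training run.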
| echo "Training completed. Output and logs saved to: ${OUTPUT_DIR}" | |