#!/bin/bash
#SBATCH --job-name=rff_mevis_swint                  # Job name
#SBATCH --partition=a6000                           # Partition name
#SBATCH --gres=gpu:4                                # Use 4 GPUs
#SBATCH --time=14-04:30:00                          # Time limit (14 days, 4 hours, 30 minutes)
#SBATCH --mem=80G                                   # Memory allocation
#SBATCH --cpus-per-task=16                          # Number of CPU cores per task
#SBATCH --output=logs/mevis_swin_tiny_trainlog.txt  # Path to output log file (logs/ must exist before submission)

# Load necessary modules
ml purge
ml load cuda/11.8

# Activate conda environment
eval "$(conda shell.bash hook)"
conda activate videonemo

cd /data2/projects/chaeyun/NeMo/ReferFormer/

# Arguments passed to the script
OUTPUT_DIR=$1
PRETRAINED_WEIGHTS=$2
PY_ARGS=("${@:3}")  # Capture any additional arguments as an array (preserves per-argument quoting)

# Log file path in the output directory
LOG_FILE="${OUTPUT_DIR}/training_log_$(date +"%Y%m%d_%H%M%S").log"

echo "Loading pretrained weights from: ${PRETRAINED_WEIGHTS}"

# Ensure the output directory exists
mkdir -p "${OUTPUT_DIR}"

# Launch training using torch.distributed.launch across SLURM's 4 allocated GPUs
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python3 -m torch.distributed.launch --nproc_per_node=4 --use_env \
main.py --with_box_refine --binary --freeze_text_encoder \
--epochs 6 --lr_drop 3 5 \
--dataset_file 'mevis' --batch_size 2 \
--output_dir="${OUTPUT_DIR}" --pretrained_weights="${PRETRAINED_WEIGHTS}" "${PY_ARGS[@]}" \
2>&1 | tee "${LOG_FILE}"  # Mirror stdout and stderr to the log file

echo "Training completed. Output and logs saved to: ${OUTPUT_DIR}"
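
# ------------------------------------------------------------------
# Usage sketch (the paths and extra flag below are hypothetical,
# substitute your own):
#
#   sbatch train_mevis_swint.sh \
#       /data2/projects/chaeyun/outputs/mevis_swint \
#       /data2/projects/chaeyun/weights/swin_tiny_pretrained.pth \
#       --backbone swin_t_p4w7
#
# Positional arguments 1 and 2 become OUTPUT_DIR and
# PRETRAINED_WEIGHTS; everything after them is forwarded verbatim
# to main.py via PY_ARGS.
# ------------------------------------------------------------------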
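
# Optional hardening sketch (not in the original script): fail fast
# when the two required positional arguments are missing, instead of
# letting mkdir/tee write to unintended paths.
#
#   if [[ -z "$1" || -z "$2" ]]; then
#       echo "Usage: sbatch $0 <OUTPUT_DIR> <PRETRAINED_WEIGHTS> [extra main.py args]" >&2
#       exit 1
#   fi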
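
# Note: torch.distributed.launch is deprecated in recent PyTorch
# releases in favor of torchrun. A minimal equivalent launch (untested
# sketch; assumes the same single-node, 4-GPU setup, and that main.py
# reads LOCAL_RANK from the environment, which --use_env already implies):
#
#   torchrun --standalone --nproc_per_node=4 \
#       main.py --with_box_refine --binary --freeze_text_encoder \
#       --epochs 6 --lr_drop 3 5 \
#       --dataset_file 'mevis' --batch_size 2 \
#       --output_dir="${OUTPUT_DIR}" --pretrained_weights="${PRETRAINED_WEIGHTS}" "${PY_ARGS[@]}"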