# MCNeMo / ReferFormer / scripts / dist_train_mevis.sh
# Uploaded by dianecy via huggingface_hub (revision 729c925, verified).
# NOTE(review): these header lines were web-page residue sitting before the
# shebang; commented out so the script is runnable. Ideally delete them so
# the '#!/bin/bash' line below is line 1 and is honored by the kernel.
#!/bin/bash
# SLURM batch script: distributed training of ReferFormer (Swin-Tiny) on MeViS.
# Usage: sbatch dist_train_mevis.sh OUTPUT_DIR PRETRAINED_WEIGHTS [extra main.py args...]
#SBATCH --job-name=rff_mevis_swint # Job name
#SBATCH --partition=a6000 # Partition name
#SBATCH --gres=gpu:4 # Request 4 GPUs (must match --nproc_per_node below)
#SBATCH --time=14-04:30:00 # Time limit (14 days, 4 hours, 30 minutes)
#SBATCH --mem=80G # Memory allocation
#SBATCH --cpus-per-task=16 # Number of CPU cores per task
#SBATCH --output=logs/mevis_swin_tiny_trainlog.txt # Path to output log file
# Fail fast: abort on unhandled command errors and on failures anywhere in a
# pipeline. (-u is deliberately omitted: PYTHONPATH may legitimately be unset
# in a fresh SLURM job and is referenced bare later in this script.)
set -eo pipefail

# Load necessary modules
ml purge
ml load cuda/11.8

# Activate conda environment
eval "$(conda shell.bash hook)"
conda activate videonemo

# Fail loudly if the project directory is missing instead of silently
# training from whatever $PWD the scheduler handed us.
cd /data2/projects/chaeyun/NeMo/ReferFormer/ || exit 1
# Arguments passed to the script:
#   $1  - output directory for checkpoints and logs
#   $2  - path to pretrained weights
#   $3+ - any additional arguments, forwarded verbatim to main.py
# NOTE(review): $1/$2 are not validated; consider a usage message + exit
# when they are empty — confirm desired behavior with the job owner first.
OUTPUT_DIR=$1
PRETRAINED_WEIGHTS=$2
PY_ARGS=${*:3} # Extra args joined into one string; intentionally word-split at use site

# Timestamped log file inside the output directory.
LOG_FILE="${OUTPUT_DIR}/training_log_$(date +"%Y%m%d_%H%M%S").log"

echo "Loading pretrained weights from: ${PRETRAINED_WEIGHTS}"

# Ensure the output directory exists. Quote the expansion and pass '--' so a
# path containing spaces or starting with '-' is not mangled or parsed as a
# mkdir option (the original unquoted form broke on such paths).
mkdir -p -- "${OUTPUT_DIR}"
# Launch training using torch.distributed.launch with SLURM's GPU allocation.
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch;
# migrate to torchrun when the environment's PyTorch version allows.
# ${PYTHONPATH:-} keeps this safe even if PYTHONPATH is unset; "$0" and the
# path variables are quoted so paths with spaces survive. ${PY_ARGS} is left
# UNQUOTED on purpose: it is a space-joined string of extra CLI flags that
# must word-split back into separate arguments.
PYTHONPATH="$(dirname "$0")/..":${PYTHONPATH:-} \
python3 -m torch.distributed.launch --nproc_per_node=4 --use_env \
main.py --with_box_refine --binary --freeze_text_encoder \
--epochs 6 --lr_drop 3 5 \
--dataset_file 'mevis' --batch_size 2 \
--output_dir="${OUTPUT_DIR}" --pretrained_weights="${PRETRAINED_WEIGHTS}" ${PY_ARGS} \
2>&1 | tee "${LOG_FILE}" # Mirror stdout+stderr to console and the log file

# 'tee' exits 0 even when training fails, so the pipeline's status hides the
# trainer's. Propagate the python process's real exit code to SLURM.
TRAIN_STATUS=${PIPESTATUS[0]}
if [ "${TRAIN_STATUS}" -ne 0 ]; then
    echo "Training failed with exit code ${TRAIN_STATUS}. See log: ${LOG_FILE}" >&2
    exit "${TRAIN_STATUS}"
fi

echo "Training completed. Output and logs saved to: ${OUTPUT_DIR}"