# MCNeMo / ReferFormer / scripts / dist_train_mevis.sh
# Uploaded by dianecy via huggingface_hub (revision 729c925, verified).
# NOTE(review): these header lines were web-page residue sitting before the
# shebang; commented out so the script is runnable. Ideally delete them so
# the '#!/bin/bash' line below is line 1 and is honored by the kernel.
#!/bin/bash
# SLURM batch script: distributed training of ReferFormer (Swin-Tiny) on MeViS.
# Usage: sbatch dist_train_mevis.sh OUTPUT_DIR PRETRAINED_WEIGHTS [extra main.py args...]
#SBATCH --job-name=rff_mevis_swint # Job name
#SBATCH --partition=a6000 # Partition name
#SBATCH --gres=gpu:4 # Request 4 GPUs (must match --nproc_per_node below)
#SBATCH --time=14-04:30:00 # Time limit (14 days, 4 hours, 30 minutes)
#SBATCH --mem=80G # Memory allocation
#SBATCH --cpus-per-task=16 # Number of CPU cores per task
#SBATCH --output=logs/mevis_swin_tiny_trainlog.txt # Path to output log file
# Fail fast: abort on unhandled command errors and on failures anywhere in a
# pipeline. (-u is deliberately omitted: PYTHONPATH may legitimately be unset
# in a fresh SLURM job and is referenced bare later in this script.)
set -eo pipefail

# Load necessary modules
ml purge
ml load cuda/11.8

# Activate conda environment
eval "$(conda shell.bash hook)"
conda activate videonemo

# Fail loudly if the project directory is missing instead of silently
# training from whatever $PWD the scheduler handed us.
cd /data2/projects/chaeyun/NeMo/ReferFormer/ || exit 1
# Arguments passed to the script:
#   $1  - output directory for checkpoints and logs
#   $2  - path to pretrained weights
#   $3+ - any additional arguments, forwarded verbatim to main.py
# NOTE(review): $1/$2 are not validated; consider a usage message + exit
# when they are empty — confirm desired behavior with the job owner first.
OUTPUT_DIR=$1
PRETRAINED_WEIGHTS=$2
PY_ARGS=${*:3} # Extra args joined into one string; intentionally word-split at use site

# Timestamped log file inside the output directory.
LOG_FILE="${OUTPUT_DIR}/training_log_$(date +"%Y%m%d_%H%M%S").log"

echo "Loading pretrained weights from: ${PRETRAINED_WEIGHTS}"

# Ensure the output directory exists. Quote the expansion and pass '--' so a
# path containing spaces or starting with '-' is not mangled or parsed as a
# mkdir option (the original unquoted form broke on such paths).
mkdir -p -- "${OUTPUT_DIR}"
# Launch training using torch.distributed.launch with SLURM's GPU allocation.
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch;
# migrate to torchrun when the environment's PyTorch version allows.
# ${PYTHONPATH:-} keeps this safe even if PYTHONPATH is unset; "$0" and the
# path variables are quoted so paths with spaces survive. ${PY_ARGS} is left
# UNQUOTED on purpose: it is a space-joined string of extra CLI flags that
# must word-split back into separate arguments.
PYTHONPATH="$(dirname "$0")/..":${PYTHONPATH:-} \
python3 -m torch.distributed.launch --nproc_per_node=4 --use_env \
main.py --with_box_refine --binary --freeze_text_encoder \
--epochs 6 --lr_drop 3 5 \
--dataset_file 'mevis' --batch_size 2 \
--output_dir="${OUTPUT_DIR}" --pretrained_weights="${PRETRAINED_WEIGHTS}" ${PY_ARGS} \
2>&1 | tee "${LOG_FILE}" # Mirror stdout+stderr to console and the log file

# 'tee' exits 0 even when training fails, so the pipeline's status hides the
# trainer's. Propagate the python process's real exit code to SLURM.
TRAIN_STATUS=${PIPESTATUS[0]}
if [ "${TRAIN_STATUS}" -ne 0 ]; then
    echo "Training failed with exit code ${TRAIN_STATUS}. See log: ${LOG_FILE}" >&2
    exit "${TRAIN_STATUS}"
fi

echo "Training completed. Output and logs saved to: ${OUTPUT_DIR}"