#!/bin/bash
# Training launcher for CGFormer (intended to be submitted via sbatch).
# Prepares the module environment, activates the conda env, and moves to
# the project root before the distributed training run below.
#
# Fix: the shebang must be the very first line of the file — the original
# had blank lines above it, so the interpreter line was ignored.

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Reset the module environment to a known-clean state, then load CUDA 11.8.
ml purge
ml load cuda/11.8

# Initialize conda for this non-interactive shell, then activate the env.
eval "$(conda shell.bash hook)"
conda activate ris_all

# Move to the project root; abort loudly if the path is missing
# (an unchecked cd would silently run training from the wrong directory).
cd /data2/projects/chaeyun/CGFormer/ || { echo "ERROR: cannot cd to project dir" >&2; exit 1; }
|
|
|
|
|
|
|
|
# --- Runtime environment tuning --------------------------------------
# Allow TF32 math on Ampere+ GPUs for faster matmuls.
export NVIDIA_TF32_OVERRIDE=1
# Verbose NCCL logging, plus more tolerant InfiniBand timeout/retry
# settings for flaky interconnects.
export NCCL_DEBUG=INFO
export NCCL_IB_TIMEOUT=100
export NCCL_IB_RETRY_CNT=15
|
|
|
|
|
# Require exactly two positional arguments: <OUTPUT_DIR> and <EXP_NAME>.
if [ "$#" -ne 2 ]; then
  # Fix: the usage message is a diagnostic and belongs on stderr, not
  # mixed into the job's stdout log.
  echo "Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>" >&2
  exit 1
fi
|
|
|
|
|
|
|
|
# --- Distributed-launch settings --------------------------------------
GPUS=2                                 # number of worker processes (one per GPU)
MASTER_PORT=7028                       # rendezvous port for torch.distributed

# --- Experiment arguments (from the command line) ---------------------
OUTPUT_DIR=$1                          # where checkpoints/logs are written
EXP_NAME=$2                            # experiment identifier

# --- Hyper-parameters forwarded to the training config ----------------
MARGIN=12                              # TRAIN.margin_value
TEMP=0.07                              # TRAIN.temperature
MODE=hardpos_only_sbertsim_refined     # TRAIN.metric_mode
MLW=0.1                                # TRAIN.metric_loss_weight
BATCH_SIZE=10                          # TRAIN.batch_size
MIXUP_FQ=True                          # TRAIN.mixup_lasttwo
|
|
|
|
|
echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"

# Launch one training process per GPU.
# Fix: all expansions are quoted so an OUTPUT_DIR or EXP_NAME containing
# spaces (or glob characters) is passed as a single argument instead of
# being word-split by the shell.
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch;
# consider migrating to `torchrun` once train_gref.py reads LOCAL_RANK
# from the environment — confirm before switching.
python -m torch.distributed.launch \
    --nproc_per_node="${GPUS}" \
    --master_port="${MASTER_PORT}" \
    train_gref.py \
    --config config/config_gref_ace.yaml \
    --opts TRAIN.batch_size "${BATCH_SIZE}" \
           TRAIN.exp_name "${EXP_NAME}" \
           TRAIN.output_folder "${OUTPUT_DIR}" \
           TRAIN.metric_mode "${MODE}" \
           TRAIN.metric_loss_weight "${MLW}" \
           TRAIN.margin_value "${MARGIN}" \
           TRAIN.temperature "${TEMP}" \
           TRAIN.mixup_lasttwo "${MIXUP_FQ}"