#!/bin/bash
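
# Environment setup: reset the module environment, load CUDA 11.8, and activate the conda env.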
ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"
conda activate risall
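
# Move to the CGFormer project root.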
cd /data2/projects/chaeyun/CGFormer/ || exit 1
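
# NCCL / CUDA runtime settings for multi-GPU training.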
export NCCL_P2P_DISABLE=1
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15
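
# Require exactly two positional arguments: <OUTPUT_DIR> and <EXP_NAME>.
# Example invocation (hypothetical values): sbatch train.sh ./outputs refzom_sbert_run1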
if [ "$#" -ne 2 ]; then
    echo "Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>"
    exit 1
fi
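
# Distributed launch settings: number of GPUs per node and rendezvous port.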
GPUS=6
MASTER_PORT=5028
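
# Positional arguments.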
OUTPUT_DIR="$1"
EXP_NAME="$2"
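
# Metric-learning hyperparameters.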
MARGIN=12
TEMP=0.07
MODE=hardpos_only_sbertsim_refined
MLW=0.1
BATCH_SIZE=36
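
# Fusion and filtering options.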
MIXUP_FQ=False
FUSE_MODE=lang_tf_attn_wope
FILTER_THRES=0.68
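
# Report the run configuration before launching.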
echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"
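
# Launch one process per GPU; --opts supplies key/value pairs that (presumably) override entries in the YAML config.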
python -m torch.distributed.launch \
    --nproc_per_node=${GPUS} \
    --master_port=${MASTER_PORT} \
    train_refzom_sbert.py \
    --config config/config_refzom_ace.yaml \
    --opts TRAIN.batch_size ${BATCH_SIZE} \
           TRAIN.exp_name ${EXP_NAME} \
           TRAIN.output_folder ${OUTPUT_DIR} \
           TRAIN.metric_mode ${MODE} \
           TRAIN.metric_loss_weight ${MLW} \
           TRAIN.margin_value ${MARGIN} \
           TRAIN.temperature ${TEMP} \
           TRAIN.filter_threshold ${FILTER_THRES} \
           TRAIN.mixup_lasttwo ${MIXUP_FQ} \
           TRAIN.fuse_mode ${FUSE_MODE}