#!/bin/bash
#SBATCH --job-name=rz-cgf-sa
#SBATCH --partition=a6000
#SBATCH --gres=gpu:4
#SBATCH --time=13-11:30:00   # d-hh:mm:ss
#SBATCH --mem=80000          # CPU memory (MB)
#SBATCH --cpus-per-task=12   # Number of CPU cores
#SBATCH --output=./refzom_filter_exp/filter_fuse_simpleattn_th07_bs24.log

ml purge
ml load cuda/11.8

eval "$(conda shell.bash hook)"
conda activate risall

cd /data2/projects/chaeyun/CGFormer/

export NCCL_P2P_DISABLE=1
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15

if [ "$#" -ne 2 ]; then
    echo "Usage: sbatch train.sh <output_dir> <exp_name>"
    exit 1
fi

# Distributed setup (adjust as needed)
GPUS=4
MASTER_PORT=9522

# Positional inputs
OUTPUT_DIR=$1
EXP_NAME=$2

# Training hyperparameters
MARGIN=12
TEMP=0.07
MODE=hardpos_only_sbertsim_refined
MLW=0.1
BATCH_SIZE=24

# Example submission:
#   sbatch scripts/train_refzom_simpleattn.sh exp_sanity/refzom filter_fuse_simpleattn_th07_bs24
# Corresponds to: self.fuse_mode == 'simple_attn' and not self.mixup_lasttwo
MIXUP_FQ=False
FUSE_MODE=simple_attn
FILTER_THRES=0.68

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"

python -m torch.distributed.launch \
    --nproc_per_node=${GPUS} \
    --master_port=${MASTER_PORT} \
    train_refzom_sbert.py \
    --config config/config_refzom_ace.yaml \
    --opts TRAIN.batch_size ${BATCH_SIZE} \
           TRAIN.exp_name ${EXP_NAME} \
           TRAIN.output_folder ${OUTPUT_DIR} \
           TRAIN.metric_mode ${MODE} \
           TRAIN.metric_loss_weight ${MLW} \
           TRAIN.margin_value ${MARGIN} \
           TRAIN.temperature ${TEMP} \
           TRAIN.filter_threshold ${FILTER_THRES} \
           TRAIN.mixup_lasttwo ${MIXUP_FQ} \
           TRAIN.fuse_mode ${FUSE_MODE}
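
# Note: recent PyTorch releases deprecate torch.distributed.launch in favor of
# torchrun. A minimal alternative invocation is sketched below, assuming
# train_refzom_sbert.py reads LOCAL_RANK from the environment rather than a
# --local_rank CLI argument (an assumption about this repo's training script;
# verify before switching):
#
#   torchrun --nproc_per_node=${GPUS} --master_port=${MASTER_PORT} \
#       train_refzom_sbert.py \
#       --config config/config_refzom_ace.yaml \
#       --opts TRAIN.batch_size ${BATCH_SIZE} ...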