#!/bin/bash
#SBATCH --job-name=cgf-th07
#SBATCH --partition=a6000
#SBATCH --gres=gpu:4
#SBATCH --time=13-11:30:00              # d-hh:mm:ss
#SBATCH --mem=52000                     # CPU memory size (MB)
#SBATCH --cpus-per-task=8               # Number of CPU cores
#SBATCH --output=./refcoco+_filter_exp/filter_noproj_thr07_bs48.log

# Launch distributed CGFormer training (RefCOCO+ filtering experiment) on 4 GPUs.
# Usage: sbatch train_rcc+.sh <output_dir> <exp_name>

# Fail fast on errors and broken pipelines. -u (nounset) is deliberately
# omitted: conda's activation hooks are not nounset-clean.
set -eo pipefail

ml purge
ml load cuda/11.8

eval "$(conda shell.bash hook)"
conda activate risall

cd /data2/projects/chaeyun/CGFormer/ || exit 1

# NCCL / CUDA runtime tuning for multi-GPU training on this cluster.
export NCCL_P2P_DISABLE=1
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15

if [ "$#" -ne 2 ]; then
  echo "Usage: sbatch train_rcc+.sh <output_dir> <exp_name>" >&2
  exit 1
fi

# --- per-run settings (edit these between experiments) ---
GPUS=4
MASTER_PORT=4018

# positional inputs
OUTPUT_DIR="$1"
EXP_NAME="$2"

# hyperparameters forwarded as config overrides
MARGIN=12
TEMP=0.07
MODE=hardpos_only_sbertsim_refined
MLW=0.1
BATCH_SIZE=48
MIXUP_FQ=False
USE_PROJECTIONS=False
# NOTE(review): log/exp names say "thr07" but the threshold is 0.68 — confirm
# this is intentional (see the 0.52 variant noted at the bottom).
FTHRES=0.68

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"

# torch.distributed.launch is deprecated in newer PyTorch (use torchrun),
# kept here because the pinned environment expects it.
python -m torch.distributed.launch \
  --nproc_per_node="${GPUS}" \
  --master_port="${MASTER_PORT}" \
  train_rcc_sbert.py \
  --config config/config_rccp_ace.yaml \
  --opts TRAIN.batch_size "${BATCH_SIZE}" \
  TRAIN.exp_name "${EXP_NAME}" \
  TRAIN.output_folder "${OUTPUT_DIR}" \
  TRAIN.metric_mode "${MODE}" \
  TRAIN.metric_loss_weight "${MLW}" \
  TRAIN.margin_value "${MARGIN}" \
  TRAIN.temperature "${TEMP}" \
  TRAIN.filter_threshold "${FTHRES}" \
  TRAIN.mixup_lasttwo "${MIXUP_FQ}" \
  TRAIN.use_projections "${USE_PROJECTIONS}"

# Example invocations:
# sbatch train_rcc+.sh exp_sanity/refcoco+ filter_noproj_thr07_bs48
# change FTHRES to 0.52
# sbatch train_rcc+.sh exp_sanity/refcoco+ filter_noproj_thr05_bs48
# bash train_rcc_bash.sh exp_sanity/refcoco filter_noproj_thr05_bs48