#!/bin/bash
#SBATCH --job-name=sanity_check
#SBATCH --partition=a6000
#SBATCH --nodelist=node07
#SBATCH --gres=gpu:4
#SBATCH --time=13-11:30:00   # d-hh:mm:ss
#SBATCH --mem=52000          # CPU memory size
#SBATCH --cpus-per-task=8    # Number of CPU cores
#SBATCH --output=./refcoco_filter_exp/filter_noproj_thr07_bs48.log

# Launch distributed training of train_rcc_sbert.py (CGFormer) on 4 GPUs.
#
# Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>
#   $1 - output folder passed as TRAIN.output_folder
#   $2 - experiment name passed as TRAIN.exp_name

# Validate arguments first, before mutating modules / conda environment.
if [ "$#" -ne 2 ]; then
  echo "Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>" >&2
  exit 1
fi

ml purge
ml load cuda/11.8

eval "$(conda shell.bash hook)"
conda activate risall

# Abort if the project directory is unavailable; otherwise the trainer would
# be launched from whatever directory sbatch happened to start in.
cd /data2/projects/chaeyun/CGFormer/ || { echo "ERROR: cannot cd to project dir" >&2; exit 1; }

export NCCL_P2P_DISABLE=1
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
# NOTE(review): NCCL_TIMEOUT is not a documented NCCL env var — confirm the
# training code (or a wrapper) actually reads it; distributed timeouts are
# usually set via torch.distributed init or NCCL's own knobs.
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15

# ---- fixed run configuration ----
GPUS=4
MASTER_PORT=2384

# ---- positional inputs ----
OUTPUT_DIR=$1
EXP_NAME=$2

# ---- hyperparameters for this experiment ----
MARGIN=12
TEMP=0.07
MODE=hardpos_only_sbertsim_refined
MLW=0.1
BATCH_SIZE=48
MIXUP_FQ=False
USE_PROJECTIONS=False

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"

# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch in
# favor of torchrun; kept as-is since train_rcc_sbert.py may depend on the
# --local_rank argument that launch injects — verify before migrating.
python -m torch.distributed.launch \
  --nproc_per_node="${GPUS}" \
  --master_port="${MASTER_PORT}" \
  train_rcc_sbert.py \
  --config config/config_rcc_ace.yaml \
  --opts TRAIN.batch_size "${BATCH_SIZE}" \
         TRAIN.exp_name "${EXP_NAME}" \
         TRAIN.output_folder "${OUTPUT_DIR}" \
         TRAIN.metric_mode "${MODE}" \
         TRAIN.metric_loss_weight "${MLW}" \
         TRAIN.margin_value "${MARGIN}" \
         TRAIN.temperature "${TEMP}" \
         TRAIN.mixup_lasttwo "${MIXUP_FQ}" \
         TRAIN.use_projections "${USE_PROJECTIONS}"