#!/bin/bash

GPUS=4
MASTER_PORT=2948

if [ "$#" -ne 2 ]; then
  echo "Usage: bash train.sh <OUTPUT_DIR> <EXP_NAME>"
  exit 1
fi

OUTPUT_DIR=$1
EXP_NAME=$2

LOG_DIR="./refcoco_filter_exp"
LOG_FILE="${LOG_DIR}/${EXP_NAME}.log"
mkdir -p "${LOG_DIR}"

# Hyperparameters forwarded to the TRAIN.* config overrides below.
MARGIN=12                             # TRAIN.margin_value
TEMP=0.07                             # TRAIN.temperature
MODE=hardpos_only_sbertsim_refined    # TRAIN.metric_mode
MLW=0.1                               # TRAIN.metric_loss_weight
BATCH_SIZE=48                         # TRAIN.batch_size
MIXUP_FQ=False                        # TRAIN.mixup_lasttwo
USE_PROJECTIONS=False                 # TRAIN.use_projections

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"
echo "Logging to: ${LOG_FILE}"

# Environment setup: CUDA module and conda environment.
ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"
conda activate risall

cd /data2/projects/chaeyun/CGFormer/ || exit 1

# NCCL / CUDA runtime settings for multi-GPU training.
export NCCL_P2P_DISABLE=1
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15

CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch \
    --nproc_per_node=${GPUS} \
    --master_port=${MASTER_PORT} \
    train_rcc_sbert.py \
    --config config/config_rcc_ace.yaml \
    --opts TRAIN.batch_size ${BATCH_SIZE} \
           TRAIN.exp_name "${EXP_NAME}" \
           TRAIN.output_folder "${OUTPUT_DIR}" \
           TRAIN.metric_mode ${MODE} \
           TRAIN.metric_loss_weight ${MLW} \
           TRAIN.margin_value ${MARGIN} \
           TRAIN.temperature ${TEMP} \
           TRAIN.filter_threshold 0.52 \
           TRAIN.mixup_lasttwo ${MIXUP_FQ} \
           TRAIN.use_projections ${USE_PROJECTIONS} \
    > "${LOG_FILE}" 2>&1
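
# A minimal usage sketch. The output directory and experiment name below are
# hypothetical placeholders, not values taken from the project:
#
#   bash train.sh ./checkpoints/refcoco_run1 hardpos_m12_t007
#
# With these arguments, the launcher's stdout/stderr is redirected to
# ./refcoco_filter_exp/hardpos_m12_t007.log, and training outputs go under
# ./checkpoints/refcoco_run1 via the TRAIN.output_folder override.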