#!/bin/bash
# Distributed training launcher for CGFormer (train_rcc_sbert.py) on 4 GPUs.
# Output is logged to ./refcoco_filter_exp/<EXP_NAME>.log.
GPUS=4
MASTER_PORT=2948
if [ "$#" -ne 2 ]; then
echo "Usage: bash train.sh <OUTPUT_DIR> <EXP_NAME>"
exit 1
fi
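# Example invocation (paths and names below are hypothetical, for illustration only):
#   bash train.sh ./outputs/refcoco_run1 refcoco_hardpos_m12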
OUTPUT_DIR=$1
EXP_NAME=$2
LOG_DIR="./refcoco_filter_exp"
LOG_FILE="${LOG_DIR}/${EXP_NAME}.log"
mkdir -p "${LOG_DIR}"
# Hyperparameters, forwarded to the config via --opts below.
MARGIN=12                           # TRAIN.margin_value: margin for the metric loss
TEMP=0.07                           # TRAIN.temperature: contrastive temperature
MODE=hardpos_only_sbertsim_refined  # TRAIN.metric_mode: refined SBERT-similarity hard positives only
MLW=0.1                             # TRAIN.metric_loss_weight
BATCH_SIZE=48                       # TRAIN.batch_size
MIXUP_FQ=False                      # TRAIN.mixup_lasttwo: mixup on the last two levels (disabled)
USE_PROJECTIONS=False               # TRAIN.use_projections: extra projection heads (disabled)
echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"
echo "Logging to: ${LOG_FILE}"
# Environment setup: module-managed CUDA plus the project conda env.
ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"  # makes `conda activate` work in non-interactive shells
conda activate risall
cd /data2/projects/chaeyun/CGFormer/ || exit 1
# NCCL / CUDA runtime settings for multi-GPU communication.
export NCCL_P2P_DISABLE=1      # disable GPU peer-to-peer transfers (common workaround for P2P hangs)
export NVIDIA_TF32_OVERRIDE=1  # force-enable TF32 math on Ampere+ GPUs
export NCCL_DEBUG=INFO         # verbose NCCL logging
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15    # raise the InfiniBand retry count
# Launch distributed training. torch.distributed.launch is deprecated in recent
# PyTorch releases (torchrun is the replacement) but is kept here as in the original.
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch \
    --nproc_per_node=${GPUS} \
    --master_port=${MASTER_PORT} \
    train_rcc_sbert.py \
    --config config/config_rcc_ace.yaml \
    --opts TRAIN.batch_size "${BATCH_SIZE}" \
           TRAIN.exp_name "${EXP_NAME}" \
           TRAIN.output_folder "${OUTPUT_DIR}" \
           TRAIN.metric_mode "${MODE}" \
           TRAIN.metric_loss_weight "${MLW}" \
           TRAIN.margin_value "${MARGIN}" \
           TRAIN.temperature "${TEMP}" \
           TRAIN.filter_threshold 0.52 \
           TRAIN.mixup_lasttwo "${MIXUP_FQ}" \
           TRAIN.use_projections "${USE_PROJECTIONS}" \
    > "${LOG_FILE}" 2>&1