#!/bin/bash
# Launch distributed RIS training (CGFormer, RefCOCO filtering experiment).
# Usage: bash train.sh <OUTPUT_DIR> <EXP_NAME>
#
# FIX: the shebang must be the very first line of the file — the original
# had blank lines above it, so the kernel ignored `#!/bin/bash`.

# Number of GPUs / worker processes for the distributed launcher.
GPUS=4

# Rendezvous port for torch.distributed.
MASTER_PORT=2948

# Require exactly two positional arguments; usage goes to stderr.
if [ "$#" -ne 2 ]; then
  echo "Usage: bash train.sh <OUTPUT_DIR> <EXP_NAME>" >&2
  exit 1
fi
|
|
|
|
|
# Positional arguments: checkpoint destination and experiment name.
OUTPUT_DIR="$1"
EXP_NAME="$2"

# All per-experiment logs live under a fixed experiment directory;
# create it up front so the redirect at launch time cannot fail.
LOG_DIR="./refcoco_filter_exp"
mkdir -p "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/${EXP_NAME}.log"
|
|
|
|
|
# --- Training hyperparameters (consumed by train_rcc_sbert.py via --opts) ---

# Margin for the metric loss term — presumably a triplet/contrastive margin;
# TODO confirm against TRAIN.margin_value handling in the training code.
MARGIN=12

# Temperature for similarity scaling (passed as TRAIN.temperature).
TEMP=0.07

# Metric-loss mode string; name suggests hard-positive-only mining refined
# by SBERT similarity — NOTE(review): verify against the training code.
MODE=hardpos_only_sbertsim_refined

# Metric loss weight relative to the main objective (TRAIN.metric_loss_weight).
MLW=0.1

# Batch size passed as TRAIN.batch_size — per-GPU vs. global is decided by
# the training script, not here.
BATCH_SIZE=48

# Python-style booleans: parsed by the training config loader, not by bash.
MIXUP_FQ=False

USE_PROJECTIONS=False
|
|
|
|
|
|
|
|
|
|
|
# Announce the launch configuration on stdout (the training run itself is
# redirected to ${LOG_FILE} below).
printf 'Starting distributed training with %s GPUs on port %s...\n' "${GPUS}" "${MASTER_PORT}"
printf 'Experiment Name: %s, Output Dir: %s\n' "${EXP_NAME}" "${OUTPUT_DIR}"
printf 'Logging to: %s\n' "${LOG_FILE}"
|
|
|
|
|
# Reset environment modules and load the CUDA toolchain.
ml purge
ml load cuda/11.8

# Initialize conda for this non-interactive shell, then activate the env.
# FIX: an unchecked `conda activate` would silently continue with the
# wrong Python environment.
eval "$(conda shell.bash hook)"
conda activate risall || { echo "ERROR: cannot activate conda env 'risall'" >&2; exit 1; }

# All subsequent relative paths (config file, log dir) assume the repo root.
# FIX: an unchecked `cd` would launch training from the wrong directory.
cd /data2/projects/chaeyun/CGFormer/ || { echo "ERROR: cannot cd to CGFormer repo" >&2; exit 1; }
|
|
|
|
|
# --- Distributed / NCCL environment ---

# Disable GPU peer-to-peer transfers (commonly used to work around P2P
# hangs on some multi-GPU hosts).
export NCCL_P2P_DISABLE=1

# TF32 override for Ampere+ GPUs — NOTE(review): this variable is usually
# set to 0 to *disable* TF32; confirm that 1 here has the intended effect.
export NVIDIA_TF32_OVERRIDE=1

# Verbose NCCL logging to help debug collective-communication issues.
export NCCL_DEBUG=INFO

# NOTE(review): NCCL_TIMEOUT is not a documented NCCL env var; the watchdog
# timeout is normally configured via torch.distributed — verify intent.
export NCCL_TIMEOUT=7200

# InfiniBand retry count before NCCL gives up on a transport error.
export NCCL_IB_RETRY_CNT=15
|
|
|
|
|
# Launch the distributed training run; stdout+stderr go to the log file.
# FIX: user-supplied values (${EXP_NAME}, ${OUTPUT_DIR}) were unquoted —
# a name containing spaces would word-split and corrupt the --opts list.
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch in
# favor of torchrun; switching may require --use_env / argument changes in
# train_rcc_sbert.py, so it is deliberately left as-is.
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch \
    --nproc_per_node="${GPUS}" \
    --master_port="${MASTER_PORT}" \
    train_rcc_sbert.py \
    --config config/config_rcc_ace.yaml \
    --opts TRAIN.batch_size "${BATCH_SIZE}" \
           TRAIN.exp_name "${EXP_NAME}" \
           TRAIN.output_folder "${OUTPUT_DIR}" \
           TRAIN.metric_mode "${MODE}" \
           TRAIN.metric_loss_weight "${MLW}" \
           TRAIN.margin_value "${MARGIN}" \
           TRAIN.temperature "${TEMP}" \
           TRAIN.filter_threshold 0.52 \
           TRAIN.mixup_lasttwo "${MIXUP_FQ}" \
           TRAIN.use_projections "${USE_PROJECTIONS}" \
    > "${LOG_FILE}" 2>&1
|
|
|