File size: 1,308 Bytes
ea1014e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/bin/bash
#
# Launches train_gref.py through torch.distributed.launch using
# config/config_gref_ace.yaml plus a fixed set of TRAIN.* overrides.
# stdout and stderr of the training command are written to
# ./bash_logs/<EXP_NAME>.log, resolved inside the project directory
# (the redirect is evaluated after the cd below).
#
# Usage: bash train.sh <OUTPUT_DIR> <EXP_NAME>
#   OUTPUT_DIR  value passed as TRAIN.output_folder
#   EXP_NAME    value passed as TRAIN.exp_name; also names the log file

GPUS=6
# Device list handed to CUDA_VISIBLE_DEVICES; keep its length equal to GPUS.
CUDA_DEVICES=0,1,2,3,4,5
MASTER_PORT=7031
PROJECT_DIR=/data2/projects/chaeyun/CGFormer/

if [ "$#" -ne 2 ]; then
    echo "Usage: bash train.sh <OUTPUT_DIR> <EXP_NAME>"
    exit 1
fi

OUTPUT_DIR=$1
EXP_NAME=$2

LOG_DIR="./bash_logs"
LOG_FILE="${LOG_DIR}/${EXP_NAME}.log"

# Values forwarded to the trainer as TRAIN.* overrides (see --opts below).
MARGIN=12
TEMP=0.07
MODE=hardpos_only_sbertsim_refined
MLW=0.1
BATCH_SIZE=30
MIXUP_FQ=False

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"
echo "Logging to: ${LOG_FILE}"

# Environment setup: reset modules, load CUDA, activate the conda env.
ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"
if ! conda activate ris_all; then
    # Without the env, training would silently run with the wrong interpreter.
    echo "Error: failed to activate conda environment 'ris_all'" >&2
    exit 1
fi

# Abort rather than launching from the wrong directory.
if ! cd "${PROJECT_DIR}"; then
    echo "Error: cannot cd to ${PROJECT_DIR}" >&2
    exit 1
fi

# LOG_DIR is relative, and the redirect at the end of this script is resolved
# against the current directory at that point (the project dir). Create the
# directory here, after the cd, so the log file's parent is guaranteed to exist.
if ! mkdir -p "${LOG_DIR}"; then
    echo "Error: cannot create log directory ${LOG_DIR}" >&2
    exit 1
fi

# NOTE(review): presumably TF32 / NCCL tuning knobs for this cluster — confirm
# the intended effect against the NVIDIA / NCCL documentation before changing.
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
export NCCL_IB_TIMEOUT=100
export NCCL_IB_RETRY_CNT=15

# Every expansion is quoted so values containing spaces or glob characters
# (notably the user-supplied EXP_NAME / OUTPUT_DIR) reach the trainer intact.
# This is the last command, so its exit status becomes the script's status.
CUDA_VISIBLE_DEVICES="${CUDA_DEVICES}" python -m torch.distributed.launch \
    --nproc_per_node="${GPUS}" \
    --master_port="${MASTER_PORT}" \
    train_gref.py \
    --config config/config_gref_ace.yaml \
    --opts TRAIN.batch_size "${BATCH_SIZE}" \
           TRAIN.exp_name "${EXP_NAME}" \
           TRAIN.output_folder "${OUTPUT_DIR}" \
           TRAIN.metric_mode "${MODE}" \
           TRAIN.metric_loss_weight "${MLW}" \
           TRAIN.margin_value "${MARGIN}" \
           TRAIN.temperature "${TEMP}" \
           TRAIN.mixup_lasttwo "${MIXUP_FQ}" \
    > "${LOG_FILE}" 2>&1