#!/bin/bash
# Training launcher for CGFormer (intended to be submitted via sbatch).
# Prepares the module environment, activates the conda env, and moves to
# the project root before the distributed training run below.
#
# Fix: the shebang must be the very first line of the file — the original
# had blank lines above it, so the interpreter line was ignored.

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Reset the module environment to a known-clean state, then load CUDA 11.8.
ml purge
ml load cuda/11.8

# Initialize conda for this non-interactive shell, then activate the env.
eval "$(conda shell.bash hook)"
conda activate ris_all

# Move to the project root; abort loudly if the path is missing
# (an unchecked cd would silently run training from the wrong directory).
cd /data2/projects/chaeyun/CGFormer/ || { echo "ERROR: cannot cd to project dir" >&2; exit 1; }
|
|
|
|
|
|
|
|
# --- Runtime environment tuning --------------------------------------
# Allow TF32 math on Ampere+ GPUs for faster matmuls.
export NVIDIA_TF32_OVERRIDE=1
# Verbose NCCL logging, plus more tolerant InfiniBand timeout/retry
# settings for flaky interconnects.
export NCCL_DEBUG=INFO
export NCCL_IB_TIMEOUT=100
export NCCL_IB_RETRY_CNT=15
|
|
|
|
|
# Require exactly two positional arguments: <OUTPUT_DIR> and <EXP_NAME>.
if [ "$#" -ne 2 ]; then
  # Fix: the usage message is a diagnostic and belongs on stderr, not
  # mixed into the job's stdout log.
  echo "Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>" >&2
  exit 1
fi
|
|
|
|
|
|
|
|
# --- Distributed-launch settings --------------------------------------
GPUS=2                                 # number of worker processes (one per GPU)
MASTER_PORT=7028                       # rendezvous port for torch.distributed

# --- Experiment arguments (from the command line) ---------------------
OUTPUT_DIR=$1                          # where checkpoints/logs are written
EXP_NAME=$2                            # experiment identifier

# --- Hyper-parameters forwarded to the training config ----------------
MARGIN=12                              # TRAIN.margin_value
TEMP=0.07                              # TRAIN.temperature
MODE=hardpos_only_sbertsim_refined     # TRAIN.metric_mode
MLW=0.1                                # TRAIN.metric_loss_weight
BATCH_SIZE=10                          # TRAIN.batch_size
MIXUP_FQ=True                          # TRAIN.mixup_lasttwo
|
|
|
|
|
echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"

# Launch one training process per GPU.
# Fix: all expansions are quoted so an OUTPUT_DIR or EXP_NAME containing
# spaces (or glob characters) is passed as a single argument instead of
# being word-split by the shell.
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch;
# consider migrating to `torchrun` once train_gref.py reads LOCAL_RANK
# from the environment — confirm before switching.
python -m torch.distributed.launch \
    --nproc_per_node="${GPUS}" \
    --master_port="${MASTER_PORT}" \
    train_gref.py \
    --config config/config_gref_ace.yaml \
    --opts TRAIN.batch_size "${BATCH_SIZE}" \
           TRAIN.exp_name "${EXP_NAME}" \
           TRAIN.output_folder "${OUTPUT_DIR}" \
           TRAIN.metric_mode "${MODE}" \
           TRAIN.metric_loss_weight "${MLW}" \
           TRAIN.margin_value "${MARGIN}" \
           TRAIN.temperature "${TEMP}" \
           TRAIN.mixup_lasttwo "${MIXUP_FQ}"