#!/bin/bash
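
# Environment setup: reset the module environment, load CUDA 11.8, and activate the conda env.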
ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"
conda activate risall
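
# Move to the CGFormer project root.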
cd /data2/projects/chaeyun/CGFormer/ || exit 1
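
# NCCL / CUDA runtime settings for multi-GPU training.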
export NCCL_P2P_DISABLE=1
export NVIDIA_TF32_OVERRIDE=1
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15
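
# Require exactly two positional arguments: <OUTPUT_DIR> and <EXP_NAME>.
# Example invocation (hypothetical values): sbatch train.sh ./outputs refzom_sbert_run1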
if [ "$#" -ne 2 ]; then
    echo "Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>"
    exit 1
fi
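
# Distributed launch settings: number of GPUs per node and rendezvous port.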
GPUS=6
MASTER_PORT=5028
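
# Positional arguments.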
OUTPUT_DIR="$1"
EXP_NAME="$2"
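
# Metric-learning hyperparameters.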
MARGIN=12
TEMP=0.07
MODE=hardpos_only_sbertsim_refined
MLW=0.1
BATCH_SIZE=36
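
# Fusion and filtering options.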
MIXUP_FQ=False
FUSE_MODE=lang_tf_attn_wope
FILTER_THRES=0.68
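
# Report the run configuration before launching.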
echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"
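
# Launch one process per GPU; --opts supplies key/value pairs that (presumably) override entries in the YAML config.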
python -m torch.distributed.launch \
    --nproc_per_node=${GPUS} \
    --master_port=${MASTER_PORT} \
    train_refzom_sbert.py \
    --config config/config_refzom_ace.yaml \
    --opts TRAIN.batch_size ${BATCH_SIZE} \
           TRAIN.exp_name ${EXP_NAME} \
           TRAIN.output_folder ${OUTPUT_DIR} \
           TRAIN.metric_mode ${MODE} \
           TRAIN.metric_loss_weight ${MLW} \
           TRAIN.margin_value ${MARGIN} \
           TRAIN.temperature ${TEMP} \
           TRAIN.filter_threshold ${FILTER_THRES} \
           TRAIN.mixup_lasttwo ${MIXUP_FQ} \
           TRAIN.fuse_mode ${FUSE_MODE}