#!/usr/bin/env bash
# SLURM job: sanity-check run of CGFormer refzom reproduction training.
# Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>
#SBATCH --job-name=sanity_check
#SBATCH --partition=a6000
#SBATCH --gres=gpu:4
#SBATCH --nodelist=node03
#SBATCH --time=13-11:30:00                  # d-hh:mm:ss
#SBATCH --mem=80000                         # CPU memory size (MB)
#SBATCH --cpus-per-task=12                  # Number of CPU cores
#SBATCH --output=./filter_exp/refzom-sanity-vargatherv2.log

# Submission example (kept from original):
#   sbatch train_refzom.sh exp_sanity/refzom repro48_vargather_v2node03

set -euo pipefail

# Environment setup: clean module state, CUDA toolchain, conda env.
ml purge
ml load cuda/11.8
eval "$(conda shell.bash hook)"
conda activate risall

cd /data2/projects/chaeyun/CGFormer/ || exit 1

# NCCL / CUDA runtime tuning for multi-GPU training on this node.
export NCCL_P2P_DISABLE=1     # NOTE(review): P2P disabled — presumably flaky on node03; confirm
export NVIDIA_TF32_OVERRIDE=1 # allow TF32 math on Ampere GPUs
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15

if [[ "$#" -ne 2 ]]; then
  echo "Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>" >&2
  exit 1
fi

# --- tunables ---
GPUS=4
MASTER_PORT=2938

# --- positional inputs ---
OUTPUT_DIR=$1
EXP_NAME=$2

# Hyperparameters.
# NOTE(review): MARGIN, TEMP, MODE, MLW, and MIXUP_FQ are defined but never
# passed to the launch command below — confirm whether they belong in --opts
# or are consumed via the config file.
MARGIN=12
TEMP=0.07
MODE=repro_original
MLW=0.1
BATCH_SIZE=48
MIXUP_FQ=True

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"

# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch;
# migrate to torchrun once train_refzom_repro_2.py reads LOCAL_RANK from env.
python -m torch.distributed.launch \
  --nproc_per_node="${GPUS}" \
  --master_port="${MASTER_PORT}" \
  train_refzom_repro_2.py \
  --config config/config_refzom_repro.yaml \
  --opts TRAIN.batch_size "${BATCH_SIZE}" \
         TRAIN.exp_name "${EXP_NAME}" \
         TRAIN.output_folder "${OUTPUT_DIR}"