# NOTE(review): the lines below are web-page residue from the Hugging Face upload,
# commented out so the file parses as shell. They should be removed entirely:
# sbatch requires the #!/bin/bash shebang to be the very first line of the script.
# MRaCL / CGFormer / train_refzom.sh
# dianecy — "Upload folder using huggingface_hub" — commit ea1014e (verified)
#!/bin/bash
#SBATCH --job-name=sanity_check
#SBATCH --partition=a6000
#SBATCH --gres=gpu:4
#SBATCH --nodelist=node03
#SBATCH --time=13-11:30:00   # d-hh:mm:ss
#SBATCH --mem=80000          # CPU memory size
#SBATCH --cpus-per-task=12   # Number of CPU cores
#SBATCH --output=./filter_exp/refzom-sanity-vargatherv2.log

# Launch distributed RefZOM training on 4 GPUs via SLURM.
# Example submission:
#   sbatch train_refzom.sh exp_sanity/refzom repro48_vargather_v2node03

ml purge
ml load cuda/11.8

# Initialize conda for this non-interactive shell, then activate the env.
eval "$(conda shell.bash hook)"
conda activate risall

# Strict mode enabled only after conda activation: conda's activate scripts
# are known to reference unset variables and would trip `set -u`.
set -euo pipefail

# Abort explicitly if the project directory is missing — otherwise training
# would silently start from the wrong working directory.
cd /data2/projects/chaeyun/CGFormer/ || exit 1

# NCCL / CUDA runtime tuning for this cluster.
export NCCL_P2P_DISABLE=1      # disable GPU peer-to-peer transport
export NVIDIA_TF32_OVERRIDE=1  # allow TF32 math on Ampere GPUs
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=7200
export NCCL_IB_RETRY_CNT=15

if [[ "$#" -ne 2 ]]; then
  echo "Usage: sbatch train.sh <OUTPUT_DIR> <EXP_NAME>" >&2
  exit 1
fi

# --- to change per run ---
GPUS=4
MASTER_PORT=2938

# --- positional inputs ---
OUTPUT_DIR=$1
EXP_NAME=$2

# Hyperparameters.
# NOTE(review): only BATCH_SIZE is forwarded via --opts below; MARGIN, TEMP,
# MODE, MLW and MIXUP_FQ are declared but never passed to the training script —
# confirm whether they should also be appended to the --opts list.
MARGIN=12
TEMP=0.07
MODE=repro_original
MLW=0.1
BATCH_SIZE=48
MIXUP_FQ=True

echo "Starting distributed training with ${GPUS} GPUs on port ${MASTER_PORT}..."
echo "Experiment Name: ${EXP_NAME}, Output Dir: ${OUTPUT_DIR}"

# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch in
# favor of torchrun; kept as-is to match the pinned environment.
python -m torch.distributed.launch \
  --nproc_per_node="${GPUS}" \
  --master_port="${MASTER_PORT}" \
  train_refzom_repro_2.py \
  --config config/config_refzom_repro.yaml \
  --opts TRAIN.batch_size "${BATCH_SIZE}" \
         TRAIN.exp_name "${EXP_NAME}" \
         TRAIN.output_folder "${OUTPUT_DIR}"