File size: 1,531 Bytes
729c925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/bash
#SBATCH --job-name=rff_mevis_swint        # Job name
#SBATCH --partition=a6000                 # Partition name
#SBATCH --gres=gpu:4                      # Use 4 GPUs (must match --nproc_per_node below)
#SBATCH --time=14-04:30:00                # Time limit (14 days, 4 hours, 30 minutes)
#SBATCH --mem=80G                         # Memory allocation
#SBATCH --cpus-per-task=16                # Number of CPU cores per task
#SBATCH --output=logs/mevis_swin_tiny_trainlog.txt # Path to output log file

# Train ReferFormer (Swin-T) on the MeViS dataset under SLURM.
# Usage: sbatch <this_script> OUTPUT_DIR PRETRAINED_WEIGHTS [extra main.py args...]

# Fail fast: abort on command errors, unset variables, and failed pipeline
# stages (so a training crash is not masked by the trailing `tee`).
set -euo pipefail

# Load necessary modules
ml purge
ml load cuda/11.8

# Activate conda environment
eval "$(conda shell.bash hook)"
conda activate videonemo

cd /data2/projects/chaeyun/NeMo/ReferFormer/ || exit 1

# Arguments passed to the script — abort with a usage message if missing.
OUTPUT_DIR=${1:?usage: $0 OUTPUT_DIR PRETRAINED_WEIGHTS [extra args...]}
PRETRAINED_WEIGHTS=${2:?usage: $0 OUTPUT_DIR PRETRAINED_WEIGHTS [extra args...]}
PY_ARGS=("${@:3}")  # Capture additional arguments as an array (preserves per-word quoting)

# Log file path in the output directory
LOG_FILE=${OUTPUT_DIR}/training_log_$(date +"%Y%m%d_%H%M%S").log

echo "Loading pretrained weights from: ${PRETRAINED_WEIGHTS}"

# Ensure the output directory exists
mkdir -p -- "${OUTPUT_DIR}"

# Launch training using torch.distributed.launch with SLURM's GPU allocation.
# NOTE(review): torch.distributed.launch is deprecated in recent PyTorch;
# consider migrating to `torchrun` (same flags, --use_env implied).
PYTHONPATH="$(dirname "$0")/..":${PYTHONPATH:-} \
python3 -m torch.distributed.launch --nproc_per_node=4 --use_env \
main.py --with_box_refine --binary --freeze_text_encoder \
--epochs 6 --lr_drop 3 5 \
--dataset_file 'mevis' --batch_size 2 \
--output_dir="${OUTPUT_DIR}" --pretrained_weights="${PRETRAINED_WEIGHTS}" "${PY_ARGS[@]}" \
2>&1 | tee "${LOG_FILE}"  # Duplicate stdout and stderr into the log file

echo "Training completed. Output and logs saved to: ${OUTPUT_DIR}"