File size: 4,429 Bytes
405e866 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
#!/bin/bash
# SLURM batch script: LoRA fine-tuning run ("BadVLA" reproduction) of
# SpatialVLA on the libero_spatial suite. Submit with: sbatch <this script>.
# DEBUG=true sbatch <this script> shrinks the job to a 1-GPU smoke test.
#SBATCH --account=punim0619
#SBATCH --job-name=Badvla_SVLA_fir
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:2 # or more, up to 4
#SBATCH --mem=64G
#SBATCH --time=0-01:00:00
#SBATCH --partition=gpu-l40s # gpu-short is the debugging GPU
# NOTE(review): SLURM does not create the output directory; debug_Badvla/
# must exist before submission or the job fails to start — confirm.
#SBATCH --output=debug_Badvla/slurm-%j.out
############################################
# Environment & caches (from finetune.sh)
############################################
# Keep compiled-extension caches (torch extensions, Triton kernels) on the
# project filesystem rather than the (quota-limited) home directory.
export TORCH_EXTENSIONS_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
export TRITON_CACHE_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
# Load CUDA module — fail fast instead of crashing mid-training.
module load CUDA/12.4.1 || { echo "ERROR: failed to load CUDA/12.4.1" >&2; exit 1; }
# Workdir — abort if unreachable so relative paths below never silently
# resolve against the submission directory.
cd /data/gpfs/projects/punim0619/lijiayu/SpatialVLA || { echo "ERROR: cd to project dir failed" >&2; exit 1; }
############################################
# Training config (from finetune_lora.sh)
############################################
set -x
# Toggle quick debug mode: DEBUG=true shrinks the run to one GPU, a tiny
# shuffle buffer, and frequent checkpoints for fast iteration.
DEBUG=${DEBUG:-false}
if [ "$DEBUG" = true ]; then
  GPUS=1
  GPUS_PER_NODE=1
  PER_DEVICE_BATCH_SIZE=2
  shuffle_buffer_size=2
  mixture=bridge_orig
  NUM_WORKERS=0
  TORCH_RUN_ARGS="--standalone --nnodes=1"
  save_steps=50
fi
# Topology defaults, overridable from the environment. Values assigned in
# the DEBUG branch above stick, because ${VAR:-default} keeps existing values.
GPUS=${GPUS:-2}
GPUS_PER_NODE=${GPUS_PER_NODE:-2}
NODES=$((GPUS / GPUS_PER_NODE))
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}
BATCH_SIZE=${BATCH_SIZE:-$((GPUS * PER_DEVICE_BATCH_SIZE))}
# Integer division would yield 0 (invalid for --gradient_accumulation_steps)
# whenever BATCH_SIZE < PER_DEVICE_BATCH_SIZE * GPUS — clamp to at least 1.
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
if [ "$GRADIENT_ACC" -lt 1 ]; then
  GRADIENT_ACC=1
fi
# Dataset / benchmark selection.
suite=libero_spatial
# Use ${mixture:-…} so the DEBUG branch's mixture=bridge_orig (and any
# environment override) survives — a plain assignment here silently
# clobbered it, which defeated the debug dataset entirely.
mixture=${mixture:-${suite}_no_noops}
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/base_model/${suite}
# printf with a quoted expansion instead of bare 'echo $var' (SC2086).
printf '%s\n' "$mixture"
printf '%s\n' "$data_root_dir"
save_dir="Badvla_${suite}_fir"
NUM_WORKERS=${NUM_WORKERS:-1}
shuffle_buffer_size=${shuffle_buffer_size:-8192} # large buffer for better shuffling
# LoRA / training hyperparams — each overridable via the environment.
lr=${lr:-5e-4}
lora=${lora:-4}
lora_alpha=${lora_alpha:-32}
lora_target=${lora_target:-"badfir"}
epoch=${epoch:-50}
save_steps=${save_steps:-1000}
# NOTE(review): cur_time / date_dir are computed but never referenced in the
# visible script — possibly leftovers from a timestamped output-dir scheme.
cur_time=$(date "+%H-%M-%S")
date_dir=$(date "+%Y-%m-%d")
# NOTE(review): this :- fallback is dead code — model_name_or_path is assigned
# unconditionally earlier in the script, so the default can never fire.
model_name_or_path=${model_name_or_path:-/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/pretrained/models--IPEC-COMMUNITY--spatialvla-4b-224-pt}
# Resume into an existing run directory when resume_path is set in the
# environment; otherwise start a fresh run under Badvla_model/.
OUTPUT_DIR=${resume_path:-Badvla_model/${save_dir}}
mkdir -p "$OUTPUT_DIR"
# Helpful envs: make the repo importable and silence TensorFlow C++ logging.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export TF_CPP_MIN_LOG_LEVEL=3
# export LD_PRELOAD=../libtcmalloc.so.4.5.3 # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton # already set above
# Snapshot this launch script into the run directory for reproducibility.
launch_script=$(realpath "$0")
cp "$launch_script" "$OUTPUT_DIR"
# Torch launcher; TORCH_RUN_ARGS may already be set by the DEBUG branch,
# in which case this default is skipped.
export LAUNCHER="pytorch"
TORCH_RUN_ARGS=${TORCH_RUN_ARGS:-"--nnodes $NODES --nproc-per-node $GPUS_PER_NODE --master_port 29500"}
############################################
# Launch training
############################################
# TORCH_RUN_ARGS and ADAPT_ARGS are intentionally left unquoted: each may
# expand to zero or several separate launcher/flag words.
torchrun $TORCH_RUN_ARGS \
  train/reproduce_Badvla.py \
  --model_name_or_path "${model_name_or_path}" \
  ${ADAPT_ARGS} \
  --lora "${lora}" \
  --lora_alpha "${lora_alpha}" \
  --lora_target "${lora_target}" \
  --ignore_data_skip True \
  --data_root_dir "${data_root_dir}" \
  --data_mix "${mixture}" \
  --shuffle_buffer_size "${shuffle_buffer_size}" \
  --obs_backward_steps 0 \
  --obs_backward_delta 1 \
  --action_forward_steps 3 \
  --flash_attn True \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_output_dir False \
  --freeze_vision_tower False \
  --dataloader_num_workers "${NUM_WORKERS}" \
  --bf16 True \
  --tf32 True \
  --num_train_epochs "${epoch}" \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy steps \
  --save_steps "${save_steps}" \
  --save_total_limit 3 \
  --learning_rate "${lr}" \
  --weight_decay 0.0 \
  --warmup_ratio 0.005 \
  --lr_scheduler_type cosine \
  --logging_steps 500 \
  --do_train True \
  --grad_checkpoint True \
  --deepspeed scripts/zero1.json \
  --report_to tensorboard \
  --log_level warning
# (The trailing '\' that used to follow the last argument pointed at a
# commented-out line — removed so deleting the comment cannot break the
# command line.)
# --adpt_feature True
# python upload_huggingface.py \
#     --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \
#     --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
# python upload_huggingface.py \
#     --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \
#     --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
# rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR