#!/bin/bash
#SBATCH --account=punim0619
#SBATCH --job-name=Badvla_SVLA_object_fir
#SBATCH --nodes=1
#SBATCH --ntasks=1
# GPU count: 2 here, or more, up to 4.
#SBATCH --gres=gpu:2
#SBATCH --mem=64G
#SBATCH --time=0-01:00:00
# gpu-short is the debugging GPU partition.
#SBATCH --partition=gpu-l40s
# NOTE(review): the debug_Badvla/ directory presumably has to exist before
# submission for the log file to be written — confirm against the cluster docs.
#SBATCH --output=debug_Badvla/slurm-%j.out

############################################
# Environment & caches (from finetune.sh)
############################################
# Single source of truth for the project checkout used by this section.
project_root=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA

# Both caches are pointed at the same directory inside the project tree.
export TORCH_EXTENSIONS_DIR="${project_root}/cache"
export TRITON_CACHE_DIR="${project_root}/cache"

# Load CUDA module. A failure is reported but not fatal, matching the
# original best-effort behaviour.
module load CUDA/12.4.1 \
  || printf 'warning: "module load CUDA/12.4.1" failed; continuing\n' >&2

# Workdir. Every relative path used later (train/, outputs/, scripts/) is
# resolved against this directory, so a failed cd must stop the job.
cd "${project_root}" || {
  printf 'error: cannot cd to %s\n' "${project_root}" >&2
  exit 1
}
############################################
# Training config (from finetune_lora.sh)
############################################
# Trace every command from here on.
set -x

# Toggle quick debug mode (DEBUG=true): one GPU, tiny batch and shuffle
# buffer, no dataloader workers, frequent checkpoints.
# The dataset mixture is NOT changed by debug mode: it is always derived
# from the suite below.
DEBUG=${DEBUG:-false}
if [[ "${DEBUG}" == true ]]; then
  GPUS=1
  GPUS_PER_NODE=1
  PER_DEVICE_BATCH_SIZE=2
  shuffle_buffer_size=2
  NUM_WORKERS=0
  TORCH_RUN_ARGS="--standalone --nnodes=1"
  save_steps=50
fi

# Parallelism; each value may be overridden from the environment.
GPUS=${GPUS:-2}
GPUS_PER_NODE=${GPUS_PER_NODE:-2}
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}

# The values below feed integer divisions; reject anything that is not a
# positive integer instead of silently computing 0 or dividing by zero.
for _cfg_name in GPUS GPUS_PER_NODE PER_DEVICE_BATCH_SIZE; do
  if ! [[ "${!_cfg_name}" =~ ^[1-9][0-9]*$ ]]; then
    printf 'error: %s must be a positive integer, got "%s"\n' \
      "${_cfg_name}" "${!_cfg_name}" >&2
    exit 1
  fi
done
unset _cfg_name

NODES=$((GPUS / GPUS_PER_NODE))
if ((NODES < 1)); then
  printf 'error: GPUS (%s) is smaller than GPUS_PER_NODE (%s)\n' \
    "${GPUS}" "${GPUS_PER_NODE}" >&2
  exit 1
fi
if ((GPUS % GPUS_PER_NODE != 0)); then
  printf 'warning: GPUS (%s) is not a multiple of GPUS_PER_NODE (%s)\n' \
    "${GPUS}" "${GPUS_PER_NODE}" >&2
fi

# Global batch size defaults to one micro-batch per GPU; gradient
# accumulation makes up the difference when a larger BATCH_SIZE is given.
BATCH_SIZE=${BATCH_SIZE:-$((GPUS * PER_DEVICE_BATCH_SIZE))}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
if ((GRADIENT_ACC < 1)); then
  printf 'error: BATCH_SIZE (%s) is smaller than GPUS * PER_DEVICE_BATCH_SIZE (%s)\n' \
    "${BATCH_SIZE}" "$((GPUS * PER_DEVICE_BATCH_SIZE))" >&2
  exit 1
fi
if ((BATCH_SIZE % (GPUS * PER_DEVICE_BATCH_SIZE) != 0)); then
  printf 'warning: BATCH_SIZE (%s) is not a multiple of GPUS * PER_DEVICE_BATCH_SIZE; effective batch is truncated\n' \
    "${BATCH_SIZE}" >&2
fi

# Dataset and base model. These are fixed by the suite name and are not
# overridable from the environment.
suite=libero_object
mixture=${suite}_no_noops
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/base_model/${suite}
printf '%s\n' "${mixture}"
printf '%s\n' "${data_root_dir}"

# From here on, suite holds only the second "_"-separated field of the
# mixture name (e.g. libero_object_no_noops -> object); it names the run dir.
IFS=_ read -r _ suite _ <<< "${mixture}"
save_dir="Badvla_${suite}_fir"

NUM_WORKERS=${NUM_WORKERS:-1}
# large buffer for better shuffling
shuffle_buffer_size=${shuffle_buffer_size:-8192}

# LoRA / training hyperparams
lr=${lr:-5e-4}
lora=${lora:-4}
lora_alpha=${lora_alpha:-32}
lora_target=${lora_target:-"badfir"}
epoch=${epoch:-50}
save_steps=${save_steps:-1000}

# Timestamps; not referenced by the visible part of this script.
cur_time=$(date "+%H-%M-%S")
date_dir=$(date "+%Y-%m-%d")

# resume_path, when set, replaces the default output directory.
OUTPUT_DIR=${resume_path:-outputs/${save_dir}}
# Create the run directory up front; stop if it is unset or cannot be made,
# since checkpoints and the script copy are written there.
: "${OUTPUT_DIR:?OUTPUT_DIR must be set before creating the output directory}"
mkdir -p -- "${OUTPUT_DIR}" || {
  printf 'error: cannot create output directory %s\n' "${OUTPUT_DIR}" >&2
  exit 1
}

# Helpful envs
# Append the working directory to PYTHONPATH without introducing an empty
# leading entry when PYTHONPATH was previously unset.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$(pwd)"
export TF_CPP_MIN_LOG_LEVEL=3
# export LD_PRELOAD=../libtcmalloc.so.4.5.3 # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton # already set above

# Keep a copy of this script in output (best effort).
# NOTE(review): under sbatch, $0 may resolve to Slurm's spooled copy of the
# script rather than the submitted file name — confirm on the cluster.
cp -- "$(realpath -- "$0")" "${OUTPUT_DIR}" \
  || printf 'warning: could not copy the script into %s\n' "${OUTPUT_DIR}" >&2

# Torch launcher
export LAUNCHER="pytorch"
# Rendezvous port; override MASTER_PORT when 29500 is already in use.
MASTER_PORT=${MASTER_PORT:-29500}
# A pre-set TORCH_RUN_ARGS (e.g. from debug mode) wins over these defaults.
TORCH_RUN_ARGS=${TORCH_RUN_ARGS:-"--nnodes ${NODES} --nproc-per-node ${GPUS_PER_NODE} --master_port ${MASTER_PORT}"}
############################################
# Launch training
############################################
# TORCH_RUN_ARGS and ADAPT_ARGS are whitespace-separated option strings;
# split them into arrays once so the command line below can be fully quoted.
# ADAPT_ARGS is not set anywhere in the visible script; when unset it
# contributes no arguments.
torch_run_args=()
adapt_args=()
read -r -a torch_run_args <<< "${TORCH_RUN_ARGS:-}"
read -r -a adapt_args <<< "${ADAPT_ARGS:-}"

# One array element per argument: no line-continuation backslashes, so a
# value can never be glued onto the following flag.
train_args=(
  --model_name_or_path "${model_name_or_path}"
  "${adapt_args[@]}"
  --lora "${lora}"
  --lora_alpha "${lora_alpha}"
  --lora_target "${lora_target}"
  --ignore_data_skip True
  --data_root_dir "${data_root_dir}"
  --data_mix "${mixture}"
  --shuffle_buffer_size "${shuffle_buffer_size}"
  --obs_backward_steps 0
  --obs_backward_delta 1
  --action_forward_steps 3
  --flash_attn True
  --output_dir "${OUTPUT_DIR}"
  --overwrite_output_dir False
  --freeze_vision_tower False
  --dataloader_num_workers "${NUM_WORKERS}"
  --bf16 True
  --tf32 True
  --num_train_epochs "${epoch}"
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}"
  --gradient_accumulation_steps "${GRADIENT_ACC}"
  --save_strategy steps
  --save_steps "${save_steps}"
  --save_total_limit 3
  --learning_rate "${lr}"
  --weight_decay 0.0
  --warmup_ratio 0.005
  --lr_scheduler_type cosine
  --logging_steps 500
  --do_train True
  --grad_checkpoint True
  --deepspeed scripts/zero1.json
  --report_to tensorboard
  --log_level warning
  # --adpt_feature True
)

# Kept as the last command so the script's exit status is torchrun's.
torchrun "${torch_run_args[@]}" \
  train/reproduce_Badvla.py \
  "${train_args[@]}"

# python upload_huggingface.py \
# --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \
# --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
# python upload_huggingface.py \
# --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \
# --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
# rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR