| #!/bin/bash |
| #SBATCH --account=punim0619 |
| #SBATCH --job-name=Badvla_SVLA_object_fir |
| #SBATCH --nodes=1 |
| #SBATCH --ntasks=1 |
| #SBATCH --gres=gpu:2 # or more, up to 4 |
| #SBATCH --mem=64G |
| #SBATCH --time=0-01:00:00 |
| #SBATCH --partition=gpu-l40s # gpu-short is the debugging GPU |
| #SBATCH --output=debug_Badvla/slurm-%j.out |
|
|
############################################
# Environment & caches (from finetune.sh)
############################################
# Keep compiled-extension caches on project storage (shared /data), not $HOME.
export TORCH_EXTENSIONS_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
export TRITON_CACHE_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache


# Load CUDA module
module load CUDA/12.4.1


# Workdir — abort if the project directory is unavailable, so the job does
# not silently run (and write outputs) from the submission directory.
cd /data/gpfs/projects/punim0619/lijiayu/SpatialVLA || exit 1
|
|
############################################
# Training config (from finetune_lora.sh)
############################################
set -x


# Quick debug mode: tiny single-GPU run with a small shuffle buffer and
# frequent checkpoints, for fast iteration. Enable with DEBUG=true.
DEBUG=${DEBUG:-false}
if [[ "$DEBUG" == "true" ]]; then
  GPUS=1
  GPUS_PER_NODE=1
  PER_DEVICE_BATCH_SIZE=2
  shuffle_buffer_size=2
  mixture=bridge_orig
  NUM_WORKERS=0
  TORCH_RUN_ARGS="--standalone --nnodes=1"
  save_steps=50
fi


# Production defaults — only applied to values DEBUG mode did not pin.
: "${GPUS:=2}"
: "${GPUS_PER_NODE:=2}"
NODES=$((GPUS / GPUS_PER_NODE))
: "${PER_DEVICE_BATCH_SIZE:=1}"
# Global batch defaults to one micro-batch per GPU; GRADIENT_ACC is then
# whatever accumulation is needed to reach it (1 with the defaults).
: "${BATCH_SIZE:=$((GPUS * PER_DEVICE_BATCH_SIZE))}"
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
|
|
suite=libero_object
# Honor a mixture chosen earlier (DEBUG mode sets mixture=bridge_orig);
# otherwise default to the no-noops variant of the suite. The previous
# unconditional assignment silently clobbered the DEBUG override.
mixture=${mixture:-${suite}_no_noops}
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/base_model/${suite}
echo "$mixture"
echo "$data_root_dir"
# Re-derive the short suite name from the mixture's second '_'-separated
# field (libero_object_no_noops -> object) for the output-dir name.
suite=$(echo "$mixture" | awk -F'_' '{print $2}')
save_dir="Badvla_${suite}_fir"
|
|
# Dataloader settings; defaults apply only when not pinned earlier (DEBUG).
: "${NUM_WORKERS:=1}"
: "${shuffle_buffer_size:=8192}"  # large buffer for better shuffling


# LoRA / training hyperparams — every one overridable from the environment.
: "${lr:=5e-4}"
: "${lora:=4}"
: "${lora_alpha:=32}"
: "${lora_target:=badfir}"
: "${epoch:=50}"
: "${save_steps:=1000}"


# Timestamps for run bookkeeping.
cur_time=$(date "+%H-%M-%S")
date_dir=$(date "+%Y-%m-%d")
# Fall back to the pretrained SpatialVLA checkpoint if no model was chosen.
model_name_or_path=${model_name_or_path:-/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/pretrained/models--IPEC-COMMUNITY--spatialvla-4b-224-pt}
# Resuming a run reuses its directory; otherwise outputs/<save_dir>.
OUTPUT_DIR=${resume_path:-outputs/${save_dir}}
mkdir -p "$OUTPUT_DIR"
|
|
# Helpful envs
export PYTHONPATH="${PYTHONPATH}:$(pwd)"   # make repo modules importable
export TF_CPP_MIN_LOG_LEVEL=3              # silence TF C++ log spam
# export LD_PRELOAD=../libtcmalloc.so.4.5.3 # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton # already set above


# Archive this launch script next to the run outputs for reproducibility.
# NOTE(review): under sbatch, $0 is the spooled copy of the script — confirm.
script_path=$(realpath "$0")
cp "$script_path" "$OUTPUT_DIR"


# Torch launcher
export LAUNCHER="pytorch"
if [ -z "${TORCH_RUN_ARGS:-}" ]; then
  TORCH_RUN_ARGS="--nnodes $NODES --nproc-per-node $GPUS_PER_NODE --master_port 29500"
fi
|
|
############################################
# Launch training
############################################
# TORCH_RUN_ARGS and ADAPT_ARGS are deliberately left unquoted so each
# whitespace-separated flag expands to its own argument (empty -> nothing).
# Fixes vs. previous version: quoted path expansions, restored the missing
# space before '\' after --lora_target / --data_root_dir, and removed the
# trailing '\' on the last argument line, which used to splice the
# commented-out flag line into the command.
torchrun $TORCH_RUN_ARGS \
  train/reproduce_Badvla.py \
  --model_name_or_path "${model_name_or_path}" \
  ${ADAPT_ARGS} \
  --lora "${lora}" \
  --lora_alpha "${lora_alpha}" \
  --lora_target "${lora_target}" \
  --ignore_data_skip True \
  --data_root_dir "${data_root_dir}" \
  --data_mix "${mixture}" \
  --shuffle_buffer_size "${shuffle_buffer_size}" \
  --obs_backward_steps 0 \
  --obs_backward_delta 1 \
  --action_forward_steps 3 \
  --flash_attn True \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_output_dir False \
  --freeze_vision_tower False \
  --dataloader_num_workers "${NUM_WORKERS}" \
  --bf16 True \
  --tf32 True \
  --num_train_epochs "${epoch}" \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy steps \
  --save_steps "${save_steps}" \
  --save_total_limit 3 \
  --learning_rate "${lr}" \
  --weight_decay 0.0 \
  --warmup_ratio 0.005 \
  --lr_scheduler_type cosine \
  --logging_steps 500 \
  --do_train True \
  --grad_checkpoint True \
  --deepspeed scripts/zero1.json \
  --report_to tensorboard \
  --log_level warning
  # To enable: add '--adpt_feature True \' above (before --log_level).
|
|
| # python upload_huggingface.py \ |
| # --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \ |
| # --repo-name "LEE181204/${attack_type}_${poison_rate}_50000" |
|
|
| # python upload_huggingface.py \ |
| # --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \ |
# --repo-name "LEE181204/${attack_type}_${poison_rate}_60000"
|
|
| # rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR |