# BadSpa_object_fir / slurm_script — uploaded to huggingface_hub by LEE181204 (rev 8e9fa21, verified)
# NOTE(review): the four original header lines above the shebang were web-page
# residue, not shell; they are preserved here as comments. For direct execution
# the '#!/bin/bash' line below must be the FIRST line of the file — remove this
# header block before running the script outside of sbatch.
#!/bin/bash
#SBATCH --account=punim0619
#SBATCH --job-name=Badvla_SVLA_object_fir
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:2 # or more, up to 4
#SBATCH --mem=64G
#SBATCH --time=0-01:00:00
#SBATCH --partition=gpu-l40s # gpu-short is the debugging GPU
#SBATCH --output=debug_Badvla/slurm-%j.out
############################################
# Environment & caches (from finetune.sh)
############################################
# Shared JIT-compile caches so torch extension / triton builds persist across jobs.
export TORCH_EXTENSIONS_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
export TRITON_CACHE_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
# Load CUDA module
module load CUDA/12.4.1
# Workdir — abort if the project directory is unreachable; otherwise every
# relative path below (train script, outputs/, scripts/zero1.json) would
# silently resolve against the submission directory instead.
cd /data/gpfs/projects/punim0619/lijiayu/SpatialVLA || { echo "FATAL: cannot cd to project dir" >&2; exit 1; }
############################################
# Training config (from finetune_lora.sh)
############################################
set -x
# Quick-debug switch: shrink every knob so a smoke-test run finishes fast.
DEBUG=${DEBUG:-false}
if [[ "${DEBUG}" == true ]]; then
  GPUS=1
  GPUS_PER_NODE=1
  PER_DEVICE_BATCH_SIZE=2
  shuffle_buffer_size=2
  mixture=bridge_orig
  NUM_WORKERS=0
  TORCH_RUN_ARGS="--standalone --nnodes=1"
  save_steps=50
fi
# Fill in whatever the debug branch (or the caller's environment) left unset.
: "${GPUS:=2}"
: "${GPUS_PER_NODE:=2}"
NODES=$((GPUS / GPUS_PER_NODE))
: "${PER_DEVICE_BATCH_SIZE:=1}"
: "${BATCH_SIZE:=$((GPUS * PER_DEVICE_BATCH_SIZE))}"
# Effective batch = PER_DEVICE * GPUS * GRADIENT_ACC; solve for GRADIENT_ACC.
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))
# Dataset / model selection for the LIBERO "object" suite.
# NOTE(review): this unconditionally overwrites any mixture set by the DEBUG
# branch above (mixture=bridge_orig) — confirm that is intended.
suite=libero_object
mixture=${suite}_no_noops
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/base_model/${suite}
printf '%s\n' "$mixture"
printf '%s\n' "$data_root_dir"
# Re-derive the short suite name ("object") from the 2nd '_'-separated field.
suite=$(printf '%s\n' "$mixture" | awk -F'_' '{print $2}')
save_dir="Badvla_${suite}_fir"
NUM_WORKERS=${NUM_WORKERS:-1}
shuffle_buffer_size=${shuffle_buffer_size:-8192} # large buffer for better shuffling
# LoRA / training hyperparams (all overridable from the environment)
lr=${lr:-5e-4}
lora=${lora:-4}
lora_alpha=${lora_alpha:-32}
lora_target=${lora_target:-"badfir"}
epoch=${epoch:-50}
save_steps=${save_steps:-1000}
cur_time=$(date "+%H-%M-%S")
date_dir=$(date "+%Y-%m-%d")
# NOTE(review): dead fallback — model_name_or_path is always assigned above, so
# this default can never apply; kept only for parity with the original script.
model_name_or_path=${model_name_or_path:-/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/pretrained/models--IPEC-COMMUNITY--spatialvla-4b-224-pt}
# Resume into an existing run dir if resume_path is given, else a fresh one.
OUTPUT_DIR=${resume_path:-outputs/${save_dir}}
# Create the run directory before anything tries to write into it.
mkdir -p "${OUTPUT_DIR}"
# Helpful envs
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export TF_CPP_MIN_LOG_LEVEL=3
# export LD_PRELOAD=../libtcmalloc.so.4.5.3 # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton # already set above
# Archive a copy of this script alongside the run for reproducibility.
script_copy=$(realpath "$0")
cp "${script_copy}" "${OUTPUT_DIR}"
# Torch launcher
export LAUNCHER="pytorch"
: "${TORCH_RUN_ARGS:="--nnodes $NODES --nproc-per-node $GPUS_PER_NODE --master_port 29500"}"
############################################
# Launch training
############################################
# $TORCH_RUN_ARGS and ${ADAPT_ARGS} are deliberately UNQUOTED: each holds zero
# or more whitespace-separated flags that must word-split into separate argv
# entries (quoting ADAPT_ARGS would inject an empty argument when it is unset).
# Fixes vs. original: '"${lora_target}"\' and '${data_root_dir}\' had no space
# before the continuation backslash, fusing the next flag into the previous
# argument ('badfir--ignore_data_skip', '...rlds--data_mix'); the final
# argument also carried a dangling trailing '\' that continued the command
# into the commented-out line below it.
torchrun $TORCH_RUN_ARGS \
  train/reproduce_Badvla.py \
  --model_name_or_path "${model_name_or_path}" \
  ${ADAPT_ARGS} \
  --lora "${lora}" \
  --lora_alpha "${lora_alpha}" \
  --lora_target "${lora_target}" \
  --ignore_data_skip True \
  --data_root_dir "${data_root_dir}" \
  --data_mix "${mixture}" \
  --shuffle_buffer_size "${shuffle_buffer_size}" \
  --obs_backward_steps 0 \
  --obs_backward_delta 1 \
  --action_forward_steps 3 \
  --flash_attn True \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_output_dir False \
  --freeze_vision_tower False \
  --dataloader_num_workers "${NUM_WORKERS}" \
  --bf16 True \
  --tf32 True \
  --num_train_epochs "${epoch}" \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy steps \
  --save_steps "${save_steps}" \
  --save_total_limit 3 \
  --learning_rate "${lr}" \
  --weight_decay 0.0 \
  --warmup_ratio 0.005 \
  --lr_scheduler_type cosine \
  --logging_steps 500 \
  --do_train True \
  --grad_checkpoint True \
  --deepspeed scripts/zero1.json \
  --report_to tensorboard \
  --log_level warning
# --adpt_feature True
# python upload_huggingface.py \
# --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \
# --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
# python upload_huggingface.py \
# --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \
# --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
# rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR