File size: 4,429 Bytes
405e866
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
#SBATCH --account=punim0619
#SBATCH --job-name=Badvla_SVLA_fir
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:2              # or more, up to 4
#SBATCH --mem=64G
#SBATCH --time=0-01:00:00
#SBATCH --partition=gpu-l40s     # gpu-short is the debugging GPU
#SBATCH --output=debug_Badvla/slurm-%j.out
# NOTE(review): Slurm does not create the --output directory; confirm
# debug_Badvla/ exists before submission or the job output is lost.

############################################
# Environment & caches (from finetune.sh)
############################################
# Point torch-extension / Triton JIT caches at project storage so compute
# nodes don't write build artifacts into the (quota-limited) home dir.
export TORCH_EXTENSIONS_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
export TRITON_CACHE_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache

# Load CUDA module; fail fast here rather than later with an opaque
# torch/CUDA import error.
module load CUDA/12.4.1 || { echo "ERROR: failed to load CUDA/12.4.1" >&2; exit 1; }

# Workdir — abort if the project directory is missing so nothing below
# ever runs against the wrong working directory (the original unchecked
# `cd` would silently continue in the submit dir on failure).
cd /data/gpfs/projects/punim0619/lijiayu/SpatialVLA || exit 1

############################################
# Training config (from finetune_lora.sh)
############################################
set -x

# Quick-debug switch: run with DEBUG=true to shrink the job to a single
# GPU, a tiny shuffle buffer, and frequent checkpoints so failures
# surface within minutes instead of hours.
DEBUG=${DEBUG:-false}
case "$DEBUG" in
  true)
    GPUS=1
    GPUS_PER_NODE=1
    PER_DEVICE_BATCH_SIZE=2
    shuffle_buffer_size=2
    mixture=bridge_orig
    NUM_WORKERS=0
    TORCH_RUN_ARGS="--standalone --nnodes=1"
    save_steps=50
    ;;
esac

# GPU topology and effective batch geometry. Every value is
# environment-overridable; `:=` assigns the default only when the
# variable is unset or empty (so DEBUG-mode values above are respected).
: "${GPUS:=2}"
: "${GPUS_PER_NODE:=2}"
NODES=$((GPUS / GPUS_PER_NODE))
: "${PER_DEVICE_BATCH_SIZE:=1}"
: "${BATCH_SIZE:=$((GPUS * PER_DEVICE_BATCH_SIZE))}"
# Accumulation steps required so the global batch equals BATCH_SIZE.
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))

suite=libero_spatial
# Default the data mixture from the suite, but respect a value set earlier
# — the DEBUG block sets mixture=bridge_orig, and the previous
# unconditional assignment silently clobbered that override (every other
# debug variable is honored via :- defaults; mixture was the exception).
mixture=${mixture:-${suite}_no_noops}
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/base_model/${suite}
echo "$mixture"
echo "$data_root_dir"
save_dir="Badvla_${suite}_fir"

NUM_WORKERS=${NUM_WORKERS:-1}
shuffle_buffer_size=${shuffle_buffer_size:-8192}        # large buffer for better shuffling

# LoRA / training hyperparams
# LoRA / training hyperparameters. All env-overridable; `:=` fills in the
# default only when the variable is unset or empty.
: "${lr:=5e-4}"
: "${lora:=4}"
: "${lora_alpha:=32}"
: "${lora_target:=badfir}"
: "${epoch:=50}"
: "${save_steps:=1000}"

# Run-time stamps for bookkeeping.
# NOTE(review): cur_time/date_dir are not referenced later in this script —
# confirm whether they are still needed.
cur_time=$(date "+%H-%M-%S")
date_dir=$(date "+%Y-%m-%d")

# Fallback to the pretrained checkpoint when no model path was provided.
# NOTE(review): model_name_or_path is assigned unconditionally earlier, so
# this default never fires as written — confirm intent.
: "${model_name_or_path:=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/pretrained/models--IPEC-COMMUNITY--spatialvla-4b-224-pt}"
OUTPUT_DIR=${resume_path:-Badvla_model/${save_dir}}
mkdir -p "$OUTPUT_DIR"

# Helpful runtime environment tweaks.
export TF_CPP_MIN_LOG_LEVEL=3                    # silence TensorFlow C++ logging
export PYTHONPATH="${PYTHONPATH}:$(pwd)"         # make in-repo packages importable
# export LD_PRELOAD=../libtcmalloc.so.4.5.3       # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton               # already set above

# Archive the launching script alongside the run outputs for reproducibility.
launch_script=$(realpath "$0")
cp "$launch_script" "$OUTPUT_DIR"

# Torch distributed launcher configuration.
export LAUNCHER="pytorch"
TORCH_RUN_ARGS=${TORCH_RUN_ARGS:-"--nnodes $NODES --nproc-per-node $GPUS_PER_NODE --master_port 29500"}

############################################
# Launch training
############################################
# Launch fine-tuning. $TORCH_RUN_ARGS and $ADAPT_ARGS are intentionally
# left unquoted so they word-split into separate arguments (ADAPT_ARGS may
# be empty); all path/value expansions are quoted. The original ended the
# command with a dangling `\` that continued onto the commented-out flag
# below — it only worked because a word-initial `#` starts a comment, and
# would break on the next edit; the command now ends cleanly.
torchrun $TORCH_RUN_ARGS \
  train/reproduce_Badvla.py \
  --model_name_or_path "${model_name_or_path}" \
  ${ADAPT_ARGS} \
  --lora "${lora}" \
  --lora_alpha "${lora_alpha}" \
  --lora_target "${lora_target}" \
  --ignore_data_skip True \
  --data_root_dir "${data_root_dir}" \
  --data_mix "${mixture}" \
  --shuffle_buffer_size "${shuffle_buffer_size}" \
  --obs_backward_steps 0 \
  --obs_backward_delta 1 \
  --action_forward_steps 3 \
  --flash_attn True \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_output_dir False \
  --freeze_vision_tower False \
  --dataloader_num_workers "${NUM_WORKERS}" \
  --bf16 True \
  --tf32 True \
  --num_train_epochs "${epoch}" \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy steps \
  --save_steps "${save_steps}" \
  --save_total_limit 3 \
  --learning_rate "${lr}" \
  --weight_decay 0.0 \
  --warmup_ratio 0.005 \
  --lr_scheduler_type cosine \
  --logging_steps 500 \
  --do_train True \
  --grad_checkpoint True \
  --deepspeed scripts/zero1.json \
  --report_to tensorboard \
  --log_level warning
# Optional flag — append it to the command above (with a trailing `\` on
# the preceding line) when needed:
#   --adpt_feature True

# python upload_huggingface.py \
#   --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \
#   --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"

# python upload_huggingface.py \
#   --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \
#   --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"

# rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR