LEE181204 committed on
Commit
405e866
·
verified ·
1 Parent(s): 91bf9a3

Upload slurm_script with huggingface_hub

Browse files
Files changed (1) hide show
  1. slurm_script +137 -0
slurm_script ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Slurm submission header.
# NOTE(review): trailing inline comments on #SBATCH lines are not reliably
# parsed by sbatch, so every option note now lives on its own comment line.
#SBATCH --account=punim0619
#SBATCH --job-name=Badvla_SVLA_fir
#SBATCH --nodes=1
#SBATCH --ntasks=1
# or more, up to 4
#SBATCH --gres=gpu:2
#SBATCH --mem=64G
#SBATCH --time=0-01:00:00
# gpu-short is the debugging GPU
#SBATCH --partition=gpu-l40s
# NOTE(review): debug_Badvla/ must already exist at submission time,
# otherwise Slurm cannot create the output file.
#SBATCH --output=debug_Badvla/slurm-%j.out

############################################
# Environment & caches (from finetune.sh)
############################################
export TORCH_EXTENSIONS_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
export TRITON_CACHE_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache

# Load CUDA module
module load CUDA/12.4.1

# Workdir — abort if the project directory is unavailable instead of
# silently running every relative path below from the wrong location.
cd /data/gpfs/projects/punim0619/lijiayu/SpatialVLA || exit 1
############################################
# Training config (from finetune_lora.sh)
############################################
set -x

# Toggle quick debug mode (run with DEBUG=true for a tiny single-GPU job).
DEBUG=${DEBUG:-false}
if [ "$DEBUG" = true ]; then
  GPUS=1
  GPUS_PER_NODE=1
  PER_DEVICE_BATCH_SIZE=2
  shuffle_buffer_size=2
  mixture=bridge_orig
  NUM_WORKERS=0
  TORCH_RUN_ARGS="--standalone --nnodes=1"
  save_steps=50
fi

# All of these honor an override from the environment (or the DEBUG block).
GPUS=${GPUS:-2}
GPUS_PER_NODE=${GPUS_PER_NODE:-2}
NODES=$((GPUS / GPUS_PER_NODE))
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1}
BATCH_SIZE=${BATCH_SIZE:-$((GPUS * PER_DEVICE_BATCH_SIZE))}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))

suite=libero_spatial
# BUGFIX: use a :- default here — the previous unconditional assignment
# clobbered the DEBUG override (mixture=bridge_orig) set above.
mixture=${mixture:-${suite}_no_noops}
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/base_model/${suite}
printf '%s\n' "$mixture" "$data_root_dir"
save_dir="Badvla_${suite}_fir"

NUM_WORKERS=${NUM_WORKERS:-1}
shuffle_buffer_size=${shuffle_buffer_size:-8192} # large buffer for better shuffling

# LoRA / training hyperparams (all overridable from the environment)
lr=${lr:-5e-4}
lora=${lora:-4}
lora_alpha=${lora_alpha:-32}
lora_target=${lora_target:-"badfir"}
epoch=${epoch:-50}
save_steps=${save_steps:-1000}

cur_time=$(date "+%H-%M-%S")
date_dir=$(date "+%Y-%m-%d")
# NOTE(review): removed the old
#   model_name_or_path=${model_name_or_path:-.../pretrained/...}
# fallback — it was dead code, since the variable is always assigned
# unconditionally a few lines above.
OUTPUT_DIR=${resume_path:-Badvla_model/${save_dir}}
mkdir -p "$OUTPUT_DIR"
# Helpful envs
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export TF_CPP_MIN_LOG_LEVEL=3
# export LD_PRELOAD=../libtcmalloc.so.4.5.3 # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton # already set above

# Keep a copy of this script in output (for reproducibility of the run)
cp "$(realpath "$0")" "$OUTPUT_DIR"

# Torch launcher
export LAUNCHER="pytorch"
TORCH_RUN_ARGS=${TORCH_RUN_ARGS:-"--nnodes $NODES --nproc-per-node $GPUS_PER_NODE --master_port 29500"}

############################################
# Launch training
############################################
# $TORCH_RUN_ARGS and ${ADAPT_ARGS} are intentionally left unquoted so they
# word-split into separate CLI flags. ADAPT_ARGS is empty unless exported by
# the caller (e.g. ADAPT_ARGS="--adpt_feature True").
# BUGFIX: the final argument no longer ends with a "\" continuation —
# previously it pulled the following comment line into the command, which
# only worked by accident of shell comment parsing.
torchrun $TORCH_RUN_ARGS \
  train/reproduce_Badvla.py \
  --model_name_or_path "${model_name_or_path}" \
  ${ADAPT_ARGS} \
  --lora "${lora}" \
  --lora_alpha "${lora_alpha}" \
  --lora_target "${lora_target}" \
  --ignore_data_skip True \
  --data_root_dir "${data_root_dir}" \
  --data_mix "${mixture}" \
  --shuffle_buffer_size "${shuffle_buffer_size}" \
  --obs_backward_steps 0 \
  --obs_backward_delta 1 \
  --action_forward_steps 3 \
  --flash_attn True \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_output_dir False \
  --freeze_vision_tower False \
  --dataloader_num_workers "${NUM_WORKERS}" \
  --bf16 True \
  --tf32 True \
  --num_train_epochs "${epoch}" \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy steps \
  --save_steps "${save_steps}" \
  --save_total_limit 3 \
  --learning_rate "${lr}" \
  --weight_decay 0.0 \
  --warmup_ratio 0.005 \
  --lr_scheduler_type cosine \
  --logging_steps 500 \
  --do_train True \
  --grad_checkpoint True \
  --deepspeed scripts/zero1.json \
  --report_to tensorboard \
  --log_level warning
# --adpt_feature True  (enable via ADAPT_ARGS instead of editing the command)
128
+
129
+ # python upload_huggingface.py \
130
+ # --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \
131
+ # --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
132
+
133
+ # python upload_huggingface.py \
134
+ # --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \
135
+ # --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"
136
+
137
+ # rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR