LEE181204 committed on
Commit
1b995c5
·
verified ·
1 Parent(s): 85e43bb

Upload slurm_script with huggingface_hub

Browse files
Files changed (1) hide show
  1. slurm_script +136 -0
slurm_script ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#SBATCH --account=punim0619
#SBATCH --job-name=Badvla_SVLA_spatial_sec
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:2 # or more, up to 4
#SBATCH --mem=64G
#SBATCH --time=0-05:00:00
#SBATCH --partition=gpu-l40s # gpu-short is the debugging GPU
#SBATCH --output=slurm-%j.out

# Fine-tune SpatialVLA with LoRA on a LIBERO suite under Slurm.
# Tunables (GPUS, PER_DEVICE_BATCH_SIZE, lr, lora*, DEBUG, ...) may be
# overridden from the environment, e.g.: DEBUG=true sbatch slurm_script

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

############################################
# Environment & caches (from finetune.sh)
############################################
export TORCH_EXTENSIONS_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache
export TRITON_CACHE_DIR=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/cache

# Load CUDA module
module load CUDA/12.4.1

# Workdir (set -e aborts the job if the project directory is missing)
cd /data/gpfs/projects/punim0619/lijiayu/SpatialVLA

############################################
# Training config (from finetune_lora.sh)
############################################
set -x

# Toggle quick debug mode: tiny single-GPU run on a small mixture.
DEBUG=${DEBUG:-false}
if [ "$DEBUG" = true ]; then
  GPUS=1
  GPUS_PER_NODE=1
  PER_DEVICE_BATCH_SIZE=2
  shuffle_buffer_size=2
  mixture=bridge_orig
  NUM_WORKERS=0
  TORCH_RUN_ARGS="--standalone --nnodes=1"
  save_steps=50
fi

GPUS=${GPUS:-2}
GPUS_PER_NODE=${GPUS_PER_NODE:-2}
NODES=$((GPUS / GPUS_PER_NODE))
PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4}
BATCH_SIZE=${BATCH_SIZE:-$((GPUS * PER_DEVICE_BATCH_SIZE))}
GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS))

suite=libero_spatial
# FIX: was an unconditional assignment that clobbered the debug-mode
# 'mixture=bridge_orig' set above; default only when not already chosen.
mixture=${mixture:-${suite}_no_noops}
data_root_dir=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/modified_libero_rlds
model_name_or_path=/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/Badvla_model/Badvla_${suite}_fir
save_dir="Badvla_${suite}_sec"

NUM_WORKERS=${NUM_WORKERS:-1}
shuffle_buffer_size=${shuffle_buffer_size:-8192} # large buffer for better shuffling

# LoRA / training hyperparams
lr=${lr:-5e-5}
lora=${lora:-8}
lora_alpha=${lora_alpha:-32}
lora_target=${lora_target:-"badsec"}
epoch=${epoch:-15}
save_steps=${save_steps:-30000}

cur_time=$(date "+%H-%M-%S")  # currently unused; kept for parity with finetune_lora.sh
date_dir=$(date "+%Y-%m-%d")  # currently unused
# NOTE(review): this fallback never fires — model_name_or_path is always set
# unconditionally above. Kept as a safety net for copy-paste reuse.
model_name_or_path=${model_name_or_path:-/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/pretrained/models--IPEC-COMMUNITY--spatialvla-4b-224-pt}
printf '%s\n' "$model_name_or_path"
OUTPUT_DIR=${resume_path:-outputs/${save_dir}}
mkdir -p "$OUTPUT_DIR"

# Helpful envs (":-" guard: PYTHONPATH may be unset under 'set -u')
export PYTHONPATH="${PYTHONPATH:-}:$(pwd)"
export TF_CPP_MIN_LOG_LEVEL=3
# export LD_PRELOAD=../libtcmalloc.so.4.5.3 # optional, for memory management
# export TRITON_CACHE_DIR=~/.triton # already set above

# Keep a copy of this script in the output dir
# (NOTE: under sbatch, $0 is the spooled copy of the script, not this path)
cp "$(realpath "$0")" "$OUTPUT_DIR"

# Torch launcher
export LAUNCHER="pytorch"
TORCH_RUN_ARGS=${TORCH_RUN_ARGS:-"--nnodes $NODES --nproc-per-node $GPUS_PER_NODE --master_port 29503"}

############################################
# Launch training
############################################
# shellcheck disable=SC2086 -- TORCH_RUN_ARGS / ADAPT_ARGS are intentionally
# word-split; ADAPT_ARGS is optional and guarded for 'set -u'.
torchrun $TORCH_RUN_ARGS \
  train/spatialvla_finetune.py \
  --model_name_or_path "${model_name_or_path}" \
  ${ADAPT_ARGS:-} \
  --lora "${lora}" \
  --lora_alpha "${lora_alpha}" \
  --lora_target "${lora_target}" \
  --ignore_data_skip True \
  --data_root_dir "${data_root_dir}" \
  --data_mix "${mixture}" \
  --shuffle_buffer_size "${shuffle_buffer_size}" \
  --obs_backward_steps 0 \
  --obs_backward_delta 1 \
  --action_forward_steps 3 \
  --flash_attn True \
  --output_dir "${OUTPUT_DIR}" \
  --overwrite_output_dir False \
  --freeze_vision_tower False \
  --dataloader_num_workers "${NUM_WORKERS}" \
  --bf16 True \
  --tf32 True \
  --num_train_epochs "${epoch}" \
  --per_device_train_batch_size "${PER_DEVICE_BATCH_SIZE}" \
  --gradient_accumulation_steps "${GRADIENT_ACC}" \
  --save_strategy steps \
  --save_steps "${save_steps}" \
  --save_total_limit 3 \
  --learning_rate "${lr}" \
  --weight_decay 0.0 \
  --warmup_ratio 0.005 \
  --lr_scheduler_type cosine \
  --logging_steps 500 \
  --do_train True \
  --grad_checkpoint True \
  --deepspeed scripts/zero1.json \
  --report_to tensorboard \
  --log_level warning
# FIX: the command above no longer ends in '\'; previously the trailing
# continuation joined the next comment line into the torchrun command.
# --adpt_feature True

# python upload_huggingface.py \
#     --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-50000" \
#     --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"

# python upload_huggingface.py \
#     --folder-path "/data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR/checkpoint-60000" \
#     --repo-name "LEE181204/${attack_type}_${poison_rate}_50000"

# rm -rf /data/gpfs/projects/punim0619/lijiayu/SpatialVLA/$OUTPUT_DIR