dreilly committed on
Commit
6dcd634
·
verified ·
1 Parent(s): 1a601cf

Delete viscop_qwen2.5_7b_viscop-lora_egocentric-expert/train_viscop.sh

Browse files
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/train_viscop.sh DELETED
@@ -1,124 +0,0 @@
1
- #!/bin/bash
2
- # Environment Variables
3
- ARG_WORLD_SIZE=${1:-1}
4
- ARG_NPROC_PER_NODE=${2:-8}
5
-
6
- if [[ -v MASTER_ADDR_PASSED ]]; then
7
- ARG_MASTER_ADDR=$MASTER_ADDR_PASSED # passed via slurm submission script
8
- else
9
- ARG_MASTER_ADDR=127.0.0.1 # for dev environments
10
- fi
11
- ARG_MASTER_PORT=12355
12
- # ARG_RANK=$SLURM_NODEID
13
- ARG_RANK=0
14
-
15
- # If WORLD_SIZE or NPROC_PER_NODE is unset or empty, take both from the script arguments
16
- if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
17
- WORLD_SIZE=$ARG_WORLD_SIZE
18
- NPROC_PER_NODE=$ARG_NPROC_PER_NODE
19
- fi
20
-
21
- if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
22
- MASTER_ADDR=$ARG_MASTER_ADDR
23
- MASTER_PORT=$ARG_MASTER_PORT
24
- RANK=$ARG_RANK
25
- fi
26
-
27
- echo "MASTER_ADDR: $MASTER_ADDR. MASTER_PORT: $MASTER_PORT. RANK: $RANK"
28
- echo "WORLD_SIZE: $WORLD_SIZE"
29
- echo "NPROC_PER_NODE: $NPROC_PER_NODE"
30
-
31
- # Training Arguments
32
- GLOBAL_BATCH_SIZE=128 # aka effective batch size
33
- LOCAL_BATCH_SIZE=8 # batch size per GPU
34
- GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
35
- echo $GRADIENT_ACCUMULATION_STEPS
36
-
37
- INIT_MODEL=/home/dreilly1/Projects/VisCoP/base_vlm/videollama3-image_7b_local # path to base VLM (for VisCoP we use VideoLLaMA3 as the base VLM)
38
-
39
- NUM_DATA_WORKERS=8
40
- NUM_TRAIN_EPOCHS=3
41
- LORA_TRAINING=True
42
-
43
- # VisCoP Arguments
44
- NUM_VISUAL_PROBES=16
45
- INTERACTION_MODULE_POS=all
46
- PASS_PROBES_TO_LLM=True
47
- PASS_VIS_FEATURES_TO_LLM=True
48
-
49
- # Logging Arguments
50
- export WANDB_PROJECT=sony26_mm_viscop
51
- REPORT_TO=wandb
52
- OUTP_DIR=work_dirs/egoexo
53
- RUN_NAME=viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer
54
-
55
- # Data Arguments
56
- # DATA_DIR=/home/dreilly1/Projects/paired_egoexo_dl/paired_videos_DATA/
57
- DATA_DIR=/data/dreilly1/EgoExo4D_symlink/
58
- TRAINING_JSON="/home/dreilly1/Projects/viscop_sony/training_jsons/train-instr_viscop_egoview_ALLDATA.json"
59
-
60
- # if [[ $TRAINING_JSON == *"egoview"* ]]; then
61
- # MAX_FRAMES=40 # use 40 frames for training on ego
62
- # else
63
- # MAX_FRAMES=180
64
- # fi
65
- MAX_FRAMES=180
66
-
67
- # Optional Arguments. Set TESTING to 1 to quickly test the training script without logging or data workers, useful for debugging
68
- TESTING=0
69
- if [ $TESTING -eq 1 ]; then
70
- NUM_DATA_WORKERS=0
71
- REPORT_TO=none
72
- RUN_NAME=TESTING
73
- fi
74
-
75
- mkdir -p "${OUTP_DIR}/${RUN_NAME}/"
76
- cp "$0" "${OUTP_DIR}/${RUN_NAME}/"
77
-
78
- torchrun --nnodes $WORLD_SIZE \
79
- --nproc_per_node $NPROC_PER_NODE \
80
- --master_addr=$MASTER_ADDR \
81
- --master_port=$MASTER_PORT \
82
- --node_rank $RANK \
83
- viscop/train_viscop.py \
84
- --interaction_module_layers $INTERACTION_MODULE_POS \
85
- --lora_enable $LORA_TRAINING \
86
- --num_train_epochs $NUM_TRAIN_EPOCHS \
87
- --deepspeed scripts/zero2.json \
88
- --model_type viscop_qwen2 \
89
- --model_path $INIT_MODEL \
90
- --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \
91
- --mm_projector_type mlp2x_gelu \
92
- --data_path $TRAINING_JSON \
93
- --data_folder $DATA_DIR \
94
- --image_merge_size 2 \
95
- --video_merge_size 2 \
96
- --fps 1 \
97
- --max_frames $MAX_FRAMES \
98
- --model_max_length 16384 \
99
- --mm_max_length 10240 \
100
- --bf16 True \
101
- --tf32 True \
102
- --fp16 False \
103
- --output_dir ${OUTP_DIR}/${RUN_NAME} \
104
- --per_device_train_batch_size $LOCAL_BATCH_SIZE \
105
- --per_device_eval_batch_size 2 \
106
- --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
107
- --evaluation_strategy "no" \
108
- --save_strategy "no" \
109
- --save_steps 5000 \
110
- --save_total_limit 1 \
111
- --mm_projector_lr 1e-5 \
112
- --llm_lr 1e-5 \
113
- --weight_decay 0. \
114
- --warmup_ratio 0.03 \
115
- --lr_scheduler_type "cosine" \
116
- --logging_steps 1 \
117
- --gradient_checkpointing True \
118
- --dataloader_num_workers $NUM_DATA_WORKERS \
119
- --report_to $REPORT_TO \
120
- --run_name $RUN_NAME \
121
- --dataset_cache_dir /home/dreilly1/.cache/viscop_datasetcache \
122
- --include_visual_tokens $PASS_VIS_FEATURES_TO_LLM \
123
- --include_visual_probes $PASS_PROBES_TO_LLM \
124
- --num_visual_probes $NUM_VISUAL_PROBES