# plm_internvl_ola_code / scripts /finetune_ola.sh
# jjw0126's picture
# Upload files
# 84ff315 verified
#!/bin/bash
#
# Launch Ola fine-tuning through torchrun (8 processes per node) with the
# DeepSpeed config at ./scripts/zero2.json.
#
# Inputs read from the environment:
#   nnode         number of nodes passed to --nnodes        (default: 1)
#   nrank         rank of this node passed to --node_rank   (default: 0 when
#                 nnode is 1, otherwise required)
#   MASTER_ADDR   address passed to --master_addr           (default: 127.0.0.1
#                 when nnode is 1, otherwise required)
#   VISION_TOWER  value passed to --vision_tower            (required; it is
#                 not assigned anywhere in this script)
#
# Every file argument below is relative (ola/train/train.py,
# ./scripts/zero2.json, ./pretrained/..., ./checkpoints/...), so the script
# must be started from the directory that contains those paths.

set -euo pipefail

# Settings exported for the training process. Their semantics are defined by
# the Ola code that reads them, not by this script.
export LOWRES_RESIZE=384x32
export VIDEO_RESIZE="0x32"
export HIGHRES_BASE="0x32"
export MAXRES=1536
export MINRES=0
export VIDEO_MAXRES=448
export VIDEO_MINRES=288
export PAD2STRIDE=1
export FORCE_NO_DOWNSAMPLE=1
export LOAD_VISION_EARLY=1
# ${PYTHONPATH:-} keeps the original "prepend" behaviour while staying valid
# under `set -u` when PYTHONPATH was not set beforehand.
export PYTHONPATH="/path/to/Ola:${PYTHONPATH:-}"

# Placeholders to edit before running.
EXP_NAME="ola_7b"
DATA='/path/to/data.json'
CHECKPOINT='/path/to/Ola_7b'

# Without this check an empty VISION_TOWER would expand to nothing and leave
# --vision_tower without a value.
: "${VISION_TOWER:?VISION_TOWER must be set to the value for --vision_tower}"

# Single-node runs get usable defaults; multi-node runs must state the node
# rank and rendezvous address explicitly instead of guessing them.
nnode="${nnode:-1}"
if [[ "${nnode}" == "1" ]]; then
  nrank="${nrank:-0}"
  MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
else
  : "${nrank:?nrank must be set when nnode is not 1}"
  : "${MASTER_ADDR:?MASTER_ADDR must be set when nnode is not 1}"
fi

# Same three-line report as before: master address, node count, node rank.
printf '%s\n' "${MASTER_ADDR}" "${nnode}" "${nrank}"

launch_args=(
  --nproc_per_node 8
  "--nnodes=${nnode}"
  "--node_rank=${nrank}"
  "--master_addr=${MASTER_ADDR}"
  --master_port=12324
)

train_args=(
  --deepspeed ./scripts/zero2.json
  --run_name "${EXP_NAME}"

  # Model, projectors and encoders
  --model_name_or_path "${CHECKPOINT}"
  --pretrain_speech_projector "${CHECKPOINT}/speech_projector.bin"
  --vision_tower "${VISION_TOWER}"
  --mm_projector_type ola_mlp
  --speech_projector_type "linear"
  --mm_vision_select_layer -1
  --mm_use_im_patch_token False
  --tune_speech_adapter False
  --version qwen_1_5
  --data_path "${DATA}"
  --bf16 True
  --output_dir "./checkpoints/${EXP_NAME}"
  --sample_independently True
  --fix_speech_encoder True
  --freeze_mm_vision_tower True
  --speech_encoder "./pretrained/large-v3.pt"
  --music_encoder "./pretrained/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt"
  --speech_encoder_type "dual"
  --speech_encoder_hidden_size 2048
  --speech_encoder_ds_rate 10

  # Batching, checkpointing and optimisation schedule
  --num_train_epochs 1
  --per_device_train_batch_size 2
  --per_device_eval_batch_size 1
  --gradient_accumulation_steps 1
  --evaluation_strategy "no"
  --save_strategy "steps"
  --save_steps 1000
  --save_total_limit 1
  --learning_rate 1e-5
  --weight_decay 0.0
  --warmup_ratio 0.05
  --min_lr_ratio 0.01
  --lr_scheduler_type "cosine"
  --logging_steps 1
  --tf32 True
  --model_max_length 16384
  --gradient_checkpointing True
  --dataloader_num_workers 8
  --frames_upbound 48
  --lazy_preprocess True
  --report_to none
)

torchrun "${launch_args[@]}" ola/train/train.py "${train_args[@]}"