# plm_internvl_ola_code/scripts/test/finetune_ola_audio.sh
# Uploaded by jjw0126 ("Upload files", commit 84ff315, verified)
#!/bin/bash
# Environment knobs exported to every child process started by this script.
# Their exact interpretation is defined by the consuming code, which is not
# visible in this file.

# Resize specifications.
LOWRES_RESIZE='384x32'
VIDEO_RESIZE='0x32'
HIGHRES_BASE='0x32'
export LOWRES_RESIZE VIDEO_RESIZE HIGHRES_BASE

# Resolution bounds (image and video).
MAXRES=1536
MINRES=0
VIDEO_MAXRES=448
VIDEO_MINRES=288
export MAXRES MINRES VIDEO_MAXRES VIDEO_MINRES

# Boolean-style switches (1 = set).
PAD2STRIDE=1
FORCE_NO_DOWNSAMPLE=1
export PAD2STRIDE FORCE_NO_DOWNSAMPLE
# export LOAD_VISION_EARLY=1

# Prepend /path/to/Ola to Python's module search path, keeping any inherited
# value after it.
# NOTE(review): /path/to/Ola looks like an unfilled placeholder — confirm it
# should point at the real checkout. Also note that when PYTHONPATH is unset
# the result ends in ':' (an empty entry).
PYTHONPATH="/path/to/Ola:${PYTHONPATH}"
export PYTHONPATH
# Run configuration for this fine-tuning launch (plain shell variables, not
# exported; they are consumed by the torchrun command line below).
EXP_NAME="ola_audio_8_8gpu"
DATA='/data1/cxy/plm-v/modeling/data/audio_test.json'
CHECKPOINT='/data1/cxy/plm-v/modeling/plm_internvl3_5_ola'
PROJECT=/data1/cxy/plm-v/modeling/Ola
VISION_TOWER=null

# Rendezvous settings, hard-coded for a single local node. These plain
# assignments replace any value of the same name inherited from the caller.
nnode=1
nrank=0
MASTER_ADDR=localhost
MASTER_PORT=12324

# Log the rendezvous values only AFTER they are assigned, so the output shows
# what is actually handed to torchrun (one value per line: address, node
# count, node rank). Printing before the assignments would show empty or
# inherited values instead.
printf '%s\n' "$MASTER_ADDR" "$nnode" "$nrank"
# Launch the fine-tuning job through torchrun.
#
# Reads (set earlier in this script): nnode, nrank, MASTER_ADDR, MASTER_PORT,
# PROJECT, EXP_NAME, CHECKPOINT, VISION_TOWER, DATA.
# The script's exit status is torchrun's exit status (it is the last command).
#
# The argument lists are built as arrays so every expansion stays a single,
# quoted word and the groups can carry comments. The resulting argv is the
# same as the previous backslash-continued command: MASTER_PORT and PROJECT
# now replace the literals that duplicated their values.

# Launcher options: 8 processes on this node plus the rendezvous settings.
launcher_args=(
  --nproc_per_node 8
  --nnodes="${nnode}"
  --node_rank="${nrank}"
  --master_addr="${MASTER_ADDR}"
  --master_port="${MASTER_PORT}"
)

# Options forwarded verbatim to train.py. Their semantics are defined by the
# training code, which is not visible in this file.
train_args=(
  # Run identity, DeepSpeed config, model and data locations.
  --deepspeed "${PROJECT}/scripts/zero2.json"
  --run_name "${EXP_NAME}"
  --model_name_or_path "${CHECKPOINT}"
  --vision_tower "${VISION_TOWER}"
  --mm_projector_type ola_internvl
  --mm_vision_select_layer -1
  --mm_use_im_patch_token False
  --tune_speech_adapter True
  --version plm_v
  --data_path "${DATA}"
  --bf16 True
  --output_dir "/data1/cxy/plm-v/modeling/ckpt/${EXP_NAME}"
  --sample_independently True

  # Speech encoder / vision tower switches.
  --fix_speech_encoder True
  --freeze_mm_vision_tower True
  --speech_encoder_type "dual"
  --speech_encoder_hidden_size 2048
  --speech_encoder_ds_rate 10

  # Epochs, batch sizes and checkpoint cadence.
  --num_train_epochs 10
  --per_device_train_batch_size 1
  --per_device_eval_batch_size 1
  --gradient_accumulation_steps 1
  --save_strategy "steps"
  --save_steps 100
  --save_total_limit 100

  # Optimizer and learning-rate schedule.
  --learning_rate 1e-5
  --weight_decay 0.01
  --warmup_ratio 0.01
  --min_lr_ratio 0.1
  --max_grad_norm 5.0
  --lr_scheduler_type "cosine"

  # Logging, precision, data loading and sequence limits.
  --logging_steps 1
  --tf32 True
  --disable_tqdm False
  --dataloader_pin_memory False
  --model_max_length 16384
  --gradient_checkpointing True
  --dataloader_num_workers 8
  --frames_upbound 64
  --report_to none
)

torchrun "${launcher_args[@]}" \
  "${PROJECT}/ola/train/train.py" \
  "${train_args[@]}"