#!/bin/bash
#
# Launches the Ola training entry point (ola/train/train.py) with torchrun
# on a single node with 8 worker processes, using the DeepSpeed config at
# $PROJECT/scripts/zero2.json. Checkpoints are written under
# /data1/cxy/plm-v/modeling/ckpt/$EXP_NAME.
#
# Usage: bash <this script>   (takes no arguments)

set -euo pipefail

# --- Environment exported to the training processes -------------------------
# These are not read anywhere in this script; they are presumably consumed by
# the Ola training code. Their exact semantics are not visible from here.
export LOWRES_RESIZE=384x32
export VIDEO_RESIZE="0x32"
export HIGHRES_BASE="0x32"
export MAXRES=1536
export MINRES=0
export VIDEO_MAXRES=448
export VIDEO_MINRES=288
export PAD2STRIDE=1
export FORCE_NO_DOWNSAMPLE=1
# export LOAD_VISION_EARLY=1

# NOTE(review): "/path/to/Ola" looks like a placeholder; confirm whether it
# should point at the same checkout as $PROJECT below.
# ${PYTHONPATH:-} keeps the original expansion while staying safe under
# `set -u` when PYTHONPATH is not set in the calling environment.
export PYTHONPATH="/path/to/Ola:${PYTHONPATH:-}"

# --- Experiment configuration -----------------------------------------------
EXP_NAME="ola_audio_8_8gpu"
DATA='/data1/cxy/plm-v/modeling/data/audio_test.json'
CHECKPOINT='/data1/cxy/plm-v/modeling/plm_internvl3_5_ola'
PROJECT=/data1/cxy/plm-v/modeling/Ola
# Passed through verbatim as the literal string "null" to --vision_tower.
VISION_TOWER=null

# --- Rendezvous settings (single node) --------------------------------------
# Assigned unconditionally, so any values inherited from the environment are
# overridden.
nnode=1
nrank=0
MASTER_ADDR=localhost
MASTER_PORT=12324

# Log the values torchrun will actually receive. This runs after the
# assignments so the output reflects the effective configuration.
printf 'MASTER_ADDR=%s MASTER_PORT=%s nnode=%s nrank=%s\n' \
  "${MASTER_ADDR}" "${MASTER_PORT}" "${nnode}" "${nrank}"

# --- Launch -----------------------------------------------------------------
# --master_port takes $MASTER_PORT so the variable above is the single source
# of truth for the port.
torchrun --nproc_per_node 8 \
  --nnodes="${nnode}" \
  --node_rank="${nrank}" \
  --master_addr="${MASTER_ADDR}" \
  --master_port="${MASTER_PORT}" \
  "${PROJECT}/ola/train/train.py" \
  --deepspeed "${PROJECT}/scripts/zero2.json" \
  --run_name "${EXP_NAME}" \
  --model_name_or_path "${CHECKPOINT}" \
  --vision_tower "${VISION_TOWER}" \
  --mm_projector_type ola_internvl \
  --mm_vision_select_layer -1 \
  --mm_use_im_patch_token False \
  --tune_speech_adapter True \
  --version plm_v \
  --data_path "${DATA}" \
  --bf16 True \
  --output_dir "/data1/cxy/plm-v/modeling/ckpt/${EXP_NAME}" \
  --sample_independently True \
  --fix_speech_encoder True \
  --freeze_mm_vision_tower True \
  --speech_encoder_type "dual" \
  --speech_encoder_hidden_size 2048 \
  --speech_encoder_ds_rate 10 \
  --num_train_epochs 10 \
  --per_device_train_batch_size 1 \
  --per_device_eval_batch_size 1 \
  --gradient_accumulation_steps 1 \
  --save_strategy "steps" \
  --save_steps 100 \
  --save_total_limit 100 \
  --learning_rate 1e-5 \
  --weight_decay 0.01 \
  --warmup_ratio 0.01 \
  --min_lr_ratio 0.1 \
  --max_grad_norm 5.0 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --tf32 True \
  --disable_tqdm False \
  --dataloader_pin_memory False \
  --model_max_length 16384 \
  --gradient_checkpointing True \
  --dataloader_num_workers 8 \
  --frames_upbound 64 \
  --report_to none