#!/bin/bash
#
# Launch ola/train/train.py through torchrun (8 processes per node), passing
# ./scripts/zero2.json as the --deepspeed configuration.
#
# Before running:
#   * Replace the '/path/to/...' placeholders below (PYTHONPATH entry, DATA,
#     CHECKPOINT) with real locations.
#   * Provide these variables from the environment; this script never assigns
#     them itself:
#       MASTER_ADDR   -> torchrun --master_addr
#       nnode         -> torchrun --nnodes
#       nrank         -> torchrun --node_rank
#       VISION_TOWER  -> train.py --vision_tower
#
# ola/train/train.py, ./scripts/zero2.json, ./pretrained/* and ./checkpoints/*
# are relative paths, so they are resolved against the current working
# directory at launch time.

set -euo pipefail

# ---------------------------------------------------------------------------
# Settings exported to the training process. This script only sets them; how
# they are interpreted is defined by the training code, not here.
# NOTE(review): presumably image/video resolution knobs -- confirm against the
# Ola code before changing any value.
# ---------------------------------------------------------------------------
export LOWRES_RESIZE=384x32
export VIDEO_RESIZE="0x32"
export HIGHRES_BASE="0x32"
export MAXRES=1536
export MINRES=0
export VIDEO_MAXRES=448
export VIDEO_MINRES=288
export PAD2STRIDE=1
export FORCE_NO_DOWNSAMPLE=1
export LOAD_VISION_EARLY=1

# Prepend the Ola checkout to the Python import path. "${PYTHONPATH-}" expands
# to the empty string when PYTHONPATH is unset, which yields exactly the value
# the unguarded "$PYTHONPATH" form produced while staying legal under `set -u`.
export PYTHONPATH="/path/to/Ola:${PYTHONPATH-}"

# ---------------------------------------------------------------------------
# Run configuration (edit these placeholders).
# ---------------------------------------------------------------------------
EXP_NAME="ola_7b"             # used for --run_name and ./checkpoints/<EXP_NAME>
DATA='/path/to/data.json'     # passed as --data_path
CHECKPOINT='/path/to/Ola_7b'  # --model_name_or_path; also holds speech_projector.bin

# ---------------------------------------------------------------------------
# Required environment. Abort with a clear message instead of handing torchrun
# or train.py an empty or missing value: an unquoted empty expansion would
# silently drop the argument and shift the following flag into its place.
# ---------------------------------------------------------------------------
: "${MASTER_ADDR:?MASTER_ADDR must be set (used for torchrun --master_addr)}"
: "${nnode:?nnode must be set (used for torchrun --nnodes)}"
: "${nrank:?nrank must be set (used for torchrun --node_rank)}"
: "${VISION_TOWER:?VISION_TOWER must be set (used for --vision_tower)}"

# Log the rendezvous settings, one value per line.
printf '%s\n' "$MASTER_ADDR" "$nnode" "$nrank"

# Arguments consumed by torchrun itself.
launch_args=(
  --nproc_per_node 8
  --nnodes="$nnode"
  --node_rank="$nrank"
  --master_addr="$MASTER_ADDR"
  --master_port=12324
)

# Arguments forwarded to ola/train/train.py. Order and values are unchanged;
# only the quoting of variable expansions is new.
train_args=(
  --deepspeed ./scripts/zero2.json
  --run_name "$EXP_NAME"
  --model_name_or_path "$CHECKPOINT"
  --pretrain_speech_projector "$CHECKPOINT/speech_projector.bin"
  --vision_tower "$VISION_TOWER"
  --mm_projector_type ola_mlp
  --speech_projector_type "linear"
  --mm_vision_select_layer -1
  --mm_use_im_patch_token False
  --tune_speech_adapter False
  --version qwen_1_5
  --data_path "$DATA"
  --bf16 True
  --output_dir "./checkpoints/$EXP_NAME"
  --sample_independently True
  --fix_speech_encoder True
  --freeze_mm_vision_tower True
  --speech_encoder "./pretrained/large-v3.pt"
  --music_encoder "./pretrained/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt"
  --speech_encoder_type "dual"
  --speech_encoder_hidden_size 2048
  --speech_encoder_ds_rate 10
  --num_train_epochs 1
  --per_device_train_batch_size 2
  --per_device_eval_batch_size 1
  --gradient_accumulation_steps 1
  --evaluation_strategy "no"
  --save_strategy "steps"
  --save_steps 1000
  --save_total_limit 1
  --learning_rate 1e-5
  --weight_decay 0.0
  --warmup_ratio 0.05
  --min_lr_ratio 0.01
  --lr_scheduler_type "cosine"
  --logging_steps 1
  --tf32 True
  --model_max_length 16384
  --gradient_checkpointing True
  --dataloader_num_workers 8
  --frames_upbound 48
  --lazy_preprocess True
  --report_to none
)

# torchrun is the last command, so its exit status is the script's exit status.
torchrun "${launch_args[@]}" ola/train/train.py "${train_args[@]}"