#!/usr/bin/env bash
# 8-GPU training script for video diffusion model.
# Usage: bash train/train_video.sh [extra train_mei_video.py options...]
#
# Run from the repository root: the training entry point and --output_dir are
# given as relative paths. Any extra command-line arguments are appended after
# the options below and passed through to train/train_mei_video.py
# (presumably a later duplicate of an option wins -- confirm against the
# script's argument parser).
set -euo pipefail

# Fail early with a readable message instead of a bare "command not found".
command -v accelerate >/dev/null || {
  printf 'error: "accelerate" was not found on PATH\n' >&2
  exit 127
}

# Options consumed by `accelerate launch` itself.
# --num_processes must stay in sync with the number of ids in --gpu_ids.
launch_args=(
  --multi_gpu
  --gpu_ids '0,1,2,3,4,5,6,7'
  --main_process_port 25011
  --num_processes 8
)

# Options passed to train/train_mei_video.py (order kept from the original
# command line).
train_args=(
  --use_precomputed_features
  --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128_with_mask_attn_debug
  --text_encoder_architecture umt5-xxl
  --wan_pretrained_path /mnt/Wan2.1-T2V-1.3B
  --training_from_scratch True
  --pretrained_model_name_or_path "dummy"
  --wan_backbone_lr_ratio 0.2
  --num_frames 17
  --video_height 128
  --video_width 128
  --dataloader_num_workers 8
  --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8"
  --instance_dataset OpenVid1MDataset
  --instance_data_dir "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv"
  --train_batch_size 8
  --gradient_accumulation_steps 4
  --learning_rate 3e-4
  --max_train_steps 10000
  --checkpointing_steps 500
  --validation_steps 500
  --logging_steps 10
  # Two separate prompts, each passed as its own argument.
  --validation_prompts "a cat playing" "a girl walking"
  # Keep this quoted: the directory name contains '*', which the shell would
  # otherwise try to glob-expand.
  --output_dir "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio"
  --mixed_precision bf16
  --lr_scheduler constant
  --lr_warmup_steps 0
  --use_8bit_adam
  --gradient_checkpointing
  --min_masking_rate 0.0
  --cond_dropout_prob 0.1
  --split_vae_encode 1
  --allow_tf32
  --seed 42
  --report_to wandb
)

accelerate launch "${launch_args[@]}" \
  train/train_mei_video.py \
  "${train_args[@]}" \
  "$@"

# ---------------------------------------------------------------------------
# Archived configuration (kept for reference, not executed).
#
# Alternative feature options:
#   --use_precomputed_features \
#   --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features \
#
# Earlier 256x448, 4-frame run:
#   accelerate launch --multi_gpu --gpu_ids '0,1,2,3,4,5,6,7' --main_process_port 25011 --num_processes 8 \
#     train/train_mei_video.py \
#     --use_precomputed_features \
#     --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features \
#     --text_encoder_architecture umt5-xxl \
#     --wan_pretrained_path Wan-AI/Wan2.1-T2V-1.3B \
#     --training_from_scratch True \
#     --pretrained_model_name_or_path "dummy" \
#     --wan_backbone_lr_ratio 1 \
#     --num_frames 4 \
#     --video_height 256 \
#     --video_width 448 \
#     --dataloader_num_workers 8 \
#     --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \
#     --instance_dataset OpenVid1MDataset \
#     --instance_data_dir "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \
#     --train_batch_size 1 \
#     --gradient_accumulation_steps 1 \
#     --learning_rate 3e-4 \
#     --max_train_steps 10000 \
#     --checkpointing_steps 500 \
#     --validation_steps 500 \
#     --logging_steps 10 \
#     --validation_prompts "a cat playing" "a girl walking" \
#     --output_dir "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp" \
#     --mixed_precision bf16 \
#     --lr_scheduler constant \
#     --lr_warmup_steps 0 \
#     --use_8bit_adam \
#     --gradient_checkpointing \
#     --min_masking_rate 0.0 \
#     --cond_dropout_prob 0.1 \
#     --split_vae_encode 1 \
#     --allow_tf32 \
#     --seed 42 \
#     --report_to wandb
#
# Alternative --pretrained_model_name_or_path pointing at a saved checkpoint:
#   --pretrained_model_name_or_path "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000" \
# ---------------------------------------------------------------------------