43.oT_eV / Meissonic /train /train_video.sh
BryanW's picture
Upload code from /mnt/43.oT_eV
c2925de verified
#!/bin/bash
# 8-GPU training script for video diffusion model
# Usage (run from the directory that contains train/, since the script path below is relative):
#   bash train/train_video.sh
# Options passed to `accelerate launch` itself (they appear before the script path).
launcher_opts=(
  --multi_gpu
  --gpu_ids '0,1,2,3,4,5,6,7'
  --main_process_port 25011
  --num_processes 8
)

# Arguments that follow the script path on the launch command line.
# Kept in the same order as the original single-command form; one flag
# (with its value(s), if any) per line.
trainer_opts=(
  --use_precomputed_features
  --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features_17_128_128_with_mask_attn_debug
  --text_encoder_architecture umt5-xxl
  --wan_pretrained_path /mnt/Wan2.1-T2V-1.3B
  --training_from_scratch True
  --pretrained_model_name_or_path "dummy"
  --wan_backbone_lr_ratio 0.2
  --num_frames 17
  --video_height 128
  --video_width 128
  --dataloader_num_workers 8
  --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8"
  --instance_dataset OpenVid1MDataset
  --instance_data_dir "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv"
  --train_batch_size 8
  --gradient_accumulation_steps 4
  --learning_rate 3e-4
  --max_train_steps 10000
  --checkpointing_steps 500
  --validation_steps 500
  --logging_steps 10
  # Two separate prompt arguments follow this flag.
  --validation_prompts "a cat playing" "a girl walking"
  # Quoted so the '*' characters stay literal instead of being glob-expanded.
  --output_dir "./output_128x128_17f_2*4bs_4*8*8vqvae_0_2_ratio"
  --mixed_precision bf16
  --lr_scheduler constant
  --lr_warmup_steps 0
  --use_8bit_adam
  --gradient_checkpointing
  --min_masking_rate 0.0
  --cond_dropout_prob 0.1
  --split_vae_encode 1
  --allow_tf32
  --seed 42
  --report_to wandb
)

# Quoted "${arr[@]}" expansion yields exactly one word per array element, so the
# resulting argv matches the backslash-continued command it replaces.
accelerate launch "${launcher_opts[@]}" \
  train/train_mei_video.py \
  "${trainer_opts[@]}"
# --use_precomputed_features \
# --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features \
# accelerate launch --multi_gpu --gpu_ids '0,1,2,3,4,5,6,7' --main_process_port 25011 --num_processes 8 \
# train/train_mei_video.py \
# --use_precomputed_features \
# --features_dir /mnt/VideoGen/dataset/OpenVid1M/extracted_features \
# --text_encoder_architecture umt5-xxl \
# --wan_pretrained_path Wan-AI/Wan2.1-T2V-1.3B \
# --training_from_scratch True \
# --pretrained_model_name_or_path "dummy" \
# --wan_backbone_lr_ratio 1 \
# --num_frames 4 \
# --video_height 256 \
# --video_width 448 \
# --dataloader_num_workers 8 \
# --video_tokenizer_model_id "Cosmos-0.1-Tokenizer-DV4x8x8" \
# --instance_dataset OpenVid1MDataset \
# --instance_data_dir "/mnt/VideoGen/dataset/OpenVid1M/video_reorg/OpenVid1M_reorganized.csv" \
# --train_batch_size 1 \
# --gradient_accumulation_steps 1 \
# --learning_rate 3e-4 \
# --max_train_steps 10000 \
# --checkpointing_steps 500 \
# --validation_steps 500 \
# --logging_steps 10 \
# --validation_prompts "a cat playing" "a girl walking" \
# --output_dir "./output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio_continue_tmp" \
# --mixed_precision bf16 \
# --lr_scheduler constant \
# --lr_warmup_steps 0 \
# --use_8bit_adam \
# --gradient_checkpointing \
# --min_masking_rate 0.0 \
# --cond_dropout_prob 0.1 \
# --split_vae_encode 1 \
# --allow_tf32 \
# --seed 42 \
# --report_to wandb
# --pretrained_model_name_or_path "/mnt/Meissonic/output_256x448_4f_2bs_4*8*8vqvae_0_00_ratio/checkpoint-4000" \