Open-Sora-Plan-v1.5.0 / README_cn.md
LanguageBind's picture
Upload 4 files
25212bb verified

Open-Sora Plan v1.5.0采用mindspeed-mm套件训练。

前置要求

Open-Sora Plan v1.5.0在CANN 8.0.1版本完成训练,请参照CANN 系列 昇腾计算 8.0.1 软件补丁下载安装。

环境安装

1、安装torch、MindSpeed

# python3.8
conda create -n osp python=3.8
conda activate osp

# 安装 torch 和 torch_npu,注意要选择对应python版本、x86或arm的torch、torch_npu及apex包
pip install torch-2.1.0-cp38-cp38-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38-linux_aarch64.whl

# apex for Ascend 参考 https://gitee.com/ascend/apex
# 建议从原仓编译安装

# 将shell脚本中的环境变量路径修改为真实路径,下面为参考路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# 安装加速库
git clone https://gitee.com/ascend/MindSpeed.git
cd MindSpeed
git checkout 59b4e983b7dc1f537f8c6b97a57e54f0316fafb0
pip install -r requirements.txt
pip3 install -e .
cd ..

# 安装其余依赖库
pip install -e .

2、安装decord

git clone --recursive https://github.com/dmlc/decord
cd decord
mkdir build && cd build 
cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -DFFMPEG_DIR=/usr/local/ffmpeg 
make 
cd ../python 
pwd=$PWD 
echo "PYTHONPATH=$PYTHONPATH:$pwd" >> ~/.bashrc 
source ~/.bashrc 
python3 setup.py install --user

权重下载

魔乐社区:

https://modelers.cn/models/PKU-YUAN-Group/Open-Sora-Plan-v1.5.0

huggingface:

https://huggingface.co/LanguageBind/Open-Sora-Plan-v1.5.0

T5:

google/t5-v1_1-xl · Hugging Face

CLIP:

laion/CLIP-ViT-bigG-14-laion2B-39B-b160k · Hugging Face

Train Text-to-Video

需要设置好data.json和model_opensoraplan1_5.json。

data.json:

{
    "dataset_param": {
        "dataset_type": "t2v",
        "basic_parameters": {
            "data_path": "./examples/opensoraplan1.5/data.txt", # 数据路径
            "data_folder": "",
            "data_storage_mode": "combine"
        },
        "preprocess_parameters": {
            "video_reader_type": "decoder",
            "image_reader_type": "Image",
            "num_frames": 121, 
            "frame_interval": 1,
            "max_height": 576, # 开启固定分辨率时的样本高度,在开启多分辨率时无效
            "max_width": 1024, # 开启固定分辨率时的样本宽度,在开启多分辨率时无效
            "max_hxw": 589824, # 开启多分辨率时的最大token数
            "min_hxw": 589824, # 开启多分辨率时的最小token数。此外,min_hxw需要在开启force_resolution时设置为max_height * max_width以过滤低分辨率样本,或自定义更严格的筛选标准
            "force_resolution": true, # 开启固定分辨率训练
            "force_5_ratio": false, # 开启5宽高比多分辨率策略训练
            "max_h_div_w_ratio": 1.0, # 筛选最大高宽比
            "min_h_div_w_ratio": 0.42, # 筛选最小高宽比
            "hw_stride": 16,
            "ae_stride_t": 8,
            "train_fps": 24, # 训练时采样fps,会将不同fps的视频都重采样到train_fps
            "speed_factor": 1.0,
            "drop_short_ratio": 1.0,
            "min_num_frames": 29,
            "cfg": 0.1,
            "batch_size": 1,
            "gradient_accumulation_size": 4,
            "use_aesthetic": false,
            "train_pipeline": {
                "video": [{
                        "trans_type": "ToTensorVideo"
                    },
                    {
                        "trans_type": "CenterCropResizeVideo",
                        "param": {
                            "size": [576, 1024],
                            "interpolation_mode": "bicubic"
                        }
                    },
                    {
                        "trans_type": "ae_norm"
                    }
                ],
                "image": [{
                    "trans_type": "ToTensorVideo"
                    },
                    {
                        "trans_type": "CenterCropResizeVideo",
                        "param": {
                            "size": [576, 1024],
                            "interpolation_mode": "bicubic"
                        }
                    },
                    {
                        "trans_type": "ae_norm"
                    }
                ]
            }
        },
        "use_text_processer": true,
        "enable_text_preprocess": true,
        "model_max_length": 512,
        "tokenizer_config": {
            "hub_backend": "hf",
            "autotokenizer_name": "AutoTokenizer",
            "from_pretrained": "/work/share/checkpoint/pretrained/t5/t5-v1_1-xl"
        },
        "tokenizer_config_2": {
            "hub_backend": "hf",
            "autotokenizer_name": "AutoTokenizer",
            "from_pretrained": "/work/share/checkpoint/pretrained/clip/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k/snapshots/bc7788f151930d91b58474715fdce5524ad9a189"
        },
        "use_feature_data": false,
        "use_img_from_vid": false
    },
    "dataloader_param": {
        "dataloader_mode": "sampler",
        "sampler_type": "LengthGroupedSampler", # 开启Group Data策略,默认指定
        "batch_size": 1,
        "num_workers": 4,
        "shuffle": false,
        "drop_last": true,
        "pin_memory": false,
        "group_data": true,
        "initial_global_step_for_sampler": 0, 
        "gradient_accumulation_size": 4,
        "collate_param": {
            "model_name": "GroupLength", # 开启Group Data对应的Collate,默认指定
            "batch_size": 1,
            "num_frames": 121,
            "group_data": true,
            "ae_stride": 8,
            "ae_stride_t": 8,
            "patch_size": 2,
            "patch_size_t": 1
        }
    }
}

model_opensoraplan1_5.json

{
    "frames": 121,
    "allow_tf32": false,
    "allow_internal_format": false,
    "load_video_features": false,
    "load_text_features": false,
    "enable_encoder_dp": true, # mindspeed架构优化,在TP并行度大于1时起作用
    "weight_dtype": "bf16",
    "ae": {
        "model_id": "wfvae",
        "base_channels": 160,
        "connect_res_layer_num": 1,
        "decoder_energy_flow_hidden_size": 128,
        "decoder_num_resblocks": 2,
        "dropout": 0.0,
        "encoder_energy_flow_hidden_size": 128,
        "encoder_num_resblocks": 2,
        "l1_dowmsample_block": "Spatial2xTime2x3DDownsample",
        "l1_downsample_wavelet": "HaarWaveletTransform3D",
        "l1_upsample_block": "Spatial2xTime2x3DUpsample",
        "l1_upsample_wavelet": "InverseHaarWaveletTransform3D",
        "l2_dowmsample_block": "Spatial2xTime2x3DDownsample",
        "l2_downsample_wavelet": "HaarWaveletTransform3D",
        "l2_upsample_block": "Spatial2xTime2x3DUpsample",
        "l2_upsample_wavelet": "InverseHaarWaveletTransform3D",
        "latent_dim": 32,
        "norm_type": "layernorm",
        "scale": [0.7031, 0.7109, 1.5391, 1.2969, 0.7109, 1.4141, 1.3828, 2.1719, 1.7266,
        1.8281, 1.9141, 1.2031, 0.6875, 0.9609, 1.6484, 1.1875, 1.5312, 1.1328,
        0.8828, 0.6836, 0.8828, 0.9219, 1.6953, 1.4453, 1.5312, 0.6836, 0.7656,
        0.8242, 1.2344, 1.0312, 1.7266, 0.9492],
        "shift": [-0.2129,  0.1226,  1.6328,  0.6211, -0.8750,  0.6172, -0.5703,  0.1348,
        -0.2178, -0.9375,  0.3184,  0.3281, -0.0544, -0.1826, -0.2812,  0.4355,
         0.1621, -0.2578,  0.7148, -0.7422, -0.2295, -0.2324, -1.4922,  0.6328,
         1.1250, -0.2578, -2.1094,  1.0391,  1.1797, -1.2422, -0.2988, -0.9570],
        "t_interpolation": "trilinear",
        "use_attention": true,
        "use_tiling": true, # 是否开启tiling策略
        "from_pretrained": "/work/share/checkpoint/pretrained/vae/Middle888/merged.ckpt",
        "dtype": "fp32"
      },
    "text_encoder": {
        "hub_backend": "hf",
        "model_id": "T5",
        "from_pretrained": "/work/share/checkpoint/pretrained/t5/t5-v1_1-xl",
        "low_cpu_mem_usage": false
    },
    "text_encoder_2":{
        "hub_backend": "hf",
        "model_id": "CLIPWithProjection", 
        "from_pretrained": "/work/share/checkpoint/pretrained/clip/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k/snapshots/bc7788f151930d91b58474715fdce5524ad9a189",
        "low_cpu_mem_usage": false
    },
    "predictor": {
        "model_id": "SparseUMMDiT",
        "num_layers": [2, 4, 6, 8, 6, 4, 2], # 每个stage的层数
        "sparse_n": [1, 2, 4, 8, 4, 2, 1], # 每个stage的稀疏度
        "double_ff": true, # 采用visual和text共享FFN还是各自独立FFN
        "sparse1d": true, # 是否采用Skiparse策略,设置为false则为dense dit
        "num_heads": 24,
        "head_dim": 128,
        "in_channels": 32,
        "out_channels": 32,
        "timestep_embed_dim": 1024,
        "caption_channels": 2048,
        "pooled_projection_dim": 1280,
        "skip_connection": true, # 是否添加skip connection
        "dropout": 0.0, 
        "attention_bias": true,
        "patch_size": 2,
        "patch_size_t": 1,
        "activation_fn": "gelu-approximate",
        "norm_elementwise_affine": false,
        "norm_eps": 1e-06,
        "from_pretrained": null # 预训练权重路径,需采用合并后的权重
    },
    "diffusion": {
        "model_id": "OpenSoraPlan",
        "weighting_scheme": "logit_normal",
        "use_dynamic_shifting": true 
    }
}

进入Open-Sora Plan目录下,运行

bash examples/opensoraplan1.5/pretrain_opensoraplan1_5.sh

参数解析:

--optimizer-selection fused_ema_adamw 选择使用的优化器,我们这里需要选择fused_ema_adamw以获得EMA版本权重。

--model_custom_precision 不同组件使用不同的精度,而不是采用megatron默认的整网bf16精度。例如对VAE使用fp32精度,对text encoder、dit使用bf16精度。

--clip_grad_ema_decay 0.99 设置adaptive grad clipping中使用的EMA衰减率。

--selective_recom --recom_ffn_layers 32 是否开启选择性重计算及选择性重计算的层数。在开启选择性重计算时,我们只对FFN进行重计算而不对Attention进行重计算,以获得加速训练效果。该参数与--recompute-granularity full --recompute-method block --recompute-num-layers 0 互斥,当开启选择性重计算时,默认全重计算已关闭。

Sample Text-to-Video

由于模型训练时进行了TP切分,所以我们需要先将切分后的权重进行合并,然后再进行推理。

合并权重

python examples/opensoraplan1.5/convert_mm_to_ckpt.py --load_dir $load_dir --save_dir $save_dir --ema

参数解析:

--load_dir 训练时经过megatron切分后保存的权重路径

--save_dir 合并后的权重路径

--ema 是否采用EMA权重

推理

需要配置好inference_t2v_model1_5.json。

{
    "ae": {
        "model_id": "wfvae",
        "base_channels": 160,
        "connect_res_layer_num": 1,
        "decoder_energy_flow_hidden_size": 128,
        "decoder_num_resblocks": 2,
        "dropout": 0.0,
        "encoder_energy_flow_hidden_size": 128,
        "encoder_num_resblocks": 2,
        "l1_dowmsample_block": "Spatial2xTime2x3DDownsample",
        "l1_downsample_wavelet": "HaarWaveletTransform3D",
        "l1_upsample_block": "Spatial2xTime2x3DUpsample",
        "l1_upsample_wavelet": "InverseHaarWaveletTransform3D",
        "l2_dowmsample_block": "Spatial2xTime2x3DDownsample",
        "l2_downsample_wavelet": "HaarWaveletTransform3D",
        "l2_upsample_block": "Spatial2xTime2x3DUpsample",
        "l2_upsample_wavelet": "InverseHaarWaveletTransform3D",
        "latent_dim": 32,
        "vae_scale_factor": [8, 8, 8],
        "norm_type": "layernorm",
        "scale": [0.7031, 0.7109, 1.5391, 1.2969, 0.7109, 1.4141, 1.3828, 2.1719, 1.7266,
        1.8281, 1.9141, 1.2031, 0.6875, 0.9609, 1.6484, 1.1875, 1.5312, 1.1328,
        0.8828, 0.6836, 0.8828, 0.9219, 1.6953, 1.4453, 1.5312, 0.6836, 0.7656,
        0.8242, 1.2344, 1.0312, 1.7266, 0.9492],
        "shift": [-0.2129,  0.1226,  1.6328,  0.6211, -0.8750,  0.6172, -0.5703,  0.1348,
        -0.2178, -0.9375,  0.3184,  0.3281, -0.0544, -0.1826, -0.2812,  0.4355,
         0.1621, -0.2578,  0.7148, -0.7422, -0.2295, -0.2324, -1.4922,  0.6328,
         1.1250, -0.2578, -2.1094,  1.0391,  1.1797, -1.2422, -0.2988, -0.9570],
        "t_interpolation": "trilinear",
        "use_attention": true,
        "use_tiling": true, # 是否开启tiling策略,推理时默认开启节省显存
        "from_pretrained": "/work/share/checkpoint/pretrained/vae/Middle888/merged.ckpt",
        "dtype": "fp16"
      },
    "text_encoder": {
        "hub_backend": "hf",
        "model_id": "T5",
        "from_pretrained": "/work/share/checkpoint/pretrained/t5/t5-v1_1-xl",
        "low_cpu_mem_usage": false
    },
    "text_encoder_2":{
        "hub_backend": "hf",
        "model_id": "CLIPWithProjection", 
        "from_pretrained": "/work/share/checkpoint/pretrained/clip/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k/snapshots/bc7788f151930d91b58474715fdce5524ad9a189",
        "low_cpu_mem_usage": false
    },
    "tokenizer":{
        "hub_backend": "hf",
        "autotokenizer_name": "AutoTokenizer",
        "from_pretrained": "/work/share/checkpoint/pretrained/t5/t5-v1_1-xl",
        "low_cpu_mem_usage": false
    },
    "tokenizer_2":{
        "hub_backend": "hf",
        "autotokenizer_name": "AutoTokenizer",
        "from_pretrained": "/work/share/checkpoint/pretrained/clip/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k/snapshots/bc7788f151930d91b58474715fdce5524ad9a189",
        "low_cpu_mem_usage": false
    },
    "predictor": {
        "model_id": "SparseUMMDiT",
        "num_layers": [2, 4, 6, 8, 6, 4, 2],
        "sparse_n": [1, 2, 4, 8, 4, 2, 1],
        "double_ff": true,
        "sparse1d": true,
        "num_heads": 24,
        "head_dim": 128,
        "in_channels": 32,
        "out_channels": 32,
        "timestep_embed_dim": 1024,
        "caption_channels": 2048,
        "pooled_projection_dim": 1280,
        "skip_connection": true,
        "skip_connection_zero_init": true,
        "dropout": 0.0,
        "attention_bias": true,
        "patch_size": 2,
        "patch_size_t": 1,
        "activation_fn": "gelu-approximate",
        "norm_elementwise_affine": true,
        "norm_eps": 1e-06,
        "from_pretrained": "/path/to/pretrained/model"
    },
    "diffusion": {
        "model_id": "OpenSoraPlan",
        "num_inference_steps": 50, # 推理步数
        "guidance_scale": 8.0, # CFG强度,我们推荐较大的CFG,8.0是较好的值
        "guidance_rescale": 0.7, # guidance rescale强度,如认为采样饱和度过高,我们推荐将gudance_rescale增大,而非调整CFG
        "use_linear_quadratic_schedule": false, # 采用线性——平方采样策略
        "use_dynamic_shifting": false,
        "shift": 7.0 # 采用shifting采样策略
    },
    "pipeline_config": {
        "use_attention_mask": true,
        "input_size": [121, 576, 1024],
        "version": "v1.5",
        "model_type": "t2v"
    },
    "micro_batch_size": 1,
    "frame_interval":1,
    "model_max_length": 512,
    "save_path":"./opensoraplan_samples/test_samples",
    "fps":24,
    "prompt":"./examples/opensoraplan1.5/sora.txt",
    "device":"npu",
    "weight_dtype": "fp16"
}

进入Open-Sora Plan目录下,运行

bash examples/opensoraplan1.5/inference_t2v_1_5.sh

实测TP=1即不开启并行策略能够运行121x576x1024推理,如需加快推理速度请自行调节TP并行度。