File size: 2,567 Bytes
0529dd9
 
 
29231a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
---

license: apache-2.0
---


```python

import torch

from transformers import AutoTokenizer, UMT5EncoderModel

from diffusers import AutoencoderKLWan, WanPipeline, WanTransformer3DModel, FlowMatchEulerDiscreteScheduler

from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

from diffusers.utils import export_to_video

from torchvision import transforms

import os

import cv2

import numpy as np





from pathlib import Path

import json

from safetensors.torch import safe_open



# Seed for the torch.Generator used at sampling time.
seed = 0

# Prefer the GPU, but fall back to CPU so the script still starts without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Wan VAE through its model class. The previous code called
# `vae.from_pretrained(...)` before `vae` was bound, which raised NameError.
vae = AutoencoderKLWan.from_pretrained("StevenZhang/Wan2.1-VAE_Diff")
vae = vae.to(device)



# TODO: impl FlowDPMSolverMultistepScheduler
# UniPC is configured for flow-matching prediction in the meantime.
# `flow_shift` is a scheduler setting, so the 3.0 that used to be passed to the
# pipeline call is set here instead (the pipeline call does not accept it).
scheduler = UniPCMultistepScheduler(
    prediction_type="flow_prediction",
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=3.0,
)

# Text conditioning: UMT5-XXL encoder (bfloat16) and its tokenizer.
text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")

# 14B
transformer = WanTransformer3DModel.from_pretrained(
    "StevenZhang/Wan2.1-T2V-14B-Diff", torch_dtype=torch.bfloat16
)
# 1.3B alternative checkpoint: 'StevenZhang/Wan2.1-T2V-1.3B-Diff'

# Assemble the pipeline from the individually loaded components.
pipe = WanPipeline(
    transformer=transformer,
    vae=vae,
    scheduler=scheduler,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
)
pipe.to(device)

negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

# Seeded generator so repeated runs sample the same noise.
generator = torch.Generator(device=device).manual_seed(seed)

inputs = {
    "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
    "negative_prompt": negative_prompt,  # TODO
    "generator": generator,
    "num_inference_steps": 50,
    "guidance_scale": 5.0,
    "height": 480,
    "width": 832,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
}

# `.frames` holds one entry per generated video; take the first.
video = pipe(**inputs).frames[0]

print(video.shape)

export_to_video(video, "output.mp4", fps=16)

```