"""Generate a short video from a text prompt with the Wan2.1 T2V pipeline.

The script assembles a ``WanPipeline`` from individually loaded components
(VAE, scheduler, text encoder, tokenizer, transformer), runs one generation
with a fixed seed, prints the shape of the resulting frame array and writes
the frames to ``output.mp4``.
"""

import json
import os
from pathlib import Path

import cv2
import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanPipeline, WanTransformer3DModel, FlowMatchEulerDiscreteScheduler
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video
from safetensors.torch import safe_open
from torchvision import transforms
from transformers import AutoTokenizer, UMT5EncoderModel

# Resolve the device once; fall back to CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 0

# Flow shift is a scheduler setting, not a pipeline call argument.
# NOTE(review): 3.0 is the value the script previously tried to pass at call
# time (for the 480x832 resolution used below) -- confirm it is the intended one.
flow_shift = 3.0

# The loader must be called on the model class; the original code called it on
# the not-yet-defined ``vae`` name, which raised NameError.
vae = AutoencoderKLWan.from_pretrained("StevenZhang/Wan2.1-VAE_Diff")
vae = vae.to(device)

scheduler = UniPCMultistepScheduler(
    prediction_type='flow_prediction',
    use_flow_sigmas=True,
    num_train_timesteps=1000,
    flow_shift=flow_shift,
)

text_encoder = UMT5EncoderModel.from_pretrained("google/umt5-xxl", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")

transformer = WanTransformer3DModel.from_pretrained('StevenZhang/Wan2.1-T2V-14B-Diff', torch_dtype=torch.bfloat16)

components = {
    "transformer": transformer,
    "vae": vae,
    "scheduler": scheduler,
    "text_encoder": text_encoder,
    "tokenizer": tokenizer,
}
pipe = WanPipeline(**components)

pipe.to(device)

negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

# Seeded generator so repeated runs with the same setup use the same noise.
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
    "prompt": "两只拟人化的猫咪身穿舒适的拳击装备,戴着鲜艳的手套,在聚光灯照射的舞台上激烈对战",
    "negative_prompt": negative_prompt,
    "generator": generator,
    "num_inference_steps": 50,
    # "flow_shift" is intentionally absent here: it is configured on the
    # scheduler above, and passing it to the pipeline call is rejected as an
    # unexpected keyword argument.
    "guidance_scale": 5.0,
    "height": 480,
    "width": 832,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
}

# ``frames`` holds one entry per generated video; take the first (only) one.
video = pipe(**inputs).frames[0]

print(video.shape)

export_to_video(video, "output.mp4", fps=16)