StevenZhang commited on
Commit
bd871a6
·
1 Parent(s): 2220f8a

init upload

Browse files
README.md CHANGED
@@ -1,3 +1,67 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
---
license: apache-2.0
---

```
from diffusers.utils import load_image, export_to_video
from transformers import CLIPVisionModel, CLIPImageProcessor, UMT5EncoderModel
from diffusers import WanI2VPipeline, WanTransformer3DModel
import torch

pretrained_model_name_or_path = "./wan_i2v" # TODO replace with our hf id
image_encoder = CLIPVisionModel.from_pretrained(pretrained_model_name_or_path, subfolder='image_encoder',
                                                torch_dtype=torch.float16)
transformer_i2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer_i2v_480p')
# for 720p
# transformer_i2v = WanTransformer3DModel.from_pretrained(pretrained_model_name_or_path, subfolder='transformer_i2v_720p',
#                                                         torch_dtype=torch.bfloat16)

image_processor = CLIPImageProcessor.from_pretrained(pretrained_model_name_or_path, subfolder='image_processor')

text_encoder = UMT5EncoderModel.from_pretrained(pretrained_model_name_or_path, subfolder='text_encoder',
                                                torch_dtype=torch.bfloat16)

pipe = WanI2VPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_i2v,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    image_processor=image_processor,
)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)
device = "cuda"
seed = 0
prompt = ("An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
          "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot.")
generator = torch.Generator(device=device).manual_seed(seed)

# pipe.to(device)
pipe.enable_model_cpu_offload()

negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

inputs = {
    'image': image,
    "prompt": prompt,
    # 'max_area': 720 * 1280, # for 720p
    "negative_prompt": negative_prompt,
    'max_area': 480 * 832,
    "generator": generator,
    "num_inference_steps": 40,
    "guidance_scale": 5.0,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
    # 'flow_shift': 5.0, # for 720p
    'flow_shift': 3.0
}

output = pipe(**inputs).frames[0]

export_to_video(output, "output.mp4", fps=16)
```
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{
  "_class_name": "WanxTransformer3DModel",
  "_diffusers_version": "0.33.0.dev0",
  "_name_or_path": "xxx/wanx_i2v",
  "add_img_emb": true,
  "added_kv_proj_dim": 5120,
  "attention_head_dim": 128,
  "cross_attn_norm": true,
  "eps": 1e-06,
  "ffn_dim": 13824,
  "freq_dim": 256,
  "in_channels": 36,
  "num_attention_heads": 40,
  "num_layers": 40,
  "out_channels": 16,
  "patch_size": [
    1,
    2,
    2
  ],
  "qk_norm": true,
  "text_dim": 4096,
  "window_size": [
    -1,
    -1
  ]
}
diffusion_pytorch_model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eabf12e68cb48deab28c6bbf8fa14582147b503b5697ed41784f8a6e5c971f6d
3
+ size 9874715888
diffusion_pytorch_model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4302812ae49a032745791299aab3f67ed1489fa22e0e8c91c910f757de190cf7
3
+ size 9975522816
diffusion_pytorch_model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94e56041030947eed32cde17aa359d20a4df7013fc91d5b0fbae2db35cd6b97a
3
+ size 9954503688
diffusion_pytorch_model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86b126a45b0436aa8cb02878411687ffdd0bbfa94f75f2f961af17af78249af7
3
+ size 2985562472
diffusion_pytorch_model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
wan_i2v_example.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Minimal example: Wan image-to-video (I2V) generation with diffusers.

Loads the Wan I2V pipeline components individually, generates an 81-frame
clip from a single conditioning image plus a text prompt, and writes the
result to ``output.mp4``.
"""
import torch
from diffusers import WanI2VPipeline, WanTransformer3DModel
from diffusers.utils import export_to_video, load_image
from transformers import CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel

pretrained_model_name_or_path = "./wan_i2v"  # TODO replace with our hf id

# Vision tower that encodes the conditioning image (half precision to save memory).
image_encoder = CLIPVisionModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="image_encoder",
    torch_dtype=torch.float16,
)

# 480p denoising transformer; uncomment the variant below for 720p instead.
transformer_i2v = WanTransformer3DModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="transformer_i2v_480p"
)
# for 720p
# transformer_i2v = WanTransformer3DModel.from_pretrained(
#     pretrained_model_name_or_path,
#     subfolder="transformer_i2v_720p",
#     torch_dtype=torch.bfloat16,
# )

image_processor = CLIPImageProcessor.from_pretrained(
    pretrained_model_name_or_path, subfolder="image_processor"
)

text_encoder = UMT5EncoderModel.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="text_encoder",
    torch_dtype=torch.bfloat16,
)

# Assemble the pipeline from the explicitly loaded components.
pipe = WanI2VPipeline.from_pretrained(
    pretrained_model_name_or_path,
    transformer=transformer_i2v,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    image_processor=image_processor,
)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
)
device = "cuda"
seed = 0
prompt = (
    "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in "
    "the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
)
generator = torch.Generator(device=device).manual_seed(seed)

# pipe.to(device)
# Offload idle components to CPU so the whole pipeline fits on a single GPU.
pipe.enable_model_cpu_offload()

negative_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'

# Generation arguments; the commented-out values are the 720p settings.
inputs = {
    "image": image,
    "prompt": prompt,
    # "max_area": 720 * 1280,  # for 720p
    "negative_prompt": negative_prompt,
    "max_area": 480 * 832,
    "generator": generator,
    "num_inference_steps": 40,
    "guidance_scale": 5.0,
    "num_frames": 81,
    "max_sequence_length": 512,
    "output_type": "np",
    # "flow_shift": 5.0,  # for 720p
    "flow_shift": 3.0,
}

output = pipe(**inputs).frames[0]

export_to_video(output, "output.mp4", fps=16)