RosticFACE committed on
Commit
6f20e8d
·
verified ·
1 Parent(s): 0988d71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +243 -234
app.py CHANGED
@@ -1,234 +1,243 @@
1
- import spaces
2
- import torch
3
- from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
4
- from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
5
- from diffusers.utils.export_utils import export_to_video
6
- import gradio as gr
7
- import tempfile
8
- import numpy as np
9
- from PIL import Image
10
- import random
11
- import gc
12
-
13
- from torchao.quantization import quantize_
14
- from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
15
- from torchao.quantization import Int8WeightOnlyConfig
16
-
17
- import aoti
18
-
19
-
20
- MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
21
-
22
- MAX_DIM = 832
23
- MIN_DIM = 480
24
- SQUARE_DIM = 640
25
- MULTIPLE_OF = 16
26
-
27
- MAX_SEED = np.iinfo(np.int32).max
28
-
29
- FIXED_FPS = 16 # можно поменять на 24, если хочешь более плавное видео
30
- MIN_FRAMES_MODEL = 8
31
-
32
- MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
33
- DEFAULT_DURATION = 5.0 # значение по умолчанию
34
-
35
- pipe = WanImageToVideoPipeline.from_pretrained(
36
- MODEL_ID,
37
- transformer=WanTransformer3DModel.from_pretrained(
38
- 'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
39
- subfolder='transformer',
40
- torch_dtype=torch.bfloat16,
41
- device_map='cuda',
42
- ),
43
- transformer_2=WanTransformer3DModel.from_pretrained(
44
- 'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
45
- subfolder='transformer_2',
46
- torch_dtype=torch.bfloat16,
47
- device_map='cuda',
48
- ),
49
- torch_dtype=torch.bfloat16,
50
- ).to('cuda')
51
-
52
- pipe.load_lora_weights(
53
- "Kijai/WanVideo_comfy",
54
- weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
55
- adapter_name="lightx2v"
56
- )
57
- kwargs_lora = {"load_into_transformer_2": True}
58
- pipe.load_lora_weights(
59
- "Kijai/WanVideo_comfy",
60
- weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
61
- adapter_name="lightx2v_2",
62
- **kwargs_lora
63
- )
64
- pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
65
- pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
66
- pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
67
- pipe.unload_lora_weights()
68
-
69
- quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
70
- quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
71
- quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
72
-
73
- aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
74
- aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
75
-
76
-
77
- default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
78
- default_negative_prompt = (
79
- "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, "
80
- "整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, "
81
- "画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, "
82
- "静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
83
- )
84
-
85
-
86
- # Исправленная функция resize_image (больше не вызовет ошибок)
87
- def resize_image(image: Image.Image) -> Image.Image:
88
- width, height = image.size
89
-
90
- if width == height:
91
- return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)
92
-
93
- aspect_ratio = width / height
94
- MAX_ASPECT_RATIO = MAX_DIM / MIN_DIM
95
- MIN_ASPECT_RATIO = MIN_DIM / MAX_DIM
96
-
97
- # Устанавливаем безопасные значения по умолчанию
98
- target_w, target_h = width, height
99
- image_to_resize = image
100
-
101
- if aspect_ratio > MAX_ASPECT_RATIO:
102
- crop_width = int(round(height * MAX_ASPECT_RATIO))
103
- left = (width - crop_width) // 2
104
- image_to_resize = image.crop((left, 0, left + crop_width, height))
105
- target_w = MAX_DIM
106
- target_h = int(round(target_w / MAX_ASPECT_RATIO))
107
- elif aspect_ratio < MIN_ASPECT_RATIO:
108
- crop_height = int(round(width / MIN_ASPECT_RATIO))
109
- top = (height - crop_height) // 2
110
- image_to_resize = image.crop((0, top, width, top + crop_height))
111
- target_h = MAX_DIM
112
- target_w = int(round(target_h * MIN_ASPECT_RATIO))
113
- else:
114
- if width > height:
115
- target_w = MAX_DIM
116
- target_h = int(round(target_w / aspect_ratio))
117
- else:
118
- target_h = MAX_DIM
119
- target_w = int(round(target_h * aspect_ratio))
120
-
121
- final_w = round(target_w / MULTIPLE_OF) * MULTIPLE_OF
122
- final_h = round(target_h / MULTIPLE_OF) * MULTIPLE_OF
123
- final_w = max(MIN_DIM, min(MAX_DIM, final_w))
124
- final_h = max(MIN_DIM, min(MAX_DIM, final_h))
125
- return image_to_resize.resize((final_w, final_h), Image.LANCZOS)
126
-
127
-
128
- def get_num_frames(duration_seconds: float):
129
- # убрано ограничение на MAX_FRAMES_MODEL
130
- return 1 + int(round(duration_seconds * FIXED_FPS))
131
-
132
-
133
- def get_duration(
134
- input_image,
135
- prompt,
136
- steps,
137
- negative_prompt,
138
- duration_seconds,
139
- guidance_scale,
140
- guidance_scale_2,
141
- seed,
142
- randomize_seed,
143
- progress,
144
- ):
145
- BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
146
- BASE_STEP_DURATION = 15
147
- width, height = resize_image(input_image).size
148
- frames = get_num_frames(duration_seconds)
149
- factor = frames * width * height / BASE_FRAMES_HEIGHT_WIDTH
150
- step_duration = BASE_STEP_DURATION * factor ** 1.5
151
- return 10 + int(steps) * step_duration
152
-
153
-
154
- @spaces.GPU(duration=get_duration)
155
- def generate_video(
156
- input_image,
157
- prompt,
158
- steps=4,
159
- negative_prompt=default_negative_prompt,
160
- duration_seconds=DEFAULT_DURATION,
161
- guidance_scale=1,
162
- guidance_scale_2=1,
163
- seed=42,
164
- randomize_seed=False,
165
- progress=gr.Progress(track_tqdm=True),
166
- ):
167
- if input_image is None:
168
- raise gr.Error("Please upload an input image.")
169
-
170
- num_frames = get_num_frames(duration_seconds)
171
- current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
172
- resized_image = resize_image(input_image)
173
-
174
- output_frames_list = pipe(
175
- image=resized_image,
176
- prompt=prompt,
177
- negative_prompt=negative_prompt,
178
- height=resized_image.height,
179
- width=resized_image.width,
180
- num_frames=num_frames,
181
- guidance_scale=float(guidance_scale),
182
- guidance_scale_2=float(guidance_scale_2),
183
- num_inference_steps=int(steps),
184
- generator=torch.Generator(device="cuda").manual_seed(current_seed),
185
- ).frames[0]
186
-
187
- with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
188
- video_path = tmpfile.name
189
-
190
- export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
191
- return video_path, current_seed
192
-
193
-
194
- # --- Gradio Interface ---
195
- with gr.Blocks() as demo:
196
- gr.Markdown("# 🚀 Wan 2.2 I2V (14B) Unlimited Duration Edition 🕒")
197
- gr.Markdown("Generate cinematic I2V animations without duration limits. Optimized for RTX 4090.")
198
-
199
- with gr.Row():
200
- with gr.Column():
201
- input_image_component = gr.Image(type="pil", label="Input Image")
202
- prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
203
-
204
- duration_seconds_input = gr.Slider(
205
- minimum=0.5,
206
- maximum=60.0, # можно поднять до 120.0
207
- step=0.5,
208
- value=DEFAULT_DURATION,
209
- label="Duration (seconds)",
210
- info=f"Each second = {FIXED_FPS} frames. Longer videos require more VRAM/time."
211
- )
212
-
213
- with gr.Accordion("Advanced Settings", open=False):
214
- negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
215
- seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
216
- randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
217
- steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
218
- guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale - high noise stage")
219
- guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 2 - low noise stage")
220
-
221
- generate_button = gr.Button("Generate Video", variant="primary")
222
- with gr.Column():
223
- video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
224
-
225
- ui_inputs = [
226
- input_image_component, prompt_input, steps_slider,
227
- negative_prompt_input, duration_seconds_input,
228
- guidance_scale_input, guidance_scale_2_input,
229
- seed_input, randomize_seed_checkbox
230
- ]
231
- generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
232
-
233
- if __name__ == "__main__":
234
- demo.queue().launch(mcp_server=True)
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import torch
3
+ from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
4
+ from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
5
+ from diffusers.utils.export_utils import export_to_video
6
+ import gradio as gr
7
+ import tempfile
8
+ import numpy as np
9
+ from PIL import Image
10
+ import random
11
+ import gc
12
+
13
+ from torchao.quantization import quantize_
14
+ from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
15
+ from torchao.quantization import Int8WeightOnlyConfig
16
+
17
+ import aoti
18
+
19
+
20
# Hub repo of the Wan 2.2 image-to-video pipeline.
MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"

# Output-resolution constraints: each side of the generated video ends up in
# [MIN_DIM, MAX_DIM] and is a multiple of MULTIPLE_OF; square inputs are
# mapped straight to SQUARE_DIM x SQUARE_DIM (see resize_image).
MAX_DIM = 832
MIN_DIM = 480
SQUARE_DIM = 640
MULTIPLE_OF = 16

# Largest value accepted by the seed slider / torch.Generator.
MAX_SEED = np.iinfo(np.int32).max

FIXED_FPS = 16  # output frame rate (raise to 24 for smoother motion)
MIN_FRAMES_MODEL = 8  # smallest clip length the model supports

# Shortest selectable duration in seconds, derived from the frame minimum.
MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
DEFAULT_DURATION = 5.0  # default slider value, in seconds
34
+
35
pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    transformer=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        # 'auto' lets accelerate place the transformer weights on the available GPU(s).
        device_map='auto',
    ),
    transformer_2=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer_2',
        torch_dtype=torch.bfloat16,
        device_map='auto',
    ),
    torch_dtype=torch.bfloat16,
)

# Fuse the LightX2V step-distillation LoRA into both transformer stages, then
# unload the adapters so the fused weights are what gets quantized below.
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v"
)
kwargs_lora = {"load_into_transformer_2": True}
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v_2",
    **kwargs_lora
)
pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
pipe.unload_lora_weights()

# Quantize after fusing so the fused LoRA values are included in the quantized weights.
quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())

# BUG FIX: the previous code called `pipe.cuda()`, but DiffusionPipeline is not
# an nn.Module and has no .cuda() method — that line raised AttributeError at
# startup.  Use the documented .to() instead, which also moves the components
# NOT covered by device_map (VAE, text encoder, image encoder) onto the GPU.
# NOTE(review): if device_map='auto' actually shards the transformers across
# several GPUs, .to("cuda") may conflict with that placement — confirm on the
# target hardware.
pipe.to("cuda")

aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
83
+
84
+
85
# Prompt defaults shown in the UI.  The negative prompt is the standard Wan
# Chinese quality/artifact blacklist; it is a runtime value fed to the model,
# so it is kept verbatim (not translated).
default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
default_negative_prompt = (
    "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, "
    "整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, "
    "画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, "
    "静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
)
92
+
93
+
94
def resize_image(image: Image.Image) -> Image.Image:
    """Fit *image* to the model's resolution constraints.

    Square inputs map directly to SQUARE_DIM x SQUARE_DIM.  Otherwise the
    image is centre-cropped (only when its aspect ratio falls outside the
    allowed band) and resized so both sides are multiples of MULTIPLE_OF
    within [MIN_DIM, MAX_DIM].
    """
    w, h = image.size

    if w == h:
        return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)

    ratio = w / h
    max_ratio = MAX_DIM / MIN_DIM
    min_ratio = MIN_DIM / MAX_DIM

    cropped = image
    if ratio > max_ratio:
        # Too wide: centre-crop horizontally down to the widest allowed ratio.
        crop_w = int(round(h * max_ratio))
        x0 = (w - crop_w) // 2
        cropped = image.crop((x0, 0, x0 + crop_w, h))
        tgt_w = MAX_DIM
        tgt_h = int(round(tgt_w / max_ratio))
    elif ratio < min_ratio:
        # Too tall: centre-crop vertically down to the tallest allowed ratio.
        crop_h = int(round(w / min_ratio))
        y0 = (h - crop_h) // 2
        cropped = image.crop((0, y0, w, y0 + crop_h))
        tgt_h = MAX_DIM
        tgt_w = int(round(tgt_h * min_ratio))
    elif w > h:
        # Landscape within bounds: pin the long side to MAX_DIM.
        tgt_w = MAX_DIM
        tgt_h = int(round(tgt_w / ratio))
    else:
        # Portrait within bounds: pin the long side to MAX_DIM.
        tgt_h = MAX_DIM
        tgt_w = int(round(tgt_h * ratio))

    # Snap to the required multiple, then clamp; MIN_DIM/MAX_DIM are themselves
    # multiples of MULTIPLE_OF, so clamping preserves the alignment.
    final_w = max(MIN_DIM, min(MAX_DIM, round(tgt_w / MULTIPLE_OF) * MULTIPLE_OF))
    final_h = max(MIN_DIM, min(MAX_DIM, round(tgt_h / MULTIPLE_OF) * MULTIPLE_OF))
    return cropped.resize((final_w, final_h), Image.LANCZOS)
134
+
135
+
136
def get_num_frames(duration_seconds: float) -> int:
    """Convert a duration in seconds to a frame count at FIXED_FPS (plus the initial frame)."""
    return int(round(duration_seconds * FIXED_FPS)) + 1
138
+
139
+
140
def get_duration(
    input_image,
    prompt,
    steps,
    negative_prompt,
    duration_seconds,
    guidance_scale,
    guidance_scale_2,
    seed,
    randomize_seed,
    progress,
):
    """Estimate the GPU-seconds budget for one generation (used by @spaces.GPU).

    Scales a reference per-step time by (frames * pixels) relative to a
    reference workload, with a ^1.5 penalty for larger jobs, plus a flat
    10-second overhead.  Signature mirrors generate_video's inputs.
    """
    REF_WORKLOAD = 81 * 832 * 624  # frames * width * height of the reference run
    REF_STEP_SECONDS = 15          # measured seconds per step at the reference workload
    w, h = resize_image(input_image).size
    n_frames = get_num_frames(duration_seconds)
    scale = n_frames * w * h / REF_WORKLOAD
    per_step = REF_STEP_SECONDS * scale ** 1.5
    return 10 + int(steps) * per_step
159
+
160
+
161
@spaces.GPU(duration=get_duration)
def generate_video(
    input_image,
    prompt,
    steps=4,
    negative_prompt=default_negative_prompt,
    duration_seconds=DEFAULT_DURATION,
    guidance_scale=1,
    guidance_scale_2=1,
    seed=42,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the Wan 2.2 I2V pipeline on *input_image* and export an mp4.

    Returns (video_path, seed_used) so the UI can echo back the seed that
    was actually drawn when randomize_seed is on.
    """
    if input_image is None:
        raise gr.Error("Please upload an input image.")

    frame_count = get_num_frames(duration_seconds)
    used_seed = int(seed) if not randomize_seed else random.randint(0, MAX_SEED)
    image = resize_image(input_image)

    generator = torch.Generator(device="cuda").manual_seed(used_seed)
    frames = pipe(
        image=image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=image.height,
        width=image.width,
        num_frames=frame_count,
        guidance_scale=float(guidance_scale),
        guidance_scale_2=float(guidance_scale_2),
        num_inference_steps=int(steps),
        generator=generator,
    ).frames[0]

    # Reserve a temp path (delete=False so the file survives the handle),
    # then let diffusers write the mp4 into it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    export_to_video(frames, out_path, fps=FIXED_FPS)
    return out_path, used_seed
201
+
202
+
203
# --- Gradio interface ---
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 Wan 2.2 I2V (14B) — Unlimited Duration Edition 🕒")
    gr.Markdown("Generate cinematic I2V animations without duration limits. Optimized for 4x NVIDIA L40S.")

    with gr.Row():
        with gr.Column():
            # Primary inputs.
            input_image_component = gr.Image(type="pil", label="Input Image")
            prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)

            duration_seconds_input = gr.Slider(
                minimum=0.5,
                maximum=60.0,
                step=0.5,
                value=DEFAULT_DURATION,
                label="Duration (seconds)",
                info=f"Each second = {FIXED_FPS} frames. Longer videos require more VRAM/time."
            )

            # Less-frequently-touched knobs live behind an accordion.
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
                guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale - high noise stage")
                guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 2 - low noise stage")

            generate_button = gr.Button("Generate Video", variant="primary")
        with gr.Column():
            video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)

    # Order must match generate_video's positional parameters exactly.
    ui_inputs = [
        input_image_component, prompt_input, steps_slider,
        negative_prompt_input, duration_seconds_input,
        guidance_scale_input, guidance_scale_2_input,
        seed_input, randomize_seed_checkbox
    ]
    # seed_input doubles as an output so the seed actually used is shown.
    generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])

if __name__ == "__main__":
    demo.queue().launch(mcp_server=True)