EdBanshee commited on
Commit
db19b41
·
1 Parent(s): 5c8c602

Too many issues; wiped with known-working I2V code and re-added only the frame sizing

Browse files
Files changed (3) hide show
  1. app.py +16 -44
  2. optimization.py +21 -45
  3. requirements.txt +1 -1
app.py CHANGED
@@ -1,6 +1,6 @@
1
- # PyTorch 2.8 (temporary hack)
2
  import os
3
- os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
4
 
5
  # Actual demo code
6
  import spaces
@@ -33,6 +33,7 @@ MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
33
 
34
  DEFAULT_STEPS = 5
35
 
 
36
  pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
37
  transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
38
  subfolder='transformer',
@@ -55,29 +56,18 @@ for i in range(3):
55
  OPTIMIZE_WIDTH = 832
56
  OPTIMIZE_HEIGHT = 624
57
 
58
- # Optimize pipeline at startup (before ZeroGPU packing)
59
- print("🚀 Optimizing pipeline at startup...")
60
- try:
61
- optimize_pipeline_(pipe,
62
- image=Image.new('RGB', (OPTIMIZE_WIDTH, OPTIMIZE_HEIGHT)),
63
- prompt='prompt',
64
- height=OPTIMIZE_HEIGHT,
65
- width=OPTIMIZE_WIDTH,
66
- num_frames=MAX_FRAMES_MODEL,
67
- )
68
-
69
- pipeline_optimized = True
70
- print("✅ Pipeline optimization completed successfully at startup!")
71
- except Exception as e:
72
- pipeline_optimized = False
73
- print("❌ Pipeline optimization failed:", str(e))
74
- raise e
75
 
76
 
77
  default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
78
  default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
79
 
80
-
81
  def resize_image(image: Image.Image, use_crop: bool = True) -> Image.Image:
82
  if image.height > image.width:
83
  transposed = image.transpose(Image.Transpose.ROTATE_90)
@@ -251,6 +241,7 @@ def make_cropping_preview(input_image: Image.Image, use_crop: bool = True) -> Im
251
  # Convert to RGB for Gradio
252
  return preview.convert('RGB')
253
 
 
254
  def get_duration(
255
  input_image,
256
  prompt,
@@ -315,7 +306,6 @@ def generate_video(
315
  gr.Error: If input_image is None (no image uploaded).
316
 
317
  Note:
318
- - The function automatically resizes the input image to the target dimensions
319
  - Frame count is calculated as duration_seconds * FIXED_FPS (24)
320
  - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
321
  - The function uses GPU acceleration via the @spaces.GPU decorator
@@ -324,9 +314,6 @@ def generate_video(
324
  if input_image is None:
325
  raise gr.Error("Please upload an input image.")
326
 
327
- # Pipeline was optimized at startup, proceed with generation
328
- progress(0.1, "Starting video generation...")
329
-
330
  num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
331
  current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
332
  use_crop = resize_mode == "CROP SOURCE"
@@ -363,18 +350,17 @@ with gr.Blocks() as demo:
363
  return "(no git info)", "(no git info)"
364
 
365
  latest_hash, latest_msg = get_git_info()
366
- gr.Markdown(f"**Git commit:** `{latest_hash}`\n> {latest_msg}")
367
  gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B) with Lightning LoRA")
368
  gr.Markdown("run Wan 2.2 in just 4-8 steps, with [Lightning LoRA](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Wan22-Lightning), fp8 quantization & AoT compilation - compatible with 🧨 diffusers and ZeroGPU⚡️")
369
- gr.Markdown("🚀 **Pipeline optimized at startup!** Fast video generation ready.")
370
  with gr.Row():
371
  with gr.Column():
372
  input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)")
373
  resize_mode = gr.Radio(choices=["CROP SOURCE", "RESIZE SOURCE"], value="CROP SOURCE", label="Image Preparation Mode")
374
  cropping_preview_component = gr.Image(type="pil", label="Processing Preview", interactive=False)
375
  prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
376
- duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
377
-
378
  with gr.Accordion("Advanced Settings", open=False):
379
  negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
380
  seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
@@ -386,7 +372,7 @@ with gr.Blocks() as demo:
386
  generate_button = gr.Button("Generate Video", variant="primary")
387
  with gr.Column():
388
  video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
389
-
390
  ui_inputs = [
391
  input_image_component, prompt_input, steps_slider,
392
  negative_prompt_input, duration_seconds_input,
@@ -425,29 +411,15 @@ with gr.Blocks() as demo:
425
  "wan22_input_2.jpg",
426
  "A sleek lunar vehicle glides into view from left to right, kicking up moon dust as astronauts in white spacesuits hop aboard with characteristic lunar bouncing movements. In the distant background, a VTOL craft descends straight down and lands silently on the surface. Throughout the entire scene, ethereal aurora borealis ribbons dance across the star-filled sky, casting shimmering curtains of green, blue, and purple light that bathe the lunar landscape in an otherworldly, magical glow.",
427
  4,
428
- default_negative_prompt,
429
- MAX_DURATION,
430
- 1,
431
- 1,
432
- 42,
433
- False,
434
- "CROP SOURCE",
435
  ],
436
  [
437
  "kill_bill.jpeg",
438
  "Uma Thurman's character, Beatrix Kiddo, holds her razor-sharp katana blade steady in the cinematic lighting. Suddenly, the polished steel begins to soften and distort, like heated metal starting to lose its structural integrity. The blade's perfect edge slowly warps and droops, molten steel beginning to flow downward in silvery rivulets while maintaining its metallic sheen. The transformation starts subtly at first - a slight bend in the blade - then accelerates as the metal becomes increasingly fluid. The camera holds steady on her face as her piercing eyes gradually narrow, not with lethal focus, but with confusion and growing alarm as she watches her weapon dissolve before her eyes. Her breathing quickens slightly as she witnesses this impossible transformation. The melting intensifies, the katana's perfect form becoming increasingly abstract, dripping like liquid mercury from her grip. Molten droplets fall to the ground with soft metallic impacts. Her expression shifts from calm readiness to bewilderment and concern as her legendary instrument of vengeance literally liquefies in her hands, leaving her defenseless and disoriented.",
439
  6,
440
- default_negative_prompt,
441
- MAX_DURATION,
442
- 1,
443
- 1,
444
- 42,
445
- False,
446
- "CROP SOURCE",
447
  ],
448
  ],
449
  inputs=ui_inputs, outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
450
  )
451
 
452
  if __name__ == "__main__":
453
- demo.queue().launch()
 
1
+ # PyTorch 2.8 (hack)
2
  import os
3
+ os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9"')
4
 
5
  # Actual demo code
6
  import spaces
 
33
 
34
  DEFAULT_STEPS = 5
35
 
36
+
37
  pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
38
  transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
39
  subfolder='transformer',
 
56
  OPTIMIZE_WIDTH = 832
57
  OPTIMIZE_HEIGHT = 624
58
 
59
+ optimize_pipeline_(pipe,
60
+ image=Image.new('RGB', (OPTIMIZE_WIDTH, OPTIMIZE_HEIGHT)),
61
+ prompt='prompt',
62
+ height=OPTIMIZE_HEIGHT,
63
+ width=OPTIMIZE_WIDTH,
64
+ num_frames=MAX_FRAMES_MODEL,
65
+ )
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
  default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
69
  default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
70
 
 
71
  def resize_image(image: Image.Image, use_crop: bool = True) -> Image.Image:
72
  if image.height > image.width:
73
  transposed = image.transpose(Image.Transpose.ROTATE_90)
 
241
  # Convert to RGB for Gradio
242
  return preview.convert('RGB')
243
 
244
+
245
  def get_duration(
246
  input_image,
247
  prompt,
 
306
  gr.Error: If input_image is None (no image uploaded).
307
 
308
  Note:
 
309
  - Frame count is calculated as duration_seconds * FIXED_FPS (24)
310
  - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
311
  - The function uses GPU acceleration via the @spaces.GPU decorator
 
314
  if input_image is None:
315
  raise gr.Error("Please upload an input image.")
316
 
 
 
 
317
  num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
318
  current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
319
  use_crop = resize_mode == "CROP SOURCE"
 
350
  return "(no git info)", "(no git info)"
351
 
352
  latest_hash, latest_msg = get_git_info()
353
+ gr.Markdown("**Git commit:** `{}`\n> {}".format(latest_hash, latest_msg))
354
  gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B) with Lightning LoRA")
355
  gr.Markdown("run Wan 2.2 in just 4-8 steps, with [Lightning LoRA](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Wan22-Lightning), fp8 quantization & AoT compilation - compatible with 🧨 diffusers and ZeroGPU⚡️")
 
356
  with gr.Row():
357
  with gr.Column():
358
  input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)")
359
  resize_mode = gr.Radio(choices=["CROP SOURCE", "RESIZE SOURCE"], value="CROP SOURCE", label="Image Preparation Mode")
360
  cropping_preview_component = gr.Image(type="pil", label="Processing Preview", interactive=False)
361
  prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
362
+ duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info="Clamped to model's {}-{} frames at {}fps.".format(MIN_FRAMES_MODEL, MAX_FRAMES_MODEL, FIXED_FPS))
363
+
364
  with gr.Accordion("Advanced Settings", open=False):
365
  negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
366
  seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
 
372
  generate_button = gr.Button("Generate Video", variant="primary")
373
  with gr.Column():
374
  video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
375
+
376
  ui_inputs = [
377
  input_image_component, prompt_input, steps_slider,
378
  negative_prompt_input, duration_seconds_input,
 
411
  "wan22_input_2.jpg",
412
  "A sleek lunar vehicle glides into view from left to right, kicking up moon dust as astronauts in white spacesuits hop aboard with characteristic lunar bouncing movements. In the distant background, a VTOL craft descends straight down and lands silently on the surface. Throughout the entire scene, ethereal aurora borealis ribbons dance across the star-filled sky, casting shimmering curtains of green, blue, and purple light that bathe the lunar landscape in an otherworldly, magical glow.",
413
  4,
 
 
 
 
 
 
 
414
  ],
415
  [
416
  "kill_bill.jpeg",
417
  "Uma Thurman's character, Beatrix Kiddo, holds her razor-sharp katana blade steady in the cinematic lighting. Suddenly, the polished steel begins to soften and distort, like heated metal starting to lose its structural integrity. The blade's perfect edge slowly warps and droops, molten steel beginning to flow downward in silvery rivulets while maintaining its metallic sheen. The transformation starts subtly at first - a slight bend in the blade - then accelerates as the metal becomes increasingly fluid. The camera holds steady on her face as her piercing eyes gradually narrow, not with lethal focus, but with confusion and growing alarm as she watches her weapon dissolve before her eyes. Her breathing quickens slightly as she witnesses this impossible transformation. The melting intensifies, the katana's perfect form becoming increasingly abstract, dripping like liquid mercury from her grip. Molten droplets fall to the ground with soft metallic impacts. Her expression shifts from calm readiness to bewilderment and concern as her legendary instrument of vengeance literally liquefies in her hands, leaving her defenseless and disoriented.",
418
  6,
 
 
 
 
 
 
 
419
  ],
420
  ],
421
  inputs=ui_inputs, outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
422
  )
423
 
424
  if __name__ == "__main__":
425
+ demo.queue().launch(mcp_server=True)
optimization.py CHANGED
@@ -14,18 +14,21 @@ from torchao.quantization import Int8WeightOnlyConfig
14
 
15
  from optimization_utils import capture_component_call
16
  from optimization_utils import aoti_compile
17
- from optimization_utils import ZeroGPUCompiledModel
18
  from optimization_utils import drain_module_parameters
19
 
20
 
21
  P = ParamSpec('P')
22
 
 
23
 
24
- TRANSFORMER_NUM_FRAMES_DIM = torch.export.Dim('num_frames', min=3, max=21)
 
25
 
26
  TRANSFORMER_DYNAMIC_SHAPES = {
27
  'hidden_states': {
28
- 2: TRANSFORMER_NUM_FRAMES_DIM,
 
 
29
  },
30
  }
31
 
@@ -44,6 +47,7 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
44
  @spaces.GPU(duration=1500)
45
  def compile_transformer():
46
 
 
47
  pipeline.load_lora_weights(
48
  "Kijai/WanVideo_comfy",
49
  weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
@@ -70,61 +74,33 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
70
  quantize_(pipeline.transformer, Float8DynamicActivationFloat8WeightConfig())
71
  quantize_(pipeline.transformer_2, Float8DynamicActivationFloat8WeightConfig())
72
 
73
- hidden_states: torch.Tensor = call.kwargs['hidden_states']
74
- hidden_states_transposed = hidden_states.transpose(-1, -2).contiguous()
75
- if hidden_states.shape[-1] > hidden_states.shape[-2]:
76
- hidden_states_landscape = hidden_states
77
- hidden_states_portrait = hidden_states_transposed
78
- else:
79
- hidden_states_landscape = hidden_states_transposed
80
- hidden_states_portrait = hidden_states
81
-
82
- exported_landscape_1 = torch.export.export(
83
  mod=pipeline.transformer,
84
  args=call.args,
85
- kwargs=call.kwargs | {'hidden_states': hidden_states_landscape},
86
  dynamic_shapes=dynamic_shapes,
87
  )
88
 
89
- exported_portrait_2 = torch.export.export(
90
  mod=pipeline.transformer_2,
91
  args=call.args,
92
- kwargs=call.kwargs | {'hidden_states': hidden_states_portrait},
93
  dynamic_shapes=dynamic_shapes,
94
  )
95
 
96
- compiled_landscape_1 = aoti_compile(exported_landscape_1, INDUCTOR_CONFIGS)
97
- compiled_portrait_2 = aoti_compile(exported_portrait_2, INDUCTOR_CONFIGS)
 
 
98
 
99
- compiled_landscape_2 = ZeroGPUCompiledModel(compiled_landscape_1.archive_file, compiled_portrait_2.weights)
100
- compiled_portrait_1 = ZeroGPUCompiledModel(compiled_portrait_2.archive_file, compiled_landscape_1.weights)
101
-
102
- return (
103
- compiled_landscape_1,
104
- compiled_landscape_2,
105
- compiled_portrait_1,
106
- compiled_portrait_2,
107
- )
108
 
109
  quantize_(pipeline.text_encoder, Int8WeightOnlyConfig())
110
- cl1, cl2, cp1, cp2 = compile_transformer()
111
-
112
- def combined_transformer_1(*args, **kwargs):
113
- hidden_states: torch.Tensor = kwargs['hidden_states']
114
- if hidden_states.shape[-1] > hidden_states.shape[-2]:
115
- return cl1(*args, **kwargs)
116
- else:
117
- return cp1(*args, **kwargs)
118
-
119
- def combined_transformer_2(*args, **kwargs):
120
- hidden_states: torch.Tensor = kwargs['hidden_states']
121
- if hidden_states.shape[-1] > hidden_states.shape[-2]:
122
- return cl2(*args, **kwargs)
123
- else:
124
- return cp2(*args, **kwargs)
125
-
126
- pipeline.transformer.forward = combined_transformer_1
127
  drain_module_parameters(pipeline.transformer)
128
 
129
- pipeline.transformer_2.forward = combined_transformer_2
130
  drain_module_parameters(pipeline.transformer_2)
 
14
 
15
  from optimization_utils import capture_component_call
16
  from optimization_utils import aoti_compile
 
17
  from optimization_utils import drain_module_parameters
18
 
19
 
20
  P = ParamSpec('P')
21
 
22
+ LATENT_FRAMES_DIM = torch.export.Dim('num_latent_frames', min=8, max=81)
23
 
24
+ LATENT_PATCHED_HEIGHT_DIM = torch.export.Dim('latent_patched_height', min=30, max=52)
25
+ LATENT_PATCHED_WIDTH_DIM = torch.export.Dim('latent_patched_width', min=30, max=52)
26
 
27
  TRANSFORMER_DYNAMIC_SHAPES = {
28
  'hidden_states': {
29
+ 2: LATENT_FRAMES_DIM,
30
+ 3: 2 * LATENT_PATCHED_HEIGHT_DIM,
31
+ 4: 2 * LATENT_PATCHED_WIDTH_DIM,
32
  },
33
  }
34
 
 
47
  @spaces.GPU(duration=1500)
48
  def compile_transformer():
49
 
50
+ # This LoRA fusion part remains the same
51
  pipeline.load_lora_weights(
52
  "Kijai/WanVideo_comfy",
53
  weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
 
74
  quantize_(pipeline.transformer, Float8DynamicActivationFloat8WeightConfig())
75
  quantize_(pipeline.transformer_2, Float8DynamicActivationFloat8WeightConfig())
76
 
77
+
78
+ exported_1 = torch.export.export(
 
 
 
 
 
 
 
 
79
  mod=pipeline.transformer,
80
  args=call.args,
81
+ kwargs=call.kwargs,
82
  dynamic_shapes=dynamic_shapes,
83
  )
84
 
85
+ exported_2 = torch.export.export(
86
  mod=pipeline.transformer_2,
87
  args=call.args,
88
+ kwargs=call.kwargs,
89
  dynamic_shapes=dynamic_shapes,
90
  )
91
 
92
+ compiled_1 = aoti_compile(exported_1, INDUCTOR_CONFIGS)
93
+ compiled_2 = aoti_compile(exported_2, INDUCTOR_CONFIGS)
94
+
95
+ return compiled_1, compiled_2
96
 
 
 
 
 
 
 
 
 
 
97
 
98
  quantize_(pipeline.text_encoder, Int8WeightOnlyConfig())
99
+
100
+ compiled_transformer_1, compiled_transformer_2 = compile_transformer()
101
+
102
+ pipeline.transformer.forward = compiled_transformer_1
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  drain_module_parameters(pipeline.transformer)
104
 
105
+ pipeline.transformer_2.forward = compiled_transformer_2
106
  drain_module_parameters(pipeline.transformer_2)
requirements.txt CHANGED
@@ -8,4 +8,4 @@ peft
8
  ftfy
9
  imageio-ffmpeg
10
  opencv-python
11
- torchao==0.11.0
 
8
  ftfy
9
  imageio-ffmpeg
10
  opencv-python
11
+ torchao==0.11.0