EdBanshee commited on
Commit
db19b41
·
1 Parent(s): 5c8c602

Too many issues; wiped with known-working I2V code and re-added only the frame sizing

Browse files
Files changed (3) hide show
  1. app.py +16 -44
  2. optimization.py +21 -45
  3. requirements.txt +1 -1
app.py CHANGED
@@ -1,6 +1,6 @@
1
- # PyTorch 2.8 (temporary hack)
2
  import os
3
- os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
4
 
5
  # Actual demo code
6
  import spaces
@@ -33,6 +33,7 @@ MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
33
 
34
  DEFAULT_STEPS = 5
35
 
 
36
  pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
37
  transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
38
  subfolder='transformer',
@@ -55,29 +56,18 @@ for i in range(3):
55
  OPTIMIZE_WIDTH = 832
56
  OPTIMIZE_HEIGHT = 624
57
 
58
- # Optimize pipeline at startup (before ZeroGPU packing)
59
- print("🚀 Optimizing pipeline at startup...")
60
- try:
61
- optimize_pipeline_(pipe,
62
- image=Image.new('RGB', (OPTIMIZE_WIDTH, OPTIMIZE_HEIGHT)),
63
- prompt='prompt',
64
- height=OPTIMIZE_HEIGHT,
65
- width=OPTIMIZE_WIDTH,
66
- num_frames=MAX_FRAMES_MODEL,
67
- )
68
-
69
- pipeline_optimized = True
70
- print("✅ Pipeline optimization completed successfully at startup!")
71
- except Exception as e:
72
- pipeline_optimized = False
73
- print("❌ Pipeline optimization failed:", str(e))
74
- raise e
75
 
76
 
77
  default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
78
  default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
79
 
80
-
81
  def resize_image(image: Image.Image, use_crop: bool = True) -> Image.Image:
82
  if image.height > image.width:
83
  transposed = image.transpose(Image.Transpose.ROTATE_90)
@@ -251,6 +241,7 @@ def make_cropping_preview(input_image: Image.Image, use_crop: bool = True) -> Im
251
  # Convert to RGB for Gradio
252
  return preview.convert('RGB')
253
 
 
254
  def get_duration(
255
  input_image,
256
  prompt,
@@ -315,7 +306,6 @@ def generate_video(
315
  gr.Error: If input_image is None (no image uploaded).
316
 
317
  Note:
318
- - The function automatically resizes the input image to the target dimensions
319
  - Frame count is calculated as duration_seconds * FIXED_FPS (24)
320
  - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
321
  - The function uses GPU acceleration via the @spaces.GPU decorator
@@ -324,9 +314,6 @@ def generate_video(
324
  if input_image is None:
325
  raise gr.Error("Please upload an input image.")
326
 
327
- # Pipeline was optimized at startup, proceed with generation
328
- progress(0.1, "Starting video generation...")
329
-
330
  num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
331
  current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
332
  use_crop = resize_mode == "CROP SOURCE"
@@ -363,18 +350,17 @@ with gr.Blocks() as demo:
363
  return "(no git info)", "(no git info)"
364
 
365
  latest_hash, latest_msg = get_git_info()
366
- gr.Markdown(f"**Git commit:** `{latest_hash}`\n> {latest_msg}")
367
  gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B) with Lightning LoRA")
368
  gr.Markdown("run Wan 2.2 in just 4-8 steps, with [Lightning LoRA](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Wan22-Lightning), fp8 quantization & AoT compilation - compatible with 🧨 diffusers and ZeroGPU⚡️")
369
- gr.Markdown("🚀 **Pipeline optimized at startup!** Fast video generation ready.")
370
  with gr.Row():
371
  with gr.Column():
372
  input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)")
373
  resize_mode = gr.Radio(choices=["CROP SOURCE", "RESIZE SOURCE"], value="CROP SOURCE", label="Image Preparation Mode")
374
  cropping_preview_component = gr.Image(type="pil", label="Processing Preview", interactive=False)
375
  prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
376
- duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
377
-
378
  with gr.Accordion("Advanced Settings", open=False):
379
  negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
380
  seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
@@ -386,7 +372,7 @@ with gr.Blocks() as demo:
386
  generate_button = gr.Button("Generate Video", variant="primary")
387
  with gr.Column():
388
  video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
389
-
390
  ui_inputs = [
391
  input_image_component, prompt_input, steps_slider,
392
  negative_prompt_input, duration_seconds_input,
@@ -425,29 +411,15 @@ with gr.Blocks() as demo:
425
  "wan22_input_2.jpg",
426
  "A sleek lunar vehicle glides into view from left to right, kicking up moon dust as astronauts in white spacesuits hop aboard with characteristic lunar bouncing movements. In the distant background, a VTOL craft descends straight down and lands silently on the surface. Throughout the entire scene, ethereal aurora borealis ribbons dance across the star-filled sky, casting shimmering curtains of green, blue, and purple light that bathe the lunar landscape in an otherworldly, magical glow.",
427
  4,
428
- default_negative_prompt,
429
- MAX_DURATION,
430
- 1,
431
- 1,
432
- 42,
433
- False,
434
- "CROP SOURCE",
435
  ],
436
  [
437
  "kill_bill.jpeg",
438
  "Uma Thurman's character, Beatrix Kiddo, holds her razor-sharp katana blade steady in the cinematic lighting. Suddenly, the polished steel begins to soften and distort, like heated metal starting to lose its structural integrity. The blade's perfect edge slowly warps and droops, molten steel beginning to flow downward in silvery rivulets while maintaining its metallic sheen. The transformation starts subtly at first - a slight bend in the blade - then accelerates as the metal becomes increasingly fluid. The camera holds steady on her face as her piercing eyes gradually narrow, not with lethal focus, but with confusion and growing alarm as she watches her weapon dissolve before her eyes. Her breathing quickens slightly as she witnesses this impossible transformation. The melting intensifies, the katana's perfect form becoming increasingly abstract, dripping like liquid mercury from her grip. Molten droplets fall to the ground with soft metallic impacts. Her expression shifts from calm readiness to bewilderment and concern as her legendary instrument of vengeance literally liquefies in her hands, leaving her defenseless and disoriented.",
439
  6,
440
- default_negative_prompt,
441
- MAX_DURATION,
442
- 1,
443
- 1,
444
- 42,
445
- False,
446
- "CROP SOURCE",
447
  ],
448
  ],
449
  inputs=ui_inputs, outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
450
  )
451
 
452
  if __name__ == "__main__":
453
- demo.queue().launch()
 
1
+ # PyTorch 2.8 (hack)
2
  import os
3
+ os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9"')
4
 
5
  # Actual demo code
6
  import spaces
 
33
 
34
  DEFAULT_STEPS = 5
35
 
36
+
37
  pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
38
  transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
39
  subfolder='transformer',
 
56
  OPTIMIZE_WIDTH = 832
57
  OPTIMIZE_HEIGHT = 624
58
 
59
+ optimize_pipeline_(pipe,
60
+ image=Image.new('RGB', (OPTIMIZE_WIDTH, OPTIMIZE_HEIGHT)),
61
+ prompt='prompt',
62
+ height=OPTIMIZE_HEIGHT,
63
+ width=OPTIMIZE_WIDTH,
64
+ num_frames=MAX_FRAMES_MODEL,
65
+ )
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
  default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
69
  default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
70
 
 
71
  def resize_image(image: Image.Image, use_crop: bool = True) -> Image.Image:
72
  if image.height > image.width:
73
  transposed = image.transpose(Image.Transpose.ROTATE_90)
 
241
  # Convert to RGB for Gradio
242
  return preview.convert('RGB')
243
 
244
+
245
  def get_duration(
246
  input_image,
247
  prompt,
 
306
  gr.Error: If input_image is None (no image uploaded).
307
 
308
  Note:
 
309
  - Frame count is calculated as duration_seconds * FIXED_FPS (24)
310
  - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
311
  - The function uses GPU acceleration via the @spaces.GPU decorator
 
314
  if input_image is None:
315
  raise gr.Error("Please upload an input image.")
316
 
 
 
 
317
  num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
318
  current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
319
  use_crop = resize_mode == "CROP SOURCE"
 
350
  return "(no git info)", "(no git info)"
351
 
352
  latest_hash, latest_msg = get_git_info()
353
+ gr.Markdown("**Git commit:** `{}`\n> {}".format(latest_hash, latest_msg))
354
  gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B) with Lightning LoRA")
355
  gr.Markdown("run Wan 2.2 in just 4-8 steps, with [Lightning LoRA](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Wan22-Lightning), fp8 quantization & AoT compilation - compatible with 🧨 diffusers and ZeroGPU⚡️")
 
356
  with gr.Row():
357
  with gr.Column():
358
  input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)")
359
  resize_mode = gr.Radio(choices=["CROP SOURCE", "RESIZE SOURCE"], value="CROP SOURCE", label="Image Preparation Mode")
360
  cropping_preview_component = gr.Image(type="pil", label="Processing Preview", interactive=False)
361
  prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
362
+ duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5, label="Duration (seconds)", info="Clamped to model's {}-{} frames at {}fps.".format(MIN_FRAMES_MODEL, MAX_FRAMES_MODEL, FIXED_FPS))
363
+
364
  with gr.Accordion("Advanced Settings", open=False):
365
  negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
366
  seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
 
372
  generate_button = gr.Button("Generate Video", variant="primary")
373
  with gr.Column():
374
  video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
375
+
376
  ui_inputs = [
377
  input_image_component, prompt_input, steps_slider,
378
  negative_prompt_input, duration_seconds_input,
 
411
  "wan22_input_2.jpg",
412
  "A sleek lunar vehicle glides into view from left to right, kicking up moon dust as astronauts in white spacesuits hop aboard with characteristic lunar bouncing movements. In the distant background, a VTOL craft descends straight down and lands silently on the surface. Throughout the entire scene, ethereal aurora borealis ribbons dance across the star-filled sky, casting shimmering curtains of green, blue, and purple light that bathe the lunar landscape in an otherworldly, magical glow.",
413
  4,
 
 
 
 
 
 
 
414
  ],
415
  [
416
  "kill_bill.jpeg",
417
  "Uma Thurman's character, Beatrix Kiddo, holds her razor-sharp katana blade steady in the cinematic lighting. Suddenly, the polished steel begins to soften and distort, like heated metal starting to lose its structural integrity. The blade's perfect edge slowly warps and droops, molten steel beginning to flow downward in silvery rivulets while maintaining its metallic sheen. The transformation starts subtly at first - a slight bend in the blade - then accelerates as the metal becomes increasingly fluid. The camera holds steady on her face as her piercing eyes gradually narrow, not with lethal focus, but with confusion and growing alarm as she watches her weapon dissolve before her eyes. Her breathing quickens slightly as she witnesses this impossible transformation. The melting intensifies, the katana's perfect form becoming increasingly abstract, dripping like liquid mercury from her grip. Molten droplets fall to the ground with soft metallic impacts. Her expression shifts from calm readiness to bewilderment and concern as her legendary instrument of vengeance literally liquefies in her hands, leaving her defenseless and disoriented.",
418
  6,
 
 
 
 
 
 
 
419
  ],
420
  ],
421
  inputs=ui_inputs, outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
422
  )
423
 
424
  if __name__ == "__main__":
425
+ demo.queue().launch(mcp_server=True)
optimization.py CHANGED
@@ -14,18 +14,21 @@ from torchao.quantization import Int8WeightOnlyConfig
14
 
15
  from optimization_utils import capture_component_call
16
  from optimization_utils import aoti_compile
17
- from optimization_utils import ZeroGPUCompiledModel
18
  from optimization_utils import drain_module_parameters
19
 
20
 
21
  P = ParamSpec('P')
22
 
 
23
 
24
- TRANSFORMER_NUM_FRAMES_DIM = torch.export.Dim('num_frames', min=3, max=21)
 
25
 
26
  TRANSFORMER_DYNAMIC_SHAPES = {
27
  'hidden_states': {
28
- 2: TRANSFORMER_NUM_FRAMES_DIM,
 
 
29
  },
30
  }
31
 
@@ -44,6 +47,7 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
44
  @spaces.GPU(duration=1500)
45
  def compile_transformer():
46
 
 
47
  pipeline.load_lora_weights(
48
  "Kijai/WanVideo_comfy",
49
  weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
@@ -70,61 +74,33 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
70
  quantize_(pipeline.transformer, Float8DynamicActivationFloat8WeightConfig())
71
  quantize_(pipeline.transformer_2, Float8DynamicActivationFloat8WeightConfig())
72
 
73
- hidden_states: torch.Tensor = call.kwargs['hidden_states']
74
- hidden_states_transposed = hidden_states.transpose(-1, -2).contiguous()
75
- if hidden_states.shape[-1] > hidden_states.shape[-2]:
76
- hidden_states_landscape = hidden_states
77
- hidden_states_portrait = hidden_states_transposed
78
- else:
79
- hidden_states_landscape = hidden_states_transposed
80
- hidden_states_portrait = hidden_states
81
-
82
- exported_landscape_1 = torch.export.export(
83
  mod=pipeline.transformer,
84
  args=call.args,
85
- kwargs=call.kwargs | {'hidden_states': hidden_states_landscape},
86
  dynamic_shapes=dynamic_shapes,
87
  )
88
 
89
- exported_portrait_2 = torch.export.export(
90
  mod=pipeline.transformer_2,
91
  args=call.args,
92
- kwargs=call.kwargs | {'hidden_states': hidden_states_portrait},
93
  dynamic_shapes=dynamic_shapes,
94
  )
95
 
96
- compiled_landscape_1 = aoti_compile(exported_landscape_1, INDUCTOR_CONFIGS)
97
- compiled_portrait_2 = aoti_compile(exported_portrait_2, INDUCTOR_CONFIGS)
 
 
98
 
99
- compiled_landscape_2 = ZeroGPUCompiledModel(compiled_landscape_1.archive_file, compiled_portrait_2.weights)
100
- compiled_portrait_1 = ZeroGPUCompiledModel(compiled_portrait_2.archive_file, compiled_landscape_1.weights)
101
-
102
- return (
103
- compiled_landscape_1,
104
- compiled_landscape_2,
105
- compiled_portrait_1,
106
- compiled_portrait_2,
107
- )
108
 
109
  quantize_(pipeline.text_encoder, Int8WeightOnlyConfig())
110
- cl1, cl2, cp1, cp2 = compile_transformer()
111
-
112
- def combined_transformer_1(*args, **kwargs):
113
- hidden_states: torch.Tensor = kwargs['hidden_states']
114
- if hidden_states.shape[-1] > hidden_states.shape[-2]:
115
- return cl1(*args, **kwargs)
116
- else:
117
- return cp1(*args, **kwargs)
118
-
119
- def combined_transformer_2(*args, **kwargs):
120
- hidden_states: torch.Tensor = kwargs['hidden_states']
121
- if hidden_states.shape[-1] > hidden_states.shape[-2]:
122
- return cl2(*args, **kwargs)
123
- else:
124
- return cp2(*args, **kwargs)
125
-
126
- pipeline.transformer.forward = combined_transformer_1
127
  drain_module_parameters(pipeline.transformer)
128
 
129
- pipeline.transformer_2.forward = combined_transformer_2
130
  drain_module_parameters(pipeline.transformer_2)
 
14
 
15
  from optimization_utils import capture_component_call
16
  from optimization_utils import aoti_compile
 
17
  from optimization_utils import drain_module_parameters
18
 
19
 
20
  P = ParamSpec('P')
21
 
22
+ LATENT_FRAMES_DIM = torch.export.Dim('num_latent_frames', min=8, max=81)
23
 
24
+ LATENT_PATCHED_HEIGHT_DIM = torch.export.Dim('latent_patched_height', min=30, max=52)
25
+ LATENT_PATCHED_WIDTH_DIM = torch.export.Dim('latent_patched_width', min=30, max=52)
26
 
27
  TRANSFORMER_DYNAMIC_SHAPES = {
28
  'hidden_states': {
29
+ 2: LATENT_FRAMES_DIM,
30
+ 3: 2 * LATENT_PATCHED_HEIGHT_DIM,
31
+ 4: 2 * LATENT_PATCHED_WIDTH_DIM,
32
  },
33
  }
34
 
 
47
  @spaces.GPU(duration=1500)
48
  def compile_transformer():
49
 
50
+ # This LoRA fusion part remains the same
51
  pipeline.load_lora_weights(
52
  "Kijai/WanVideo_comfy",
53
  weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
 
74
  quantize_(pipeline.transformer, Float8DynamicActivationFloat8WeightConfig())
75
  quantize_(pipeline.transformer_2, Float8DynamicActivationFloat8WeightConfig())
76
 
77
+
78
+ exported_1 = torch.export.export(
 
 
 
 
 
 
 
 
79
  mod=pipeline.transformer,
80
  args=call.args,
81
+ kwargs=call.kwargs,
82
  dynamic_shapes=dynamic_shapes,
83
  )
84
 
85
+ exported_2 = torch.export.export(
86
  mod=pipeline.transformer_2,
87
  args=call.args,
88
+ kwargs=call.kwargs,
89
  dynamic_shapes=dynamic_shapes,
90
  )
91
 
92
+ compiled_1 = aoti_compile(exported_1, INDUCTOR_CONFIGS)
93
+ compiled_2 = aoti_compile(exported_2, INDUCTOR_CONFIGS)
94
+
95
+ return compiled_1, compiled_2
96
 
 
 
 
 
 
 
 
 
 
97
 
98
  quantize_(pipeline.text_encoder, Int8WeightOnlyConfig())
99
+
100
+ compiled_transformer_1, compiled_transformer_2 = compile_transformer()
101
+
102
+ pipeline.transformer.forward = compiled_transformer_1
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  drain_module_parameters(pipeline.transformer)
104
 
105
+ pipeline.transformer_2.forward = compiled_transformer_2
106
  drain_module_parameters(pipeline.transformer_2)
requirements.txt CHANGED
@@ -8,4 +8,4 @@ peft
8
  ftfy
9
  imageio-ffmpeg
10
  opencv-python
11
- torchao==0.11.0
 
8
  ftfy
9
  imageio-ffmpeg
10
  opencv-python
11
+ torchao==0.11.0