DawnC committed on
Commit
c56db8d
·
verified ·
1 Parent(s): 20e73d1

Upload 2 files

Browse files
Files changed (2) hide show
  1. FlowFacade.py +7 -8
  2. VideoEngine_optimized.py +355 -0
FlowFacade.py CHANGED
@@ -3,7 +3,7 @@ import torch
3
  import numpy as np
4
  from PIL import Image
5
  from typing import Tuple, Optional
6
- from VideoEngine import VideoEngine
7
  from TextProcessor import TextProcessor
8
 
9
  try:
@@ -29,7 +29,7 @@ class FlowFacade:
29
  def _calculate_gpu_duration(self, image: Image.Image, duration_seconds: float,
30
  num_inference_steps: int, enable_prompt_expansion: bool, **kwargs) -> int:
31
  BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
32
- BASE_STEP_DURATION = 20 # Sequential CPU offload (conservative estimate)
33
 
34
  resized_image = self.video_engine.resize_image(image)
35
  width, height = resized_image.width, resized_image.height
@@ -39,16 +39,15 @@ class FlowFacade:
39
  step_duration = BASE_STEP_DURATION * factor ** 1.5
40
  total_duration = int(num_inference_steps) * step_duration
41
 
42
- # Add overhead for first-time model loading (CPU LoRA fusion)
43
  if not self.video_engine.is_loaded:
44
- total_duration += 90 # ~90s for CPU LoRA fusion
45
 
46
  if enable_prompt_expansion:
47
- total_duration += 60
48
 
49
- # Conservative minimum: 300 seconds (5 minutes)
50
- # No more NVML errors! Just need enough time for sequential offload
51
- return max(int(total_duration), 300)
52
 
53
  @spaces.GPU(duration=_calculate_gpu_duration)
54
  def generate_video_from_image(self, image: Image.Image, user_instruction: str,
 
3
  import numpy as np
4
  from PIL import Image
5
  from typing import Tuple, Optional
6
+ from VideoEngine_optimized import VideoEngine
7
  from TextProcessor import TextProcessor
8
 
9
  try:
 
29
  def _calculate_gpu_duration(self, image: Image.Image, duration_seconds: float,
30
  num_inference_steps: int, enable_prompt_expansion: bool, **kwargs) -> int:
31
  BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
32
+ BASE_STEP_DURATION = 8 # FP8 + AOTI optimized (fast direct GPU)
33
 
34
  resized_image = self.video_engine.resize_image(image)
35
  width, height = resized_image.width, resized_image.height
 
39
  step_duration = BASE_STEP_DURATION * factor ** 1.5
40
  total_duration = int(num_inference_steps) * step_duration
41
 
42
+ # Add overhead for first-time model loading (FP8 quantization + AOTI)
43
  if not self.video_engine.is_loaded:
44
+ total_duration += 60 # ~60s for FP8 quantization and AOTI loading
45
 
46
  if enable_prompt_expansion:
47
+ total_duration += 40
48
 
49
+ # Optimized minimum: 90 seconds (FP8 + AOTI is much faster)
50
+ return max(int(total_duration), 90)
 
51
 
52
  @spaces.GPU(duration=_calculate_gpu_duration)
53
  def generate_video_from_image(self, image: Image.Image, user_instruction: str,
VideoEngine_optimized.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DeltaFlow - Video Engine (FP8 + AOTI Optimized)
3
+ Ultra-fast Image-to-Video generation using Wan2.2-I2V-A14B
4
+ Features: Lightning LoRA + FP8 Quantization + AOTI Compilation
5
+ ~30-40s inference (vs 150s baseline)
6
+ """
7
+
8
+ import warnings
9
+ warnings.filterwarnings('ignore', category=FutureWarning)
10
+ warnings.filterwarnings('ignore', category=DeprecationWarning)
11
+
12
+ import gc
13
+ import os
14
+ import tempfile
15
+ import traceback
16
+ from typing import Optional
17
+
18
+ import torch
19
+ import numpy as np
20
+ from PIL import Image
21
+
22
+ # Critical dependencies
23
+ import ftfy
24
+ import sentencepiece
25
+
26
+ # Diffusers imports
27
+ from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
28
+ from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
29
+ from diffusers.utils.export_utils import export_to_video
30
+
31
+
32
class VideoEngine:
    """
    Ultra-fast video generation with FP8 quantization and AOTI compilation.
    30-40s inference time (compared to 150s baseline).
    """

    # Hugging Face repositories: base I2V pipeline, a bf16 transformer
    # re-export, and the Lightning step-distillation LoRA weights.
    MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
    TRANSFORMER_REPO = "cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers"
    LORA_REPO = "Kijai/WanVideo_comfy"
    LORA_WEIGHT = "Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors"

    # Model parameters
    MAX_DIM = 832        # longest allowed output edge, px
    MIN_DIM = 480        # shortest allowed output edge, px
    SQUARE_DIM = 640     # fixed size used for exactly-square inputs
    MULTIPLE_OF = 16     # output dims are snapped to this multiple
    FIXED_FPS = 16       # output video frame rate
    MIN_FRAMES = 8       # lower clamp for the frame-count calculation
    MAX_FRAMES = 81      # upper frame limit (81 frames = ~5s @ 16fps)
52
+ def __init__(self):
53
+ """Initialize VideoEngine."""
54
+ self.is_spaces = os.environ.get('SPACE_ID') is not None
55
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
56
+ self.pipeline: Optional[WanImageToVideoPipeline] = None
57
+ self.is_loaded = False
58
+ self.use_aoti = False
59
+
60
+ print(f"✓ VideoEngine initialized ({self.device})")
61
+
62
+ def _check_xformers_available(self) -> bool:
63
+ """Check if xFormers is available."""
64
+ try:
65
+ import xformers
66
+ return True
67
+ except ImportError:
68
+ return False
69
+
70
+ def load_model(self) -> None:
71
+ """Load model with FP8 quantization and AOTI compilation."""
72
+ if self.is_loaded:
73
+ print("⚠ VideoEngine already loaded")
74
+ return
75
+
76
+ try:
77
+ print("=" * 60)
78
+ print("Loading Wan2.2 I2V Engine with FP8 + AOTI")
79
+ print("=" * 60)
80
+
81
+ # Stage 1: Load base pipeline to CPU
82
+ print("→ [1/5] Loading base pipeline to CPU...")
83
+ self.pipeline = WanImageToVideoPipeline.from_pretrained(
84
+ self.MODEL_ID,
85
+ transformer=WanTransformer3DModel.from_pretrained(
86
+ self.TRANSFORMER_REPO,
87
+ subfolder='transformer',
88
+ torch_dtype=torch.bfloat16,
89
+ ),
90
+ transformer_2=WanTransformer3DModel.from_pretrained(
91
+ self.TRANSFORMER_REPO,
92
+ subfolder='transformer_2',
93
+ torch_dtype=torch.bfloat16,
94
+ ),
95
+ torch_dtype=torch.bfloat16,
96
+ )
97
+ print("✓ Base pipeline loaded to CPU")
98
+
99
+ # Stage 2: Load and fuse Lightning LoRA
100
+ print("→ [2/5] Loading Lightning LoRA...")
101
+ self.pipeline.load_lora_weights(
102
+ self.LORA_REPO, weight_name=self.LORA_WEIGHT,
103
+ adapter_name="lightx2v"
104
+ )
105
+ kwargs_lora = {"load_into_transformer_2": True}
106
+ self.pipeline.load_lora_weights(
107
+ self.LORA_REPO, weight_name=self.LORA_WEIGHT,
108
+ adapter_name="lightx2v_2", **kwargs_lora
109
+ )
110
+ self.pipeline.set_adapters(
111
+ ["lightx2v", "lightx2v_2"],
112
+ adapter_weights=[1., 1.]
113
+ )
114
+ self.pipeline.fuse_lora(
115
+ adapter_names=["lightx2v"], lora_scale=3.,
116
+ components=["transformer"]
117
+ )
118
+ self.pipeline.fuse_lora(
119
+ adapter_names=["lightx2v_2"], lora_scale=1.,
120
+ components=["transformer_2"]
121
+ )
122
+ self.pipeline.unload_lora_weights()
123
+ print("✓ Lightning LoRA fused")
124
+
125
+ # Stage 3: FP8 Quantization
126
+ print("→ [3/5] Applying FP8 quantization...")
127
+ try:
128
+ from torchao.quantization import quantize_
129
+ from torchao.quantization import (
130
+ Float8DynamicActivationFloat8WeightConfig,
131
+ Int8WeightOnlyConfig
132
+ )
133
+
134
+ # Quantize text encoder (INT8)
135
+ quantize_(self.pipeline.text_encoder, Int8WeightOnlyConfig())
136
+
137
+ # Quantize transformers (FP8)
138
+ quantize_(
139
+ self.pipeline.transformer,
140
+ Float8DynamicActivationFloat8WeightConfig()
141
+ )
142
+ quantize_(
143
+ self.pipeline.transformer_2,
144
+ Float8DynamicActivationFloat8WeightConfig()
145
+ )
146
+
147
+ print("✓ FP8 quantization applied (50% memory reduction)")
148
+ except Exception as e:
149
+ print(f"⚠ Quantization failed: {e}")
150
+ raise RuntimeError("FP8 quantization required for this optimized version")
151
+
152
+ # Stage 4: Load AOTI blocks
153
+ print("→ [4/5] Loading AOTI blocks...")
154
+ try:
155
+ import aoti
156
+
157
+ aoti.aoti_blocks_load(
158
+ self.pipeline.transformer,
159
+ 'zerogpu-aoti/Wan2',
160
+ variant='fp8da'
161
+ )
162
+ aoti.aoti_blocks_load(
163
+ self.pipeline.transformer_2,
164
+ 'zerogpu-aoti/Wan2',
165
+ variant='fp8da'
166
+ )
167
+ print("✓ AOTI blocks loaded (1.5-1.8x speedup)")
168
+ self.use_aoti = True
169
+ except Exception as e:
170
+ print(f"⚠ AOTI loading failed: {e}")
171
+ print(" Continuing without AOTI (FP8 only)")
172
+ self.use_aoti = False
173
+
174
+ # Stage 5: Move to GPU and enable optimizations
175
+ print("→ [5/5] Moving to GPU...")
176
+ gc.collect()
177
+ if torch.cuda.is_available():
178
+ torch.cuda.empty_cache()
179
+
180
+ self.pipeline = self.pipeline.to('cuda')
181
+
182
+ # Enable VAE optimizations
183
+ self.pipeline.enable_vae_tiling()
184
+ self.pipeline.enable_vae_slicing()
185
+
186
+ # Enable TF32
187
+ if torch.cuda.is_available():
188
+ torch.backends.cuda.matmul.allow_tf32 = True
189
+ torch.backends.cudnn.allow_tf32 = True
190
+
191
+ # Enable xFormers
192
+ try:
193
+ if self._check_xformers_available():
194
+ self.pipeline.enable_xformers_memory_efficient_attention()
195
+ print(" • xFormers enabled")
196
+ except:
197
+ pass
198
+
199
+ self.is_loaded = True
200
+ mode = "FP8 + AOTI" if self.use_aoti else "FP8 only"
201
+ print("=" * 60)
202
+ print(f"✓ VideoEngine Ready - {mode}")
203
+ print(f" • Device: {self.device}")
204
+ print(f" • Quantization: FP8 (50% memory reduction)")
205
+ print(f" • AOTI: {'Enabled (1.5-1.8x speedup)' if self.use_aoti else 'Disabled'}")
206
+ print(f" • Expected inference: {'~30-40s' if self.use_aoti else '~60-70s'}")
207
+ print("=" * 60)
208
+
209
+ except Exception as e:
210
+ print(f"\n{'='*60}")
211
+ print("✗ FATAL ERROR LOADING VIDEO ENGINE")
212
+ print(f"{'='*60}")
213
+ print(f"Error Type: {type(e).__name__}")
214
+ print(f"Error Message: {str(e)}")
215
+ print(f"\nFull Traceback:")
216
+ print(traceback.format_exc())
217
+ print(f"{'='*60}")
218
+ raise
219
+
220
+ def resize_image(self, image: Image.Image) -> Image.Image:
221
+ """Resize image to fit model constraints while preserving aspect ratio."""
222
+ width, height = image.size
223
+
224
+ if width == height:
225
+ return image.resize((self.SQUARE_DIM, self.SQUARE_DIM), Image.LANCZOS)
226
+
227
+ aspect_ratio = width / height
228
+ MAX_ASPECT_RATIO = self.MAX_DIM / self.MIN_DIM
229
+ MIN_ASPECT_RATIO = self.MIN_DIM / self.MAX_DIM
230
+
231
+ image_to_resize = image
232
+
233
+ if aspect_ratio > MAX_ASPECT_RATIO:
234
+ target_w, target_h = self.MAX_DIM, self.MIN_DIM
235
+ crop_width = int(round(height * MAX_ASPECT_RATIO))
236
+ left = (width - crop_width) // 2
237
+ image_to_resize = image.crop((left, 0, left + crop_width, height))
238
+ elif aspect_ratio < MIN_ASPECT_RATIO:
239
+ target_w, target_h = self.MIN_DIM, self.MAX_DIM
240
+ crop_height = int(round(width / MIN_ASPECT_RATIO))
241
+ top = (height - crop_height) // 2
242
+ image_to_resize = image.crop((0, top, width, top + crop_height))
243
+ else:
244
+ if width > height:
245
+ target_w = self.MAX_DIM
246
+ target_h = int(round(target_w / aspect_ratio))
247
+ else:
248
+ target_h = self.MAX_DIM
249
+ target_w = int(round(target_h * aspect_ratio))
250
+
251
+ final_w = round(target_w / self.MULTIPLE_OF) * self.MULTIPLE_OF
252
+ final_h = round(target_h / self.MULTIPLE_OF) * self.MULTIPLE_OF
253
+ final_w = max(self.MIN_DIM, min(self.MAX_DIM, final_w))
254
+ final_h = max(self.MIN_DIM, min(self.MAX_DIM, final_h))
255
+
256
+ return image_to_resize.resize((final_w, final_h), Image.LANCZOS)
257
+
258
+ def get_num_frames(self, duration_seconds: float) -> int:
259
+ """Calculate frame count from duration."""
260
+ return 1 + int(np.clip(
261
+ int(round(duration_seconds * self.FIXED_FPS)),
262
+ self.MIN_FRAMES,
263
+ self.MAX_FRAMES,
264
+ ))
265
+
266
    def generate_video(
        self,
        image: Image.Image,
        prompt: str,
        duration_seconds: float = 3.0,
        num_inference_steps: int = 4,
        guidance_scale: float = 1.0,
        guidance_scale_2: float = 1.0,
        seed: int = 42,
    ) -> str:
        """Generate video from image with FP8 + AOTI optimization.

        Args:
            image: Source frame; resized/cropped via resize_image() first.
            prompt: Text description of the desired motion/content.
            duration_seconds: Clip length, converted to a frame count at
                FIXED_FPS by get_num_frames().
            num_inference_steps: Diffusion steps (default 4 -- presumably
                suits the fused step-distilled LoRA; confirm).
            guidance_scale: Guidance passed to the first transformer stage.
            guidance_scale_2: Guidance passed to the second transformer stage.
            seed: RNG seed; also used to name the output file.

        Returns:
            Path to the exported MP4 in the system temp directory.

        Raises:
            RuntimeError: if load_model() has not been called.
            Exception: any generation error is logged with a traceback
                and re-raised.
        """
        if not self.is_loaded:
            raise RuntimeError("VideoEngine not loaded. Call load_model() first.")

        try:
            resized_image = self.resize_image(image)
            num_frames = self.get_num_frames(duration_seconds)

            print(f"\n→ Generating video:")
            print(f" • Prompt: {prompt}")
            print(f" • Resolution: {resized_image.width}x{resized_image.height}")
            print(f" • Frames: {num_frames} ({duration_seconds}s @ {self.FIXED_FPS}fps)")
            print(f" • Steps: {num_inference_steps}")
            print(f" • Mode: {'FP8 + AOTI' if self.use_aoti else 'FP8 only'}")

            # Memory cleanup before inference; synchronize so the cache
            # release actually completes before the big allocation.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

            with torch.no_grad():
                # Use CUDA generator for optimized version
                # (the pipeline lives on the GPU after load_model()).
                generator = torch.Generator(device="cuda").manual_seed(seed)

                # .frames is batched; [0] extracts the single generated clip.
                output_frames = self.pipeline(
                    image=resized_image,
                    prompt=prompt,
                    height=resized_image.height,
                    width=resized_image.width,
                    num_frames=num_frames,
                    guidance_scale=float(guidance_scale),
                    guidance_scale_2=float(guidance_scale_2),
                    num_inference_steps=int(num_inference_steps),
                    generator=generator,
                ).frames[0]

            # Cleanup after generation
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Export video. NOTE(review): the path is keyed on seed only,
            # so two runs with the same seed overwrite each other's file.
            temp_dir = tempfile.gettempdir()
            output_path = os.path.join(temp_dir, f"deltaflow_{seed}.mp4")
            export_to_video(output_frames, output_path, fps=self.FIXED_FPS)

            print(f"✓ Video generated: {output_path}")
            return output_path

        except Exception as e:
            print(f"\n{'='*60}")
            print("✗ FATAL ERROR DURING VIDEO GENERATION")
            print(f"{'='*60}")
            print(f"Error Type: {type(e).__name__}")
            print(f"Error Message: {str(e)}")
            print(f"\nFull Traceback:")
            print(traceback.format_exc())
            print(f"{'='*60}")
            raise
336
+
337
+ def unload_model(self) -> None:
338
+ """Unload pipeline and free memory."""
339
+ if not self.is_loaded:
340
+ return
341
+
342
+ try:
343
+ if self.pipeline is not None:
344
+ del self.pipeline
345
+ self.pipeline = None
346
+
347
+ gc.collect()
348
+ if torch.cuda.is_available():
349
+ torch.cuda.empty_cache()
350
+
351
+ self.is_loaded = False
352
+ print("✓ VideoEngine unloaded")
353
+
354
+ except Exception as e:
355
+ print(f"⚠ Error during unload: {str(e)}")