jbilcke-hf committed · Commit 4e8d40c (verified) · Parent: 80253b4

Update handler.py

Files changed (1): handler.py (+172 -78)
handler.py CHANGED
@@ -1,12 +1,14 @@
 from dataclasses import dataclass
 from typing import Dict, Any, Optional
 import base64
+import asyncio
 import logging
 import random
 import traceback
 import torch
 from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
 from varnish import Varnish
+from varnish.utils import is_truthy, process_input_image
 
 from enhance_a_video import enable_enhance, inject_enhance_for_hunyuanvideo, set_enhance_weight
 from teacache import enable_teacache, disable_teacache
@@ -15,6 +17,9 @@ from teacache import enable_teacache, disable_teacache
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Check environment variable for pipeline support
+support_image_prompt = is_truthy(os.getenv("SUPPORT_INPUT_IMAGE_PROMPT"))
+
 @dataclass
 class GenerationConfig:
     """Configuration for video generation"""
@@ -51,7 +56,12 @@ class GenerationConfig:
 
     # Enhance-A-Video settings
     enable_enhance_a_video: bool = True
-    enhance_a_video_weight: float = 4.0
+    enhance_a_video_weight: float = 5.0
+
+    # LoRA settings
+    lora_model_name: str = ""         # HuggingFace repo ID or path to LoRA model
+    lora_model_weight_file: str = ""  # Specific weight file to load from the LoRA model
+    lora_model_trigger: str = ""      # Optional trigger word to prepend to the prompt
 
     def validate_and_adjust(self) -> 'GenerationConfig':
         """Validate and adjust parameters"""
@@ -83,25 +93,37 @@ class EndpointHandler:
             subfolder="transformer",
             torch_dtype=torch.bfloat16
         )
-        inject_enhance_for_hunyuanvideo(transformer)
-
-        # Initialize HunyuanVideo pipeline with the enhanced transformer
-        self.pipeline = HunyuanVideoPipeline.from_pretrained(
-            path,
-            transformer=transformer,
-            torch_dtype=torch.float16,
-        ).to(self.device)
-
-
-        # Initialize text encoders in float16
-        self.pipeline.text_encoder = self.pipeline.text_encoder.half()
-        self.pipeline.text_encoder_2 = self.pipeline.text_encoder_2.half()
-
-        # Initialize transformer in bfloat16
-        self.pipeline.transformer = self.pipeline.transformer.to(torch.bfloat16)
-
-        # Initialize VAE in float16
-        self.pipeline.vae = self.pipeline.vae.half()
+
+        if support_image_prompt:
+            # Initialize image-to-video pipeline
+            self.image_to_video = HunyuanImageToVideoPipeline.from_pretrained(
+                path,
+                transformer=transformer,
+                torch_dtype=torch.float16,
+            ).to(self.device)
+
+            # Initialize components in appropriate precision
+            self.image_to_video.text_encoder = self.image_to_video.text_encoder.half()
+            self.image_to_video.text_encoder_2 = self.image_to_video.text_encoder_2.half()
+            self.image_to_video.transformer = self.image_to_video.transformer.to(torch.bfloat16)
+            self.image_to_video.vae = self.image_to_video.vae.half()
+        else:
+            # Initialize text-to-video pipeline
+            self.text_to_video = HunyuanVideoPipeline.from_pretrained(
+                path,
+                transformer=transformer,
+                torch_dtype=torch.float16,
+            ).to(self.device)
+
+            # Initialize components in appropriate precision
+            self.text_to_video.text_encoder = self.text_to_video.text_encoder.half()
+            self.text_to_video.text_encoder_2 = self.text_to_video.text_encoder_2.half()
+            self.text_to_video.transformer = self.text_to_video.transformer.to(torch.bfloat16)
+            self.text_to_video.vae = self.text_to_video.vae.half()
+
+
+        # Initialize LoRA tracking
+        self._current_lora_model = None
 
         # Initialize Varnish for post-processing
         self.varnish = Varnish(
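
Note: the image branch instantiates HunyuanImageToVideoPipeline, but the import hunk at the top of this diff still only imports HunyuanVideoPipeline and HunyuanVideoTransformer3DModel, so that branch would raise a NameError as committed (unless the class is imported elsewhere in the file). The fix would be along these lines, assuming the class is exported under this exact name (the source module is not confirmed by the diff):

    import os  # also needed by the support_image_prompt check at module level
    from diffusers import (
        HunyuanVideoPipeline,
        HunyuanVideoTransformer3DModel,
        HunyuanImageToVideoPipeline,  # assumption: exact export name/module unverified
    )
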
@@ -109,6 +131,56 @@ class EndpointHandler:
             model_base_dir="/repository/varnish"
         )
 
+    async def process_frames(
+        self,
+        frames: torch.Tensor,
+        config: GenerationConfig
+    ) -> tuple[str, dict]:
+        """Post-process generated frames using Varnish
+
+        Args:
+            frames: Generated video frames tensor
+            config: Generation configuration
+
+        Returns:
+            Tuple of (video data URI, metadata dictionary)
+        """
+        try:
+            # Process video with Varnish
+            result = await self.varnish(
+                input_data=frames,
+                fps=config.fps,
+                double_num_frames=config.double_num_frames,
+                super_resolution=config.super_resolution,
+                grain_amount=config.grain_amount,
+                enable_audio=config.enable_audio,
+                audio_prompt=config.audio_prompt,
+                audio_negative_prompt=config.audio_negative_prompt
+            )
+
+            # Convert to data URI
+            video_uri = await result.write(type="data-uri", quality=config.quality)
+
+            # Collect metadata
+            metadata = {
+                "width": result.metadata.width,
+                "height": result.metadata.height,
+                "num_frames": result.metadata.frame_count,
+                "fps": result.metadata.fps,
+                "duration": result.metadata.duration,
+                "seed": config.seed,
+                "enable_teacache": config.enable_teacache,
+                "teacache_threshold": config.teacache_threshold if config.enable_teacache else 0,
+                "enable_enhance_a_video": config.enable_enhance_a_video,
+                "enhance_a_video_weight": config.enhance_a_video_weight if config.enable_enhance_a_video else 0,
+            }
+
+            return video_uri, metadata
+
+        except Exception as e:
+            logger.error(f"Error in process_frames: {str(e)}")
+            raise RuntimeError(f"Failed to process frames: {str(e)}")
+
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """Process video generation requests
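
The new process_frames coroutine centralizes the Varnish post-processing and metadata collection that the old __call__ ran inline through two separate run_until_complete calls. Since __call__ stays synchronous, it still needs an event loop to drive the coroutine; a minimal self-contained sketch of that sync-to-async bridge, with a stub coroutine standing in for process_frames:

    import asyncio
    import torch

    async def process_frames_stub(frames: torch.Tensor) -> tuple[str, dict]:
        # Stand-in for EndpointHandler.process_frames: pretend post-processing.
        await asyncio.sleep(0)
        return "data:video/mp4;base64,...", {"num_frames": int(frames.shape[0])}

    frames = torch.zeros(8, 3, 64, 64)  # dummy (frames, channels, height, width)
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    video_uri, metadata = loop.run_until_complete(process_frames_stub(frames))

asyncio.run(process_frames_stub(frames)) would be the simpler spelling, though it creates and closes a fresh loop per call, while the handler's pattern reuses one across requests.
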
 
@@ -156,7 +228,11 @@ class EndpointHandler:
             teacache_threshold=params.get("teacache_threshold", 0.15),
 
             enable_enhance_a_video=params.get("enable_enhance_a_video", True),
-            enhance_a_video_weight=params.get("enhance_a_video_weight", 4.0)
+            enhance_a_video_weight=params.get("enhance_a_video_weight", 5.0),
+
+            lora_model_name=params.get("lora_model_name", ""),
+            lora_model_weight_file=params.get("lora_model_weight_file", ""),
+            lora_model_trigger=params.get("lora_model_trigger", ""),
         ).validate_and_adjust()
 
         try:
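
Every new parameter is optional with a backward-compatible default (and the enhance weight default moves from 4.0 to 5.0). A hypothetical request payload exercising them, assuming the handler follows the usual Inference Endpoints convention of reading a "parameters" dict (that extraction of params happens in an unchanged part of __call__ not shown in this diff):

    data = {
        "inputs": "a cat walking on grass",
        "parameters": {
            "enhance_a_video_weight": 5.0,
            "lora_model_name": "some-user/some-hunyuan-lora",  # hypothetical repo ID
            "lora_model_weight_file": "",  # empty: let the loader pick the weight file
            "lora_model_trigger": "<cat-style>",
        },
    }
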
@@ -178,77 +254,95 @@ class EndpointHandler:
             #else:
             #    disable_teacache(self.pipeline.transformer)
 
-            # Configure Enhance-A-Video weight if enabled
-            if config.enable_enhance_a_video:
-                set_enhance_weight(config.enhance_a_video_weight)
-                enable_enhance()
-            else:
-                # Reset enhance weight to 0 to effectively disable it
-                set_enhance_weight(0)
-
-            # Generate video frames
             with torch.inference_mode():
-                output = self.pipeline(
-                    prompt=config.prompt,
+                # Configure Enhance-A-Video weight if enabled
+                if config.enable_enhance_a_video:
+                    set_enhance_weight(config.enhance_a_video_weight)
+                    enable_enhance()
+                else:
+                    # Reset enhance weight to 0 to effectively disable it
+                    set_enhance_weight(0)
 
+                # Prepare generation parameters
+                generation_kwargs = {
+                    "prompt": config.prompt,
+
                     # Failed to generate video: HunyuanVideoPipeline.__call__() got an unexpected keyword argument 'negative_prompt'
-                    #negative_prompt=config.negative_prompt,
+                    #"negative_prompt": config.negative_prompt,
 
-                    num_frames=config.num_frames,
-                    height=config.height,
-                    width=config.width,
-                    num_inference_steps=config.num_inference_steps,
-                    guidance_scale=config.guidance_scale,
-                    generator=generator,
-                    output_type="pt",
-                ).frames
-
-                # Process with Varnish
-                import asyncio
+                    "num_frames": config.num_frames,
+                    "height": config.height,
+                    "width": config.width,
+                    "num_inference_steps": config.num_inference_steps,
+                    "guidance_scale": config.guidance_scale,
+                    "generator": generator,
+                    "output_type": "pt",
+                }
+
+                # Handle LoRA loading/unloading
+                if hasattr(self, '_current_lora_model'):
+                    if self._current_lora_model != (config.lora_model_name, config.lora_model_weight_file):
+                        # Unload previous LoRA if it exists and is different
+                        if support_image_prompt and hasattr(self.image_to_video, 'unload_lora_weights'):
+                            self.image_to_video.unload_lora_weights()
+                        else:
+                            if hasattr(self.text_to_video, 'unload_lora_weights'):
+                                self.text_to_video.unload_lora_weights()
+
+                        if config.lora_model_name:
+                            # Load new LoRA
+                            if support_image_prompt and hasattr(self.image_to_video, 'load_lora_weights'):
+                                self.image_to_video.load_lora_weights(
+                                    config.lora_model_name,
+                                    weight_name=config.lora_model_weight_file if config.lora_model_weight_file else None,
+                                    token=hf_token,
+                                )
+                            else:
+                                if hasattr(self.text_to_video, 'load_lora_weights'):
+                                    self.text_to_video.load_lora_weights(
+                                        config.lora_model_name,
+                                        weight_name=config.lora_model_weight_file if config.lora_model_weight_file else None,
+                                        token=hf_token,
+                                    )
+                        self._current_lora_model = (config.lora_model_name, config.lora_model_weight_file)
+
+                # Modify prompt if trigger word is provided
+                if config.lora_model_trigger:
+                    generation_kwargs["prompt"] = f"{config.lora_model_trigger} {generation_kwargs['prompt']}"
+
+                # Check if image-to-video generation is requested
+                if support_image_prompt and input_image:
+                    self._configure_teacache(self.image_to_video, config)
+                    processed_image = process_input_image(
+                        input_image,
+                        config.width,
+                        config.height,
+                        config.input_image_quality,
+                    )
+                    generation_kwargs["image"] = processed_image
+                    frames = self.image_to_video(**generation_kwargs).frames
+                else:
+                    self._configure_teacache(self.text_to_video, config)
+                    frames = self.text_to_video(**generation_kwargs).frames
 
                 try:
                     loop = asyncio.get_event_loop()
                 except RuntimeError:
                     loop = asyncio.new_event_loop()
                     asyncio.set_event_loop(loop)
-
-                result = loop.run_until_complete(
-                    self.varnish(
-                        input_data=output,
-                        fps=config.fps,
-                        double_num_frames=config.double_num_frames,
-                        super_resolution=config.super_resolution,
-                        grain_amount=config.grain_amount,
-                        enable_audio=config.enable_audio,
-                        audio_prompt=config.audio_prompt,
-                        audio_negative_prompt=config.audio_negative_prompt,
-                    )
-                )
-
-                # Get video data URI
-                video_uri = loop.run_until_complete(
-                    result.write(
-                        type="data-uri",
-                        quality=config.quality
-                    )
-                )
+
+                video_uri, metadata = loop.run_until_complete(self.process_frames(frames, config))
 
                 return {
                     "video": video_uri,
                     "content-type": "video/mp4",
-                    "metadata": {
-                        "width": result.metadata.width,
-                        "height": result.metadata.height,
-                        "num_frames": result.metadata.frame_count,
-                        "fps": result.metadata.fps,
-                        "duration": result.metadata.duration,
-                        "seed": config.seed,
-                        "enable_teacache": config.enable_teacache,
-                        "teacache_threshold": config.teacache_threshold if config.enable_teacache else 0,
-                        "enable_enhance_a_video": config.enable_enhance_a_video,
-                        "enhance_a_video_weight": config.enhance_a_video_weight if config.enable_enhance_a_video else 0,
-                    }
+                    "metadata": metadata
                 }
 
         except Exception as e:
             message = f"Error generating video ({str(e)})\n{traceback.format_exc()}"
             logger.error(message)
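
The _current_lora_model tuple acts as a one-slot cache: a LoRA is unloaded and reloaded only when the requested (name, weight file) pair differs from the previous call, avoiding a reload of adapter weights on every request. (hf_token and input_image are presumably defined in parts of handler.py this diff does not touch.) The same pattern in isolation, with a generic diffusers pipeline standing in:

    class LoraSlot:
        """Illustrative one-slot LoRA cache mirroring the handler's logic."""

        def __init__(self, pipe):
            self.pipe = pipe     # any diffusers pipeline with LoRA support
            self.current = None  # (name, weight_file) currently loaded

        def ensure(self, name: str, weight_file: str = "") -> None:
            requested = (name, weight_file)
            if requested == self.current:
                return  # already loaded; skip the expensive reload
            if self.current is not None and hasattr(self.pipe, "unload_lora_weights"):
                self.pipe.unload_lora_weights()
            if name:
                self.pipe.load_lora_weights(name, weight_name=weight_file or None)
            self.current = requested
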
 