jbilcke-hf
/

HunyuanVideo-HFIE

@@ -8,13 +8,11 @@ import traceback
 import torch
 # note: there is no HunyuanImageToVideoPipeline yet in Diffusers
-from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
 from diffusers.hooks import apply_enhance_a_video, EnhanceAVideoConfig
 from varnish import Varnish
 from varnish.utils import is_truthy, process_input_image
-from teacache import enable_teacache, disable_teacache
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -52,12 +50,12 @@ class GenerationConfig:
     audio_negative_prompt: str = "voices, voice, talking, speaking, speech"
     # TeaCache settings
-    enable_teacache: bool = True
     teacache_threshold: float = 0.15 # values: 0 (original), 0.1 (1.6x speedup), 0.15 (2.1x speedup)
     # Enhance-A-Video settings
-    enable_enhance_a_video: bool = True
     enhance_a_video_weight: float = 5.0
     # LoRA settings
@@ -95,7 +93,7 @@ class EndpointHandler:
             subfolder="transformer",
             torch_dtype=torch.bfloat16
         )
         if support_image_prompt:
             raise Exception("Please use a version of Diffusers that supports HunyuanImageToVideoPipeline")
             # # Initialize image-to-video pipeline
@@ -124,6 +122,21 @@ class EndpointHandler:
             self.text_to_video.transformer = self.text_to_video.transformer.to(torch.bfloat16)
             self.text_to_video.vae = self.text_to_video.vae.half()
         # Initialize LoRA tracking
         self._current_lora_model = None
@@ -309,7 +322,6 @@ class EndpointHandler:
                 # Check if image-to-video generation is requested
                 if support_image_prompt and input_image:
-                    self._configure_teacache(self.image_to_video, config)
                     processed_image = process_input_image(
                         input_image,
                         config.width,
@@ -326,8 +338,6 @@ class EndpointHandler:
                     frames = self.image_to_video(**generation_kwargs).frames
                 else:
-                    self._configure_teacache(self.text_to_video, config)
                     apply_enhance_a_video(self.text_to_video.transformer, EnhanceAVideoConfig(
                         weight=config.enhance_a_video_weight if config.enable_enhance_a_video else 0.0,
                         num_frames_callback=lambda: (config.num_frames - 1),

 import torch
 # note: there is no HunyuanImageToVideoPipeline yet in Diffusers
+from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel, FasterCacheConfig
 from diffusers.hooks import apply_enhance_a_video, EnhanceAVideoConfig
 from varnish import Varnish
 from varnish.utils import is_truthy, process_input_image
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
     audio_negative_prompt: str = "voices, voice, talking, speaking, speech"
     # TeaCache settings
+    enable_teacache: bool = False
     teacache_threshold: float = 0.15 # values: 0 (original), 0.1 (1.6x speedup), 0.15 (2.1x speedup)
     # Enhance-A-Video settings
+    enable_enhance_a_video: bool = False
     enhance_a_video_weight: float = 5.0
     # LoRA settings
             subfolder="transformer",
             torch_dtype=torch.bfloat16
         )
         if support_image_prompt:
             raise Exception("Please use a version of Diffusers that supports HunyuanImageToVideoPipeline")
             # # Initialize image-to-video pipeline
             self.text_to_video.transformer = self.text_to_video.transformer.to(torch.bfloat16)
             self.text_to_video.vae = self.text_to_video.vae.half()
+            # enable FasterCache
+            # those values are coming from here:
+            # https://github.com/huggingface/diffusers/pull/10163/files#diff-777f4ee62cb325371233a450e0f6cc0ba357a3fade2ec2dea912260b4f8d08ceR67-R74
+            faster_cache_config = FasterCacheConfig(
+                spatial_attention_block_skip_range=2,
+                spatial_attention_timestep_skip_range=(-1, 901),
+                unconditional_batch_skip_range=2,
+                attention_weight_callback=lambda _: 0.5,
+                is_guidance_distilled=True,
+            )
+            self.text_to_video.transformer.enable_cache(config)
         # Initialize LoRA tracking
         self._current_lora_model = None
                 # Check if image-to-video generation is requested
                 if support_image_prompt and input_image:
                     processed_image = process_input_image(
                         input_image,
                         config.width,
                     frames = self.image_to_video(**generation_kwargs).frames
                 else:
                     apply_enhance_a_video(self.text_to_video.transformer, EnhanceAVideoConfig(
                         weight=config.enhance_a_video_weight if config.enable_enhance_a_video else 0.0,
                         num_frames_callback=lambda: (config.num_frames - 1),