File size: 23,361 Bytes

56f2217

import hashlib
import os
from typing import List, Optional, Union

import torch
from diffusers import FluxModularPipeline, ModularPipelineBlocks
from diffusers.loaders import FluxLoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.modular_pipelines import PipelineState
from diffusers.modular_pipelines.modular_pipeline_utils import (
    ComponentSpec,
    InputParam,
    OutputParam,
)
from diffusers.utils import (
    USE_PEFT_BACKEND,
    logger,
    scale_lora_layers,
    unscale_lora_layers,
)
from safetensors import safe_open
from safetensors.torch import save_file
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast


class CachedFluxTextEncoderStep(ModularPipelineBlocks):
    model_name = "flux"

    def __init__(
        self,
        use_cache: bool = True,
        cache_dir: Optional[str] = None,
        load_from_disk: bool = True,
    ) -> None:
        """Initialize the cached Flux text encoder step.

        Args:
            use_cache: Whether to enable caching of prompt embeddings. Defaults to True.
            cache_dir: Directory to store cache files. If None, uses ~/.cache/flux_prompt_cache.
            load_from_disk: Whether to load existing cache from disk on initialization. Defaults to True.
        """
        super().__init__()
        self.cache = {} if use_cache else None
        if use_cache:
            self.cache_dir = cache_dir or os.path.join(
                os.path.expanduser("~"), ".cache", "flux_prompt_cache"
            )
            os.makedirs(self.cache_dir, exist_ok=True)
        else:
            self.cache_dir = None

        # Load existing cache if requested
        if load_from_disk and use_cache:
            self.load_cache_from_disk()

    @property
    def description(self) -> str:
        return "Text Encoder step that generate text_embeddings to guide the video generation"

    @property
    def expected_components(self):
        return [
            ComponentSpec("text_encoder", CLIPTextModel),
            ComponentSpec("tokenizer", CLIPTokenizer),
            ComponentSpec("text_encoder_2", T5EncoderModel),
            ComponentSpec("tokenizer_2", T5TokenizerFast),
        ]

    @property
    def expected_configs(self):
        return []

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("prompt"),
            InputParam("prompt_2"),
            InputParam("joint_attention_kwargs"),
        ]

    @property
    def intermediate_outputs(self):
        return [
            OutputParam(
                "prompt_embeds",
                type_hint=torch.Tensor,
                description="text embeddings used to guide the image generation",
            ),
            OutputParam(
                "pooled_prompt_embeds",
                type_hint=torch.Tensor,
                description="pooled text embeddings used to guide the image generation",
            ),
            OutputParam(
                "text_ids",
                type_hint=torch.Tensor,
                description="ids from the text sequence for RoPE",
            ),
        ]

    @staticmethod
    def check_inputs(block_state):
        for prompt in [block_state.prompt, block_state.prompt_2]:
            if prompt is not None and (
                not isinstance(prompt, str) and not isinstance(prompt, list)
            ):
                raise ValueError(
                    f"`prompt` or `prompt_2` has to be of type `str` or `list` but is {type(prompt)}"
                )

    def save_cache_to_disk(self):
        """Save the current cache to disk as a safetensors file."""
        if not self.cache or not self.cache_dir:
            return

        cache_file = os.path.join(self.cache_dir, "cache.safetensors")

        # Prepare tensors dict for safetensors
        tensors_to_save = {}
        for key, tensor in self.cache.items():
            # Ensure tensor is on CPU before saving
            cpu_tensor = (
                tensor.cpu() if tensor.device != torch.device("cpu") else tensor
            )
            tensors_to_save[key] = cpu_tensor

        # Save tensors
        save_file(tensors_to_save, cache_file)
        logger.info(f"Saved {len(tensors_to_save)} cached embeddings to {cache_file}")

    def load_cache_from_disk(self):
        """Load cache from disk using memory-mapped safetensors."""
        if not self.cache_dir or self.cache is None:
            return

        cache_file = os.path.join(self.cache_dir, "cache.safetensors")

        if not os.path.exists(cache_file):
            return

        try:
            # Open safetensors file in context manager
            with safe_open(cache_file, framework="pt", device="cpu") as f:
                loaded_count = 0
                for key in f.keys():
                    self.cache[key] = f.get_tensor(key)
                    loaded_count += 1

                logger.debug(
                    f"Loaded {loaded_count} cached embeddings from {cache_file} (memory-mapped)"
                )
        except Exception as e:
            logger.warning(f"Failed to load cache from disk: {e}")

    def clear_cache_from_disk(self):
        """Clear cached safetensors file from disk."""
        if not self.cache_dir:
            return

        cache_file = os.path.join(self.cache_dir, "cache.safetensors")
        if os.path.exists(cache_file):
            os.remove(cache_file)
            logger.info(f"Cleared cache file: {cache_file}")

        # Also clear the in-memory cache
        if self.cache:
            self.cache.clear()

    def get_cache_size(self):
        """Get the current cache size in MB."""
        if not self.cache_dir:
            return 0

        cache_file = os.path.join(self.cache_dir, "cache.safetensors")
        if os.path.exists(cache_file):
            return os.path.getsize(cache_file) / (1024 * 1024)  # Convert to MB
        return 0

    @staticmethod
    def _to_cache_key(prompt: str) -> str:
        """Generate a hash key for a single prompt string."""
        return hashlib.sha256(prompt.encode()).hexdigest()

    @staticmethod
    def _get_cached_prompt_embeds(prompts, cache_instance, cache_suffix, device=None):
        """Split prompts into cached and new, returning indices for reconstruction.

        Args:
            prompts: List of prompt strings to check against cache.
            cache_instance: CachedFluxTextEncoderStep instance with cache, or None.
            cache_suffix: Suffix to append to cache keys (e.g., "_t5", "_clip").
            device: Optional device to move cached tensors to.

        Returns:
            tuple: (cached_embeds, prompts_to_encode, prompt_indices)
                - cached_embeds: List of (idx, embedding) tuples for cached prompts
                - prompts_to_encode: List of prompts that need encoding
                - prompt_indices: List of original indices for prompts_to_encode
        """
        cached_embeds = []
        prompts_to_encode = []
        prompt_indices = []

        for idx, prompt in enumerate(prompts):
            cache_key = CachedFluxTextEncoderStep._to_cache_key(prompt + cache_suffix)
            if (
                cache_instance
                and cache_instance.cache
                and cache_key in cache_instance.cache
            ):
                cached_tensor = cache_instance.cache[cache_key]
                # Move tensor to the correct device if specified
                if device is not None and cached_tensor.device != device:
                    cached_tensor = cached_tensor.to(device)
                cached_embeds.append((idx, cached_tensor))
            else:
                prompts_to_encode.append(prompt)
                prompt_indices.append(idx)

        return cached_embeds, prompts_to_encode, prompt_indices

    @staticmethod
    def _cache_prompt_embeds(
        prompts, prompt_indices, prompt_embeds, cache_instance, cache_suffix
    ):
        """Store newly computed embeddings in cache and save to disk.

        Args:
            prompts: Original full list of prompts.
            prompt_indices: Indices of newly encoded prompts in the original list.
            prompt_embeds: Newly computed embeddings tensor.
            cache_instance: CachedFluxTextEncoderStep instance with cache, or None.
            cache_suffix: Suffix to append to cache keys (e.g., "_t5", "_clip").
        """
        if not cache_instance or cache_instance.cache is None:
            return

        for i, idx in enumerate(prompt_indices):
            cache_key = CachedFluxTextEncoderStep._to_cache_key(
                prompts[idx] + cache_suffix
            )
            # Store in memory cache on CPU to save GPU memory
            tensor_slice = prompt_embeds[i : i + 1]
            cache_instance.cache[cache_key] = tensor_slice

            # Save updated cache to disk
            cache_instance.save_cache_to_disk()

    @staticmethod
    def _merge_cached_prompt_embeds(
        cached_embeds, prompt_indices, prompt_embeds, batch_size
    ):
        """Merge cached and newly computed embeddings back into original batch order.

        Args:
            cached_embeds: List of (idx, embedding) tuples from cache.
            prompt_indices: Indices where new embeddings should be placed.
            prompt_embeds: Newly computed embeddings tensor, or None if all cached.
            batch_size: Total batch size for output tensor.

        Returns:
            torch.Tensor: Combined embeddings tensor in correct batch order.
        """
        all_embeds = [None] * batch_size

        # Place cached embeddings
        for idx, embed in cached_embeds:
            all_embeds[idx] = embed

        # Place new embeddings
        if prompt_embeds is not None:
            for i, idx in enumerate(prompt_indices):
                all_embeds[idx] = prompt_embeds[i : i + 1]

        return torch.cat(all_embeds, dim=0)

    @staticmethod
    def _get_t5_prompt_embeds(
        components,
        prompt: Union[str, List[str]] = None,
        num_images_per_prompt: int = 1,
        max_sequence_length: int = 512,
        device: torch.device = None,
        cache_instance=None,
    ):
        """Encode prompts using T5 text encoder with caching support.

        Args:
            components: Pipeline components containing T5 encoder and tokenizer.
            prompt: Prompt(s) to encode.
            num_images_per_prompt: Number of images per prompt for duplication.
            max_sequence_length: Maximum sequence length for tokenization.
            device: Device to place tensors on.
            cache_instance: CachedFluxTextEncoderStep instance for caching, or None.

        Returns:
            torch.Tensor: T5 prompt embeddings ready for diffusion model.
        """
        dtype = components.text_encoder_2.dtype
        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt)

        cached_embeds, prompts_to_encode, prompt_indices = (
            CachedFluxTextEncoderStep._get_cached_prompt_embeds(
                prompt, cache_instance, "_t5", device
            )
        )

        if not prompts_to_encode:
            prompt_embeds = CachedFluxTextEncoderStep._merge_cached_prompt_embeds(
                cached_embeds, prompt_indices, None, batch_size
            )
            _, seq_len, _ = prompt_embeds.shape

            prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
            prompt_embeds = prompt_embeds.view(
                batch_size * num_images_per_prompt, seq_len, -1
            )
            return prompt_embeds

        if isinstance(components, TextualInversionLoaderMixin):
            prompts_to_encode = components.maybe_convert_prompt(
                prompts_to_encode, components.tokenizer_2
            )

        text_inputs = components.tokenizer_2(
            prompts_to_encode,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            return_length=False,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids

        # Check for truncation
        untruncated_ids = components.tokenizer_2(
            prompts_to_encode, padding="longest", return_tensors="pt"
        ).input_ids
        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
            text_input_ids, untruncated_ids
        ):
            removed_text = components.tokenizer_2.batch_decode(
                untruncated_ids[:, max_sequence_length - 1 : -1]
            )
            logger.warning(
                "The following part of your input was truncated because `max_sequence_length` is set to "
                f" {max_sequence_length} tokens: {removed_text}"
            )

        prompt_embeds = components.text_encoder_2(
            text_input_ids.to(device), output_hidden_states=False
        )[0]

        CachedFluxTextEncoderStep._cache_prompt_embeds(
            prompt, prompt_indices, prompt_embeds, cache_instance, "_t5"
        )

        prompt_embeds = CachedFluxTextEncoderStep._merge_cached_prompt_embeds(
            cached_embeds, prompt_indices, prompt_embeds, batch_size
        )
        _, seq_len, _ = prompt_embeds.shape

        # Duplicate for num_images_per_prompt
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(
            batch_size * num_images_per_prompt, seq_len, -1
        )
        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        return prompt_embeds

    @staticmethod
    def _get_clip_prompt_embeds(
        components,
        prompt: Union[str, List[str]] = None,
        num_images_per_prompt: int = 1,
        device: torch.device = None,
        cache_instance=None,
    ):
        """Encode prompts using CLIP text encoder with caching support.

        Args:
            components: Pipeline components containing CLIP encoder and tokenizer.
            prompt: Prompt(s) to encode.
            num_images_per_prompt: Number of images per prompt for duplication.
            device: Device to place tensors on.
            cache_instance: CachedFluxTextEncoderStep instance for caching, or None.

        Returns:
            torch.Tensor: CLIP pooled prompt embeddings ready for diffusion model.
        """
        prompt = [prompt] if isinstance(prompt, str) else prompt
        batch_size = len(prompt)

        # Split cached and new prompts
        cached_embeds, prompts_to_encode, prompt_indices = (
            CachedFluxTextEncoderStep._get_cached_prompt_embeds(
                prompt, cache_instance, "_clip", device
            )
        )

        # Early return if all prompts are cached
        if not prompts_to_encode:
            prompt_embeds = CachedFluxTextEncoderStep._merge_cached_prompt_embeds(
                cached_embeds, prompt_indices, None, batch_size
            )
            prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
            prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
            return prompt_embeds

        if prompts_to_encode:
            if isinstance(components, TextualInversionLoaderMixin):
                prompts_to_encode = components.maybe_convert_prompt(
                    prompts_to_encode, components.tokenizer
                )

            text_inputs = components.tokenizer(
                prompts_to_encode,
                padding="max_length",
                max_length=components.tokenizer.model_max_length,
                truncation=True,
                return_overflowing_tokens=False,
                return_length=False,
                return_tensors="pt",
            )

            text_input_ids = text_inputs.input_ids
            tokenizer_max_length = components.tokenizer.model_max_length
            untruncated_ids = components.tokenizer(
                prompts_to_encode, padding="longest", return_tensors="pt"
            ).input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[
                -1
            ] and not torch.equal(text_input_ids, untruncated_ids):
                removed_text = components.tokenizer.batch_decode(
                    untruncated_ids[:, tokenizer_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {tokenizer_max_length} tokens: {removed_text}"
                )

            prompt_embeds = components.text_encoder(
                text_input_ids.to(device), output_hidden_states=False
            )

            # Use pooled output of CLIPTextModel
            prompt_embeds = prompt_embeds.pooler_output
            prompt_embeds = prompt_embeds.to(
                dtype=components.text_encoder.dtype, device=device
            )

            # Cache the new embeddings
            CachedFluxTextEncoderStep._cache_prompt_embeds(
                prompt, prompt_indices, prompt_embeds, cache_instance, "_clip"
            )

        # Combine cached and newly encoded embeddings in correct order
        prompt_embeds = CachedFluxTextEncoderStep._merge_cached_prompt_embeds(
            cached_embeds,
            prompt_indices,
            prompt_embeds if prompts_to_encode else None,
            batch_size,
        )

        # Duplicate for num_images_per_prompt
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds

    @staticmethod
    def encode_prompt(
        components,
        prompt: Union[str, List[str]] = None,
        prompt_2: Union[str, List[str]] = None,
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_sequence_length: int = 512,
        lora_scale: Optional[float] = None,
        cache_instance: Optional["CachedFluxTextEncoderStep"] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in all text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        """
        device = device or components._execution_device

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(components, FluxLoraLoaderMixin):
            components._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if components.text_encoder is not None and USE_PEFT_BACKEND:
                scale_lora_layers(components.text_encoder, lora_scale)
            if components.text_encoder_2 is not None and USE_PEFT_BACKEND:
                scale_lora_layers(components.text_encoder_2, lora_scale)

        prompt = [prompt] if isinstance(prompt, str) else prompt

        if prompt_embeds is None:
            prompt_2 = prompt_2 or prompt
            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

            # We only use the pooled prompt output from the CLIPTextModel
            pooled_prompt_embeds = CachedFluxTextEncoderStep._get_clip_prompt_embeds(
                components,
                prompt=prompt,
                device=device,
                num_images_per_prompt=num_images_per_prompt,
                cache_instance=cache_instance,
            )
            prompt_embeds = CachedFluxTextEncoderStep._get_t5_prompt_embeds(
                components,
                prompt=prompt_2,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
                cache_instance=cache_instance,
            )

        if components.text_encoder is not None:
            if isinstance(components, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(components.text_encoder, lora_scale)

        if components.text_encoder_2 is not None:
            if isinstance(components, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(components.text_encoder_2, lora_scale)

        dtype = (
            components.text_encoder.dtype
            if components.text_encoder is not None
            else torch.bfloat16
        )
        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)

        return prompt_embeds, pooled_prompt_embeds, text_ids

    @torch.no_grad()
    def __call__(
        self, components: FluxModularPipeline, state: PipelineState
    ) -> PipelineState:
        # Get inputs and intermediates
        block_state = self.get_block_state(state)
        self.check_inputs(block_state)

        block_state.device = components._execution_device

        # Encode input prompt
        block_state.text_encoder_lora_scale = (
            block_state.joint_attention_kwargs.get("scale", None)
            if block_state.joint_attention_kwargs is not None
            else None
        )
        (
            block_state.prompt_embeds,
            block_state.pooled_prompt_embeds,
            block_state.text_ids,
        ) = self.encode_prompt(
            components,
            prompt=block_state.prompt,
            prompt_2=None,
            prompt_embeds=None,
            pooled_prompt_embeds=None,
            device=block_state.device,
            num_images_per_prompt=1,  # TODO: hardcoded for now.
            max_sequence_length=512,
            lora_scale=block_state.text_encoder_lora_scale,
            cache_instance=self
            if self.cache is not None
            else None,  # Pass self as cache_instance
        )

        # Add outputs
        self.set_block_state(state, block_state)
        return components, state