Upload FOFPred pipeline

by kahnchana - opened Dec 17, 2025

base: refs/heads/main

←

from: refs/pr/6

Discussion Files changed

+4424

-33

Files changed (12) hide show

.gitattributes +1 -0
README.md +13 -5
__pycache__/pipeline_fofpred.cpython-311.pyc +0 -0
__pycache__/scheduler_fofpred.cpython-311.pyc +0 -0
__pycache__/transformer_fofpred.cpython-311.pyc +3 -0
model_index.json +2 -3
pipeline_fofpred.py +973 -9
scheduler/scheduler_config.json +1 -14
scheduler_fofpred.py +218 -0
transformer/config.json +1 -1
transformer_fofpred.py +0 -0
vae/config.json +1 -1

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 processor/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 processor/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+__pycache__/transformer_fofpred.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -15,18 +15,20 @@ tags:
 ## Usage
 ```python
 import torch
-from fofpred.pipelines.fofpred.pipeline_fofpred import FOFPredPipeline
-from fofpred.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
 from PIL import Image
-pipeline = FOFPredPipeline.from_pretrained(
     "Salesforce/FOFPred",
     torch_dtype=torch.bfloat16,
 ).to("cuda")
-pipeline.scheduler = FlowMatchEulerDiscreteScheduler()
 results = pipeline(
     prompt="Moving the water bottle from right to left.",
     input_images=[Image.open("your_image.jpg")],
@@ -40,6 +42,12 @@ results = pipeline(
 )
 flow_frames = results.images  # [B, F, C, H, W]
 ```
 ## Architecture

 ## Usage
 ```python
+import einops
+import numpy as np
 import torch
+from diffusers import DiffusionPipeline
 from PIL import Image
+# Load pipeline with trust_remote_code
+pipeline = DiffusionPipeline.from_pretrained(
     "Salesforce/FOFPred",
     torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
 ).to("cuda")
+# Run inference
 results = pipeline(
     prompt="Moving the water bottle from right to left.",
     input_images=[Image.open("your_image.jpg")],
 )
 flow_frames = results.images  # [B, F, C, H, W]
+output_tensor = flow_frames[0]  # [F, C, H, W]
+output_np = pipeline.image_processor.pt_to_numpy(output_tensor)  # [F, H, W, C]
+reshaped = einops.rearrange(output_np, "f h w c -> h (f w) c")
+img = Image.fromarray((reshaped * 255).astype(np.uint8))
+img.save("output_combined.png")
 ```
 ## Architecture

__pycache__/pipeline_fofpred.cpython-311.pyc ADDED Viewed

Binary file (88.8 kB). View file

__pycache__/scheduler_fofpred.cpython-311.pyc ADDED Viewed

Binary file (10.3 kB). View file

__pycache__/transformer_fofpred.cpython-311.pyc ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4194813ba36a92b72a9fc5e90a0257d61096743c4e5bb6800f3c6683b3774510
+size 124604

model_index.json CHANGED Viewed

@@ -4,7 +4,6 @@
     "FOFPredPipeline"
   ],
   "_diffusers_version": "0.34.0",
-  "_name_or_path": "/export/home/public_repo/FOFPred/pretrained_models/hf_upload",
   "mllm": [
     "transformers",
     "Qwen2_5_VLForConditionalGeneration"
@@ -14,11 +13,11 @@
     "Qwen2_5_VLProcessor"
   ],
   "scheduler": [
-    "diffusers",
     "FlowMatchEulerDiscreteScheduler"
   ],
   "transformer": [
-    "transformer_omnigen2",
     "OmniGen2Transformer3DModel"
   ],
   "vae": [

     "FOFPredPipeline"
   ],
   "_diffusers_version": "0.34.0",
   "mllm": [
     "transformers",
     "Qwen2_5_VLForConditionalGeneration"
     "Qwen2_5_VLProcessor"
   ],
   "scheduler": [
+    "scheduler_fofpred",
     "FlowMatchEulerDiscreteScheduler"
   ],
   "transformer": [
+    "transformer_fofpred",
     "OmniGen2Transformer3DModel"
   ],
   "vae": [

pipeline_fofpred.py CHANGED Viewed

@@ -17,39 +17,1003 @@ limitations under the License.
 """
 import inspect
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import PIL.Image
 import torch
 import torch.nn.functional as F
 from diffusers.models.autoencoders import AutoencoderKL
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
     BaseOutput,
     is_torch_xla_available,
     logging,
 )
 from diffusers.utils.torch_utils import randn_tensor
 from transformers import Qwen2_5_VLForConditionalGeneration
-from fofpred.pipelines.image_processor import OmniGen2ImageProcessor
-from fofpred.utils.teacache_util import TeaCacheParams
-from ...models.transformers import OmniGen2Transformer3DModel
-from ...models.transformers.repo import OmniGen2RotaryPosEmbed
-from ..lora_pipeline import OmniGen2LoraLoaderMixin
 if is_torch_xla_available():
     XLA_AVAILABLE = True
 else:
     XLA_AVAILABLE = False
-from ...cache_functions import cache_init
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 @dataclass

 """
 import inspect
+import os
+import warnings
 from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import numpy as np
 import PIL.Image
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
+from diffusers.configuration_utils import register_to_config
+from diffusers.image_processor import (
+    PipelineImageInput,
+    VaeImageProcessor,
+    is_valid_image_imagelist,
+)
+from diffusers.loaders.lora_base import (  # noqa
+    LoraBaseMixin,
+    _fetch_state_dict,
+)
+from diffusers.loaders.lora_conversion_utils import (
+    _convert_non_diffusers_lumina2_lora_to_diffusers,
+)
 from diffusers.models.autoencoders import AutoencoderKL
+from diffusers.models.embeddings import get_1d_rotary_pos_embed
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.utils import (
+    USE_PEFT_BACKEND,
     BaseOutput,
+    is_peft_available,
+    is_peft_version,
+    is_torch_version,
     is_torch_xla_available,
+    is_transformers_available,
+    is_transformers_version,
     logging,
 )
 from diffusers.utils.torch_utils import randn_tensor
+from einops import repeat
+from huggingface_hub.utils import validate_hf_hub_args
 from transformers import Qwen2_5_VLForConditionalGeneration
+from .scheduler_fofpred import FlowMatchEulerDiscreteScheduler
+from .transformer_fofpred import OmniGen2Transformer3DModel
+# Module-level logger named after this module.
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Low-CPU-memory LoRA loading defaults to on only when torch, peft and
+# transformers all satisfy the version bounds below. The operands are evaluated
+# left to right with short-circuiting, so each version query runs only after
+# the matching availability check has passed.
+_LOW_CPU_MEM_USAGE_DEFAULT_LORA = bool(
+    is_torch_version(">=", "1.9.0")
+    and is_peft_available()
+    and is_peft_version(">=", "0.13.1")
+    and is_transformers_available()
+    and is_transformers_version(">", "4.45.2")
+)
 if is_torch_xla_available():
     XLA_AVAILABLE = True
 else:
     XLA_AVAILABLE = False
+# Component name exposed by the LoRA loader mixin as `transformer_name`.
+TRANSFORMER_NAME = "transformer"
+class OmniGen2ImageProcessor(VaeImageProcessor):
+    """
+    Image processor for the FOFPred pipeline. Extends [`VaeImageProcessor`] with optional limits on the total pixel
+    count and on the longest side of the image; images are only ever scaled down, never up.
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
+            `height` and `width` arguments from the `preprocess` method.
+        vae_scale_factor (`int`, *optional*, defaults to `16`):
+            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
+        resample (`str`, *optional*, defaults to `lanczos`):
+            Resampling filter to use when resizing the image.
+        max_pixels (`int`, *optional*, defaults to `None`):
+            Upper bound on `height * width`. Larger images are scaled down. `None` disables this limit.
+        max_side_length (`int`, *optional*, defaults to `None`):
+            Upper bound on the longer image side. Larger images are scaled down. `None` disables this limit.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image to [-1,1].
+        do_binarize (`bool`, *optional*, defaults to `False`):
+            Whether to binarize the image to 0/1.
+        do_convert_grayscale (`bool`, *optional*, defaults to be `False`):
+            Whether to convert the images to grayscale format.
+    """
+    @register_to_config
+    def __init__(
+        self,
+        do_resize: bool = True,
+        vae_scale_factor: int = 16,
+        resample: str = "lanczos",
+        max_pixels: Optional[int] = None,
+        max_side_length: Optional[int] = None,
+        do_normalize: bool = True,
+        do_binarize: bool = False,
+        do_convert_grayscale: bool = False,
+    ):
+        super().__init__(
+            do_resize=do_resize,
+            vae_scale_factor=vae_scale_factor,
+            resample=resample,
+            do_normalize=do_normalize,
+            do_binarize=do_binarize,
+            do_convert_grayscale=do_convert_grayscale,
+        )
+        # Instance-level defaults used by `get_new_height_width` when the caller passes no per-call limit.
+        self.max_pixels = max_pixels
+        self.max_side_length = max_side_length
+    def get_new_height_width(
+        self,
+        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        max_side_length: Optional[int] = None,
+    ) -> Tuple[int, int]:
+        r"""
+        Returns the target height and width: the requested (or input) size, shrunk to respect `max_pixels` and
+        `max_side_length`, then rounded down to an integer multiple of `vae_scale_factor`.
+        Args:
+            image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
+                The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
+                should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
+                tensor, it should have shape `[batch, channels, height, width]`.
+            height (`Optional[int]`, *optional*, defaults to `None`):
+                The height of the preprocessed image. If `None`, the height of the `image` input will be used.
+            width (`Optional[int]`, *optional*, defaults to `None`):
+                The width of the preprocessed image. If `None`, the width of the `image` input will be used.
+            max_pixels (`Optional[int]`, *optional*, defaults to `None`):
+                Upper bound on `height * width`. If `None`, falls back to `self.max_pixels`; if that is also `None`,
+                no pixel-count limit is applied.
+            max_side_length (`Optional[int]`, *optional*, defaults to `None`):
+                Upper bound on the longer side. If `None`, falls back to `self.max_side_length`; if that is also
+                `None`, no side-length limit is applied.
+        Returns:
+            `Tuple[int, int]`:
+                A tuple containing the height and width, both rounded down to an integer multiple of
+                `vae_scale_factor`.
+        """
+        if height is None:
+            if isinstance(image, PIL.Image.Image):
+                height = image.height
+            elif isinstance(image, torch.Tensor):
+                height = image.shape[2]
+            else:
+                height = image.shape[1]
+        if width is None:
+            if isinstance(image, PIL.Image.Image):
+                width = image.width
+            elif isinstance(image, torch.Tensor):
+                width = image.shape[3]
+            else:
+                width = image.shape[2]
+        if max_side_length is None:
+            max_side_length = self.max_side_length
+        if max_pixels is None:
+            max_pixels = self.max_pixels
+        # Start at 1.0 so the input is never upscaled; each configured limit can only shrink the ratio.
+        # A limit that is still `None` here is skipped. Previously an unset `max_pixels` raised a TypeError and
+        # an unset `max_side_length` raised an UnboundLocalError, although `None` is the default for both.
+        ratio = 1.0
+        if max_side_length is not None:
+            ratio = min(ratio, max_side_length / max(height, width))
+        if max_pixels is not None:
+            ratio = min(ratio, (max_pixels / (height * width)) ** 0.5)
+        scale_factor = self.config.vae_scale_factor
+        new_height = int(height * ratio) // scale_factor * scale_factor
+        new_width = int(width * ratio) // scale_factor * scale_factor
+        return new_height, new_width
+    def preprocess(
+        self,
+        image: PipelineImageInput,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        max_side_length: Optional[int] = None,
+        resize_mode: str = "default",  # "default", "fill", "crop"
+        crops_coords: Optional[Tuple[int, int, int, int]] = None,
+    ) -> torch.Tensor:
+        """
+        Preprocess the image input.
+        Args:
+            image (`PipelineImageInput`):
+                The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
+                supported formats.
+            height (`int`, *optional*):
+                The height in preprocessed image. If `None`, the height of the input image is used as the starting
+                point (see `get_new_height_width`).
+            width (`int`, *optional*):
+                The width in preprocessed image. If `None`, the width of the input image is used as the starting
+                point (see `get_new_height_width`).
+            max_pixels (`int`, *optional*):
+                Per-call override of the pixel-count limit, forwarded to `get_new_height_width`.
+            max_side_length (`int`, *optional*):
+                Per-call override of the longest-side limit, forwarded to `get_new_height_width`.
+            resize_mode (`str`, *optional*, defaults to `default`):
+                The resize mode, can be one of `default`, `fill` or `crop`. If `default`, will resize the image to fit
+                within the specified width and height, and it may not maintain the original aspect ratio. If `fill`,
+                will resize the image to fit within the specified width and height, maintaining the aspect ratio, and
+                then center the image within the dimensions, filling empty with data from image. If `crop`, will
+                resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
+                center the image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop`
+                are only supported for PIL image input.
+            crops_coords (`Tuple[int, int, int, int]`, *optional*, defaults to `None`):
+                A single crop box applied to every PIL image in the input. If `None`, will not crop the image.
+        Returns:
+            `torch.Tensor`:
+                The preprocessed image.
+        """
+        supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
+        # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
+        if (
+            self.config.do_convert_grayscale
+            and isinstance(image, (torch.Tensor, np.ndarray))
+            and image.ndim == 3
+        ):
+            if isinstance(image, torch.Tensor):
+                # if image is a pytorch tensor could have 2 possible shapes:
+                #    1. batch x height x width: we should insert the channel dimension at position 1
+                #    2. channel x height x width: we should insert batch dimension at position 0,
+                #       however, since both channel and batch dimension has same size 1, it is same to insert at position 1
+                #    for simplicity, we insert a dimension of size 1 at position 1 for both cases
+                image = image.unsqueeze(1)
+            else:
+                # if it is a numpy array, it could have 2 possible shapes:
+                #   1. batch x height x width: insert channel dimension on last position
+                #   2. height x width x channel: insert batch dimension on first position
+                if image.shape[-1] == 1:
+                    image = np.expand_dims(image, axis=0)
+                else:
+                    image = np.expand_dims(image, axis=-1)
+        if (
+            isinstance(image, list)
+            and isinstance(image[0], np.ndarray)
+            and image[0].ndim == 4
+        ):
+            warnings.warn(
+                "Passing `image` as a list of 4d np.ndarray is deprecated."
+                "Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarray",
+                FutureWarning,
+            )
+            image = np.concatenate(image, axis=0)
+        if (
+            isinstance(image, list)
+            and isinstance(image[0], torch.Tensor)
+            and image[0].ndim == 4
+        ):
+            warnings.warn(
+                "Passing `image` as a list of 4d torch.Tensor is deprecated."
+                "Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensor",
+                FutureWarning,
+            )
+            image = torch.cat(image, axis=0)
+        if not is_valid_image_imagelist(image):
+            raise ValueError(
+                f"Input is in incorrect format. Currently, we only support {', '.join(str(x) for x in supported_formats)}"
+            )
+        if not isinstance(image, list):
+            image = [image]
+        if isinstance(image[0], PIL.Image.Image):
+            if crops_coords is not None:
+                image = [i.crop(crops_coords) for i in image]
+            if self.config.do_resize:
+                # The target size is derived from the first image and applied to the whole list.
+                height, width = self.get_new_height_width(
+                    image[0], height, width, max_pixels, max_side_length
+                )
+                image = [
+                    self.resize(i, height, width, resize_mode=resize_mode)
+                    for i in image
+                ]
+            if self.config.do_convert_rgb:
+                image = [self.convert_to_rgb(i) for i in image]
+            elif self.config.do_convert_grayscale:
+                image = [self.convert_to_grayscale(i) for i in image]
+            image = self.pil_to_numpy(image)  # to np
+            image = self.numpy_to_pt(image)  # to pt
+        elif isinstance(image[0], np.ndarray):
+            image = (
+                np.concatenate(image, axis=0)
+                if image[0].ndim == 4
+                else np.stack(image, axis=0)
+            )
+            image = self.numpy_to_pt(image)
+            height, width = self.get_new_height_width(
+                image, height, width, max_pixels, max_side_length
+            )
+            if self.config.do_resize:
+                image = self.resize(image, height, width)
+        elif isinstance(image[0], torch.Tensor):
+            image = (
+                torch.cat(image, axis=0)
+                if image[0].ndim == 4
+                else torch.stack(image, axis=0)
+            )
+            if self.config.do_convert_grayscale and image.ndim == 3:
+                image = image.unsqueeze(1)
+            channel = image.shape[1]
+            # don't need any preprocess if the image is latents
+            if channel == self.config.vae_latent_channels:
+                return image
+            height, width = self.get_new_height_width(
+                image, height, width, max_pixels, max_side_length
+            )
+            if self.config.do_resize:
+                image = self.resize(image, height, width)
+        # expected range [0,1], normalize to [-1,1]
+        do_normalize = self.config.do_normalize
+        if do_normalize and image.min() < 0:
+            warnings.warn(
+                "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
+                f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
+                FutureWarning,
+            )
+            do_normalize = False
+        if do_normalize:
+            image = self.normalize(image)
+        if self.config.do_binarize:
+            image = self.binarize(image)
+        return image
+@dataclass
+class TeaCacheParams:
+    """
+    TeaCache parameters for `OmniGen2Transformer3DModel`
+    See https://github.com/ali-vilab/TeaCache/ for a more comprehensive understanding
+    All fields have defaults, so `TeaCacheParams()` yields an empty cache state.
+    Args:
+        previous_residual (Optional[torch.Tensor]):
+            The tensor difference between the output and the input of the transformer layers from the previous timestep.
+            Defaults to `None`.
+        previous_modulated_inp (Optional[torch.Tensor]):
+            The modulated input from the previous timestep used to indicate the change of the transformer layer's output.
+            Defaults to `None`.
+        accumulated_rel_l1_distance (float):
+            The accumulated relative L1 distance. Defaults to `0`.
+        is_first_or_last_step (bool):
+            Whether the current timestep is the first or last step. Defaults to `False`.
+    """
+    previous_residual: Optional[torch.Tensor] = None
+    previous_modulated_inp: Optional[torch.Tensor] = None
+    # NOTE(review): the default is the int `0` although the field is annotated `float`.
+    accumulated_rel_l1_distance: float = 0
+    is_first_or_last_step: bool = False
+class OmniGen2RotaryPosEmbed(nn.Module):
+    """
+    Builds rotary position embeddings over three position axes for a sequence laid out as
+    caption tokens, then reference-image tokens, then output-image tokens.
+    Args:
+        theta (`int`):
+            Base passed to `get_1d_rotary_pos_embed`.
+        axes_dim (`Tuple[int, int, int]`):
+            Embedding dimension used for each of the three position axes.
+        axes_lens (`Tuple[int, int, int]`, *optional*, defaults to `(300, 512, 512)`):
+            Number of positions for each axis. Stored on the module; the frequency tables
+            themselves are built by the static `get_freqs_cis`.
+        patch_size (`int`, *optional*, defaults to `2`):
+            Image sizes are integer-divided by this value to obtain token-grid sizes.
+    """
+    def __init__(
+        self,
+        theta: int,
+        axes_dim: Tuple[int, int, int],
+        axes_lens: Tuple[int, int, int] = (300, 512, 512),
+        patch_size: int = 2,
+    ):
+        super().__init__()
+        self.theta = theta
+        self.axes_dim = axes_dim
+        self.axes_lens = axes_lens
+        self.patch_size = patch_size
+    @staticmethod
+    def get_freqs_cis(
+        axes_dim: Tuple[int, int, int], axes_lens: Tuple[int, int, int], theta: int
+    ) -> List[torch.Tensor]:
+        """
+        Build one rotary frequency table per axis.
+        Calls `get_1d_rotary_pos_embed(dim, length, theta=theta, freqs_dtype=...)` for every
+        `(dim, length)` pair from `axes_dim` / `axes_lens` and returns the results as a list in
+        axis order.
+        """
+        freqs_cis = []
+        # float32 is requested whenever an MPS backend is available, float64 otherwise.
+        freqs_dtype = (
+            torch.float32 if torch.backends.mps.is_available() else torch.float64
+        )
+        for i, (d, e) in enumerate(zip(axes_dim, axes_lens)):
+            emb = get_1d_rotary_pos_embed(d, e, theta=theta, freqs_dtype=freqs_dtype)
+            freqs_cis.append(emb)
+        return freqs_cis
+    def _get_freqs_cis(self, freqs_cis, ids: torch.Tensor) -> torch.Tensor:
+        """
+        Look up the per-axis frequency tables at the positions in `ids` and concatenate them.
+        Args:
+            freqs_cis: Sequence with one frequency table per axis, indexed by position along dim 0.
+            ids (`torch.Tensor`): Position ids indexed as `ids[batch, token, axis]`.
+        Returns:
+            `torch.Tensor`: Per-token frequencies, concatenated over the axes along the last
+            dimension, on the original device of `ids`.
+        """
+        device = ids.device
+        # On MPS the gather is done on CPU; the result is moved back to `device` at the end.
+        if ids.device.type == "mps":
+            ids = ids.to("cpu")
+        result = []
+        for i in range(len(self.axes_dim)):
+            freqs = freqs_cis[i].to(ids.device)
+            # Broadcast the position index of axis `i` across the table's last dimension for `torch.gather`.
+            index = ids[:, :, i : i + 1].repeat(1, 1, freqs.shape[-1]).to(torch.int64)
+            result.append(
+                torch.gather(
+                    freqs.unsqueeze(0).repeat(index.shape[0], 1, 1), dim=1, index=index
+                )
+            )
+        return torch.cat(result, dim=-1).to(device)
+    def forward(
+        self,
+        freqs_cis,
+        attention_mask,
+        l_effective_ref_img_len,
+        l_effective_img_len,
+        ref_img_sizes,
+        img_sizes,
+        device,
+    ):
+        """
+        Compute rotary embeddings for each sample's caption, reference-image and image tokens.
+        Args:
+            freqs_cis: Per-axis frequency tables, as accepted by `_get_freqs_cis`.
+            attention_mask: Caption mask indexed as `[batch, encoder_seq_len]`; its row sums give the
+                per-sample caption lengths.
+            l_effective_ref_img_len: Per sample, a list with the token count of each reference image.
+            l_effective_img_len: Per sample, either a single token count or a list of token counts
+                (one per image). The layout is detected from the first sample.
+            ref_img_sizes: Per sample, `None` or a sequence of `(H, W)` pairs for the reference images.
+            img_sizes: Per sample, a single `(H, W)` pair or a sequence of `(H, W)` pairs, matching
+                the layout of `l_effective_img_len`.
+            device: Device on which the position ids and output tensors are created.
+        Returns:
+            A tuple `(cap_freqs_cis, ref_img_freqs_cis, img_freqs_cis, freqs_cis, l_effective_cap_len,
+            seq_lengths)`: zero-padded per-segment embeddings, the embeddings of the full concatenated
+            sequence, the caption lengths and the total sequence lengths.
+        """
+        batch_size = len(attention_mask)
+        p = self.patch_size
+        encoder_seq_len = attention_mask.shape[1]
+        l_effective_cap_len = attention_mask.sum(dim=1).tolist()
+        if isinstance(l_effective_img_len[0], list):  # Check for t-dim case
+            seq_lengths = [
+                cap_len + sum(ref_img_len) + sum(img_len)
+                for cap_len, ref_img_len, img_len in zip(
+                    l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len
+                )
+            ]
+        else:  # Original case
+            seq_lengths = [
+                cap_len + sum(ref_img_len) + img_len
+                for cap_len, ref_img_len, img_len in zip(
+                    l_effective_cap_len, l_effective_ref_img_len, l_effective_img_len
+                )
+            ]
+        max_seq_len = max(seq_lengths)
+        max_ref_img_len = max(
+            [sum(ref_img_len) for ref_img_len in l_effective_ref_img_len]
+        )
+        if isinstance(l_effective_img_len[0], list):
+            max_img_len = max([sum(ln) for ln in l_effective_img_len])
+        else:
+            max_img_len = max(l_effective_img_len)
+        # Create position IDs
+        position_ids = torch.zeros(
+            batch_size, max_seq_len, 3, dtype=torch.int32, device=device
+        )
+        for i, (cap_seq_len, seq_len) in enumerate(
+            zip(l_effective_cap_len, seq_lengths)
+        ):
+            # add text position ids
+            position_ids[i, :cap_seq_len] = repeat(
+                torch.arange(cap_seq_len, dtype=torch.int32, device=device), "l -> l 3"
+            )
+            # `pe_shift` is the value written to axis 0 for the next image;
+            # `pe_shift_len` is the index in the sequence where that image's tokens start.
+            pe_shift = cap_seq_len
+            pe_shift_len = cap_seq_len
+            if ref_img_sizes[i] is not None:
+                for ref_img_size, ref_img_len in zip(
+                    ref_img_sizes[i], l_effective_ref_img_len[i]
+                ):
+                    H, W = ref_img_size
+                    ref_H_tokens, ref_W_tokens = H // p, W // p
+                    assert ref_H_tokens * ref_W_tokens == ref_img_len
+                    # add image position ids
+                    row_ids = repeat(
+                        torch.arange(ref_H_tokens, dtype=torch.int32, device=device),
+                        "h -> h w",
+                        w=ref_W_tokens,
+                    ).flatten()
+                    col_ids = repeat(
+                        torch.arange(ref_W_tokens, dtype=torch.int32, device=device),
+                        "w -> h w",
+                        h=ref_H_tokens,
+                    ).flatten()
+                    position_ids[i, pe_shift_len : pe_shift_len + ref_img_len, 0] = (
+                        pe_shift
+                    )
+                    position_ids[i, pe_shift_len : pe_shift_len + ref_img_len, 1] = (
+                        row_ids
+                    )
+                    position_ids[i, pe_shift_len : pe_shift_len + ref_img_len, 2] = (
+                        col_ids
+                    )
+                    # Advance the axis-0 value by the larger side of this image's token grid.
+                    pe_shift += max(ref_H_tokens, ref_W_tokens)
+                    pe_shift_len += ref_img_len
+            if isinstance(l_effective_img_len[i], list):  # New case
+                for img_size, img_len in zip(img_sizes[i], l_effective_img_len[i]):
+                    H, W = img_size
+                    H_tokens, W_tokens = H // p, W // p
+                    assert H_tokens * W_tokens == img_len
+                    row_ids = repeat(
+                        torch.arange(H_tokens, dtype=torch.int32, device=device),
+                        "h -> h w",
+                        w=W_tokens,
+                    ).flatten()
+                    col_ids = repeat(
+                        torch.arange(W_tokens, dtype=torch.int32, device=device),
+                        "w -> h w",
+                        h=H_tokens,
+                    ).flatten()
+                    end_idx = pe_shift_len + img_len
+                    position_ids[i, pe_shift_len:end_idx, 0] = pe_shift
+                    position_ids[i, pe_shift_len:end_idx, 1] = row_ids
+                    position_ids[i, pe_shift_len:end_idx, 2] = col_ids
+                    pe_shift += max(H_tokens, W_tokens)
+                    pe_shift_len = end_idx
+            else:  # Original case
+                H, W = img_sizes[i]
+                H_tokens, W_tokens = H // p, W // p
+                assert H_tokens * W_tokens == l_effective_img_len[i]
+                row_ids = repeat(
+                    torch.arange(H_tokens, dtype=torch.int32, device=device),
+                    "h -> h w",
+                    w=W_tokens,
+                ).flatten()
+                col_ids = repeat(
+                    torch.arange(W_tokens, dtype=torch.int32, device=device),
+                    "w -> h w",
+                    h=H_tokens,
+                ).flatten()
+                # The single image must end exactly at the end of this sample's sequence.
+                assert pe_shift_len + l_effective_img_len[i] == seq_len
+                position_ids[i, pe_shift_len:seq_len, 0] = pe_shift
+                position_ids[i, pe_shift_len:seq_len, 1] = row_ids
+                position_ids[i, pe_shift_len:seq_len, 2] = col_ids
+        # Get combined rotary embeddings
+        freqs_cis = self._get_freqs_cis(freqs_cis, position_ids)
+        # create separate rotary embeddings for captions and images
+        cap_freqs_cis = torch.zeros(
+            batch_size,
+            encoder_seq_len,
+            freqs_cis.shape[-1],
+            device=device,
+            dtype=freqs_cis.dtype,
+        )
+        ref_img_freqs_cis = torch.zeros(
+            batch_size,
+            max_ref_img_len,
+            freqs_cis.shape[-1],
+            device=device,
+            dtype=freqs_cis.dtype,
+        )
+        img_freqs_cis = torch.zeros(
+            batch_size,
+            max_img_len,
+            freqs_cis.shape[-1],
+            device=device,
+            dtype=freqs_cis.dtype,
+        )
+        # Slice each sample's combined embeddings back into its three segments; positions
+        # beyond a sample's own length stay zero.
+        for i, (cap_seq_len, ref_img_len, img_len, seq_len) in enumerate(
+            zip(
+                l_effective_cap_len,
+                l_effective_ref_img_len,
+                l_effective_img_len,
+                seq_lengths,
+            )
+        ):
+            cap_freqs_cis[i, :cap_seq_len] = freqs_cis[i, :cap_seq_len]
+            ref_img_freqs_cis[i, : sum(ref_img_len)] = freqs_cis[
+                i, cap_seq_len : cap_seq_len + sum(ref_img_len)
+            ]
+            if isinstance(img_len, list):
+                img_len = sum(img_len)
+            img_freqs_cis[i, :img_len] = freqs_cis[
+                i,
+                cap_seq_len + sum(ref_img_len) : cap_seq_len
+                + sum(ref_img_len)
+                + img_len,
+            ]
+        return (
+            cap_freqs_cis,
+            ref_img_freqs_cis,
+            img_freqs_cis,
+            freqs_cis,
+            l_effective_cap_len,
+            seq_lengths,
+        )
+class OmniGen2LoraLoaderMixin(LoraBaseMixin):
+    r"""
+    Load LoRA layers into [`OmniGen2Transformer3DModel`]. Specific to [`FOFPredPipeline`].
+    """
+    _lora_loadable_modules = ["transformer"]
+    transformer_name = TRANSFORMER_NAME
+    @classmethod
+    @validate_hf_hub_args
+    def lora_state_dict(
+        cls,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        **kwargs,
+    ):
+        r"""
+        Return state dict for lora weights and the network alphas.
+        <Tip warning={true}>
+        We support loading A1111 formatted LoRA checkpoints in a limited capacity.
+        This function is experimental and might change in the future.
+        </Tip>
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                Can be either:
+                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
+                      the Hub.
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
+                      with [`ModelMixin.save_pretrained`].
+                    - A [torch state
+                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
+                is not used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether to only load local model weights and configuration files or not. If set to `True`, the model
+                won't be downloaded from the Hub.
+            token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
+                `diffusers-cli login` (stored in `~/.huggingface`) is used.
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
+                allowed by Git.
+            subfolder (`str`, *optional*, defaults to `""`):
+                The subfolder location of a model file within a larger model repository on the Hub or locally.
+        """
+        # Load the main state dict first which has the LoRA layers for either of
+        # transformer and text encoder or both.
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", None)
+        token = kwargs.pop("token", None)
+        revision = kwargs.pop("revision", None)
+        subfolder = kwargs.pop("subfolder", None)
+        weight_name = kwargs.pop("weight_name", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = True
+            allow_pickle = True
+        user_agent = {
+            "file_type": "attn_procs_weights",
+            "framework": "pytorch",
+        }
+        state_dict = _fetch_state_dict(
+            pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+            weight_name=weight_name,
+            use_safetensors=use_safetensors,
+            local_files_only=local_files_only,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            token=token,
+            revision=revision,
+            subfolder=subfolder,
+            user_agent=user_agent,
+            allow_pickle=allow_pickle,
+        )
+        is_dora_scale_present = any("dora_scale" in k for k in state_dict)
+        if is_dora_scale_present:
+            warn_msg = "It seems like you are using a DoRA checkpoint that is not compatible in Diffusers at the moment. So, we are going to filter out the keys associated to 'dora_scale` from the state dict. If you think this is a mistake please open an issue https://github.com/huggingface/diffusers/issues/new."
+            logger.warning(warn_msg)
+            state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}
+        # conversion.
+        non_diffusers = any(k.startswith("diffusion_model.") for k in state_dict)
+        if non_diffusers:
+            state_dict = _convert_non_diffusers_lumina2_lora_to_diffusers(state_dict)
+        return state_dict
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.load_lora_weights
+    def load_lora_weights(
+        self,
+        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        adapter_name=None,
+        **kwargs,
+    ):
+        """
+        Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and
+        `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See
+        [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded.
+        See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state
+        dict is loaded into `self.transformer`.
+        Parameters:
+            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+            low_cpu_mem_usage (`bool`, *optional*):
+                Speed up model loading by only loading the pretrained LoRA weights and not initializing the random
+                weights.
+            kwargs (`dict`, *optional*):
+                See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`].
+        """
+        if not USE_PEFT_BACKEND:
+            raise ValueError("PEFT backend is required for this method.")
+        low_cpu_mem_usage = kwargs.pop(
+            "low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT_LORA
+        )
+        if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
+            raise ValueError(
+                "`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
+            )
+        # if a dict is passed, copy it instead of modifying it inplace
+        if isinstance(pretrained_model_name_or_path_or_dict, dict):
+            pretrained_model_name_or_path_or_dict = (
+                pretrained_model_name_or_path_or_dict.copy()
+            )
+        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
+        state_dict = self.lora_state_dict(
+            pretrained_model_name_or_path_or_dict, **kwargs
+        )
+        is_correct_format = all("lora" in key for key in state_dict.keys())
+        if not is_correct_format:
+            raise ValueError("Invalid LoRA checkpoint.")
+        self.load_lora_into_transformer(
+            state_dict,
+            transformer=getattr(self, self.transformer_name)
+            if not hasattr(self, "transformer")
+            else self.transformer,
+            adapter_name=adapter_name,
+            _pipeline=self,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+        )
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.SD3LoraLoaderMixin.load_lora_into_transformer with SD3Transformer2DModel->Lumina2Transformer2DModel
+    def load_lora_into_transformer(
+        cls,
+        state_dict,
+        transformer,
+        adapter_name=None,
+        _pipeline=None,
+        low_cpu_mem_usage=False,
+        hotswap: bool = False,
+    ):
+        """
+        This will load the LoRA layers specified in `state_dict` into `transformer` by delegating to
+        `transformer.load_lora_adapter`.
+        Parameters:
+            state_dict (`dict`):
+                A state dict containing the LoRA layer parameters. It is passed to `transformer.load_lora_adapter`
+                unchanged; this method performs no key filtering or prefix handling itself.
+            transformer (`Lumina2Transformer2DModel`):
+                The Transformer model to load the LoRA layers into.
+            adapter_name (`str`, *optional*):
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
+            _pipeline (*optional*):
+                Forwarded as-is to `transformer.load_lora_adapter`.
+            low_cpu_mem_usage (`bool`, *optional*):
+                Speed up model loading by only loading the pretrained LoRA weights and not initializing the random
+                weights. Requires `peft>=0.13.0`.
+            hotswap : (`bool`, *optional*)
+                Defaults to `False`. Whether to substitute an existing (LoRA) adapter with the newly loaded adapter
+                in-place. This means that, instead of loading an additional adapter, this will take the existing
+                adapter weights and replace them with the weights of the new adapter. This can be faster and more
+                memory efficient. However, the main advantage of hotswapping is that when the model is compiled with
+                torch.compile, loading the new adapter does not require recompilation of the model. When using
+                hotswapping, the passed `adapter_name` should be the name of an already loaded adapter.
+                If the new adapter and the old adapter have different ranks and/or LoRA alphas (i.e. scaling), you need
+                to call an additional method before loading the adapter:
+                ```py
+                pipeline = ...  # load diffusers pipeline
+                max_rank = ...  # the highest rank among all LoRAs that you want to load
+                # call *before* compiling and loading the LoRA adapter
+                pipeline.enable_lora_hotswap(target_rank=max_rank)
+                pipeline.load_lora_weights(file_name)
+                # optionally compile the model now
+                ```
+                Note that hotswapping adapters of the text encoder is not yet supported. There are some further
+                limitations to this technique, which are documented here:
+                https://huggingface.co/docs/peft/main/en/package_reference/hotswap
+        Raises:
+            ValueError: If `low_cpu_mem_usage` is truthy while the installed `peft` is older than 0.13.0.
+        """
+        # Fail early: the low-memory loading path is gated on the peft version.
+        if low_cpu_mem_usage and is_peft_version("<", "0.13.0"):
+            raise ValueError(
+                "`low_cpu_mem_usage=True` is not compatible with this `peft` version. Please update it with `pip install -U peft`."
+            )
+        # Load the layers corresponding to transformer.
+        logger.info(f"Loading {cls.transformer_name}.")
+        # `network_alphas` is always passed as None here; no per-layer alpha overrides are supplied.
+        transformer.load_lora_adapter(
+            state_dict,
+            network_alphas=None,
+            adapter_name=adapter_name,
+            _pipeline=_pipeline,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            hotswap=hotswap,
+        )
+    @classmethod
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.save_lora_weights
+    def save_lora_weights(
+        cls,
+        save_directory: Union[str, os.PathLike],
+        transformer_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
+        is_main_process: bool = True,
+        weight_name: str = None,
+        save_function: Callable = None,
+        safe_serialization: bool = True,
+    ):
+        r"""
+        Save the LoRA parameters corresponding to the UNet and text encoder.
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to save LoRA parameters to. Will be created if it doesn't exist.
+            transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
+                State dict of the LoRA layers corresponding to the `transformer`.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful during distributed training and you
+                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
+                process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `True`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+        """
+        state_dict = {}
+        if not transformer_lora_layers:
+            raise ValueError("You must pass `transformer_lora_layers`.")
+        if transformer_lora_layers:
+            state_dict.update(
+                cls.pack_weights(transformer_lora_layers, cls.transformer_name)
+            )
+        # Save the model
+        cls.write_lora_layers(
+            state_dict=state_dict,
+            save_directory=save_directory,
+            is_main_process=is_main_process,
+            weight_name=weight_name,
+            save_function=save_function,
+            safe_serialization=safe_serialization,
+        )
+    # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.fuse_lora
+    def fuse_lora(
+        self,
+        components: List[str] = ["transformer"],
+        lora_scale: float = 1.0,
+        safe_fusing: bool = False,
+        adapter_names: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        r"""
+        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
+        This method only forwards its arguments to the parent class' `fuse_lora`.
+        <Tip warning={true}>
+        This is an experimental API.
+        </Tip>
+        Args:
+            components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. Defaults to
+                `["transformer"]`.
+            lora_scale (`float`, defaults to 1.0):
+                Controls how much to influence the outputs with the LoRA parameters.
+            safe_fusing (`bool`, defaults to `False`):
+                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+            adapter_names (`List[str]`, *optional*):
+                Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused.
+            kwargs (`dict`, *optional*):
+                Additional keyword arguments, passed through to the parent implementation unchanged.
+        Example:
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        ```
+        """
+        # NOTE(review): `components` defaults to a shared list object; it is not mutated here, but
+        # confirm the parent implementation does not mutate it either.
+        super().fuse_lora(
+            components=components,
+            lora_scale=lora_scale,
+            safe_fusing=safe_fusing,
+            adapter_names=adapter_names,
+            **kwargs,
+        )
+    # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.unfuse_lora
+    def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs):
+        r"""
+        Reverses the effect of
+        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora).
+        This method only forwards its arguments to the parent class' `unfuse_lora`.
+        <Tip warning={true}>
+        This is an experimental API.
+        </Tip>
+        Args:
+            components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. Defaults to
+                `["transformer"]`.
+            kwargs (`dict`, *optional*):
+                Additional keyword arguments, passed through to the parent implementation unchanged.
+        """
+        super().unfuse_lora(components=components, **kwargs)
+def cache_init(self, num_steps: int):
+    """
+    Initialization for cache.
+    """
+    cache_dic = {}
+    cache = {}
+    cache_index = {}
+    cache[-1] = {}
+    cache_index[-1] = {}
+    cache_index["layer_index"] = {}
+    cache[-1]["layers_stream"] = {}
+    cache_dic["cache_counter"] = 0
+    for j in range(len(self.transformer.layers)):
+        cache[-1]["layers_stream"][j] = {}
+        cache_index[-1][j] = {}
+    cache_dic["Delta-DiT"] = False
+    cache_dic["cache_type"] = "random"
+    cache_dic["cache_index"] = cache_index
+    cache_dic["cache"] = cache
+    cache_dic["fresh_ratio_schedule"] = "ToCa"
+    cache_dic["fresh_ratio"] = 0.0
+    cache_dic["fresh_threshold"] = 3
+    cache_dic["soft_fresh_weight"] = 0.0
+    cache_dic["taylor_cache"] = True
+    cache_dic["max_order"] = 4
+    cache_dic["first_enhance"] = 5
+    current = {}
+    current["activated_steps"] = [0]
+    current["step"] = 0
+    current["num_steps"] = num_steps
+    return cache_dic, current
 @dataclass

scheduler/scheduler_config.json CHANGED Viewed

@@ -1,18 +1,5 @@
 {
   "_class_name": "FlowMatchEulerDiscreteScheduler",
   "_diffusers_version": "0.34.0",
-  "base_image_seq_len": 256,
-  "base_shift": 0.5,
-  "invert_sigmas": false,
-  "max_image_seq_len": 4096,
-  "max_shift": 1.15,
-  "num_train_timesteps": 1000,
-  "shift": 1.0,
-  "shift_terminal": null,
-  "stochastic_sampling": false,
-  "time_shift_type": "exponential",
-  "use_beta_sigmas": false,
-  "use_dynamic_shifting": false,
-  "use_exponential_sigmas": false,
-  "use_karras_sigmas": false
 }

 {
   "_class_name": "FlowMatchEulerDiscreteScheduler",
   "_diffusers_version": "0.34.0",
+  "num_train_timesteps": 1000
 }

scheduler_fofpred.py ADDED Viewed

	@@ -0,0 +1,218 @@

+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders.lora_base import (  # noqa
+    LoraBaseMixin,
+    _fetch_state_dict,
+)
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.utils import BaseOutput
+@dataclass
+class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+    Args:
+        prev_sample (`torch.FloatTensor`):
+            The sample after one scheduler step. `prev_sample` should be used as next model input in the
+            denoising loop. Its shape presumably matches the `sample` passed to `step` (e.g.
+            `(batch_size, num_channels, height, width)` for images) — TODO confirm against the pipeline.
+    """
+    prev_sample: torch.FloatTensor
+class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler.
+    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
+    methods the library implements for all schedulers such as loading and saving.
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps to train the model.
+        timestep_spacing (`str`, defaults to `"linspace"`):
+            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
+            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
+        shift (`float`, defaults to 1.0):
+            The shift value for the timestep schedule.
+    """
+    _compatibles = []
+    order = 1
+    @register_to_config
+    def __init__(
+        self, num_train_timesteps: int = 1000, dynamic_time_shift: bool = True
+    ):
+        timesteps = torch.linspace(0, 1, num_train_timesteps + 1, dtype=torch.float32)[
+            :-1
+        ]
+        self.timesteps = timesteps
+        self._step_index = None
+        self._begin_index = None
+    @property
+    def step_index(self):
+        """
+        The index counter for current timestep. It will increase 1 after each scheduler step.
+        """
+        return self._step_index
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
+        """
+        return self._begin_index
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+    def index_for_timestep(self, timestep, schedule_timesteps=None):
+        if schedule_timesteps is None:
+            schedule_timesteps = self._timesteps
+        indices = (schedule_timesteps == timestep).nonzero()
+        # The sigma index that is taken for the **very** first `step`
+        # is always the second index (or the last index if there is only 1)
+        # This way we can ensure we don't accidentally skip a sigma in
+        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
+        pos = 1 if len(indices) > 1 else 0
+        return indices[pos].item()
+    # def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
+    #     return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+    def set_timesteps(
+        self,
+        num_inference_steps: int = None,
+        device: Union[str, torch.device] = None,
+        timesteps: Optional[List[float]] = None,
+        num_tokens: Optional[int] = None,
+    ):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        if timesteps is None:
+            self.num_inference_steps = num_inference_steps
+            timesteps = np.linspace(0, 1, num_inference_steps + 1, dtype=np.float32)[
+                :-1
+            ]
+            if self.config.dynamic_time_shift and num_tokens is not None:
+                m = (
+                    np.sqrt(num_tokens) / 40
+                )  # when input resolution is 320 * 320, m = 1, when input resolution is 1024 * 1024, m = 3.2
+                timesteps = timesteps / (m - m * timesteps + timesteps)
+        timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32, device=device)
+        _timesteps = torch.cat([timesteps, torch.ones(1, device=timesteps.device)])
+        self.timesteps = timesteps
+        self._timesteps = _timesteps
+        self._step_index = None
+        self._begin_index = None
+    def _init_step_index(self, timestep):
+        if self.begin_index is None:
+            if isinstance(timestep, torch.Tensor):
+                timestep = timestep.to(self.timesteps.device)
+            self._step_index = self.index_for_timestep(timestep)
+        else:
+            self._step_index = self._begin_index
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+        Args:
+            model_output (`torch.FloatTensor`):
+                The direct output from learned diffusion model.
+            timestep (`float`):
+                The current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                A current instance of a sample created by the diffusion process.
+            s_churn (`float`):
+            s_tmin  (`float`):
+            s_tmax  (`float`):
+            s_noise (`float`, defaults to 1.0):
+                Scaling factor for noise added to the sample.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            return_dict (`bool`):
+                Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
+                tuple.
+        Returns:
+            [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
+        """
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                (
+                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                    " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                    " one of the `scheduler.timesteps` as a timestep."
+                ),
+            )
+        if self.step_index is None:
+            self._init_step_index(timestep)
+        # Upcast to avoid precision issues when computing prev_sample
+        sample = sample.to(torch.float32)
+        t = self._timesteps[self.step_index]
+        t_next = self._timesteps[self.step_index + 1]
+        prev_sample = sample + (t_next - t) * model_output
+        # Cast sample back to model compatible dtype
+        prev_sample = prev_sample.to(model_output.dtype)
+        # upon completion increase step index by one
+        self._step_index += 1
+        if not return_dict:
+            return (prev_sample,)
+        return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
+    def __len__(self):
+        return self.config.num_train_timesteps

transformer/config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "_class_name": "OmniGen2Transformer3DModel",
   "_diffusers_version": "0.34.0",
-  "_name_or_path": "/export/home/public_repo/FOFPred/pretrained_models/hf_upload/transformer",
   "axes_dim_rope": [
     40,
     40,

 {
   "_class_name": "OmniGen2Transformer3DModel",
   "_diffusers_version": "0.34.0",
+  "_name_or_path": "pretrained_models/ft_023/transformer",
   "axes_dim_rope": [
     40,
     40,

transformer_fofpred.py ADDED Viewed

The diff for this file is too large to render. See raw diff

vae/config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "_class_name": "AutoencoderKL",
   "_diffusers_version": "0.34.0",
-  "_name_or_path": "/export/home/public_repo/FOFPred/pretrained_models/hf_upload/vae",
   "act_fn": "silu",
   "block_out_channels": [
     128,

 {
   "_class_name": "AutoencoderKL",
   "_diffusers_version": "0.34.0",
+  "_name_or_path": "/export/home/.cache/huggingface/hub/models--OmniGen2--OmniGen2/snapshots/df5dca8a981d74e6c3af214c145f5c735fe72367/vae",
   "act_fn": "silu",
   "block_out_channels": [
     128,