BiliSakura
/

iMF-diffusers

@@ -1,79 +1,47 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Hub custom pipeline: IMFPipeline.
 Load with native Hugging Face diffusers and trust_remote_code=True.
 """
 from __future__ import annotations
 import json
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
 import torch
-from diffusers.image_processor import VaeImageProcessor
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils.torch_utils import randn_tensor
-def _set_imf_timesteps(
-    scheduler: FlowMatchEulerDiscreteScheduler,
-    num_inference_steps: int,
-    device: torch.device,
-) -> torch.Tensor:
-    flow_sigmas = torch.linspace(1.0, 0.0, num_inference_steps + 1, device=device, dtype=torch.float32)
-    scheduler.set_timesteps(sigmas=flow_sigmas.tolist(), device=device)
-    return flow_sigmas
 class IMFPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for ImageNet class-conditional generation with Improved Mean Flows (iMF).
-    Parameters:
-        transformer ([`IMFTransformer2DModel`]):
-            Class-conditioned iMF transformer that predicts mean-flow velocity.
-        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
-            Flow-matching Euler scheduler.
-        vae ([`AutoencoderKL`]):
-            Variational autoencoder used to decode transformer latents to pixels.
-        id2label (`dict[int, str]`, *optional*):
-            ImageNet class id to English label mapping.
-    """
-    model_cpu_offload_seq = "transformer->vae"
     def __init__(
         self,
         transformer,
         scheduler,
-        vae,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         if scheduler is None:
-            scheduler = FlowMatchEulerDiscreteScheduler(
-                num_train_timesteps=1000,
-                shift=1.0,
-                stochastic_sampling=False,
-            )
-        self.register_modules(transformer=transformer, scheduler=scheduler, vae=vae)
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
         self._labels_loaded_from_model_index = bool(self._id2label)
@@ -125,12 +93,13 @@ class IMFPipeline(DiffusionPipeline):
         self._ensure_labels_loaded()
         if not self.labels:
             raise ValueError("No labels loaded. Ensure `id2label` exists in model_index.json.")
-        labels = [label] if isinstance(label, str) else label
-        missing = [item for item in labels if item not in self.labels]
         if missing:
             preview = ", ".join(list(self.labels.keys())[:8])
             raise ValueError(f"Unknown label(s): {missing}. Example valid labels: {preview}, ...")
-        return [self.labels[item] for item in labels]
     def _normalize_class_labels(self, class_labels: Union[int, str, List[Union[int, str]]]) -> List[int]:
         if isinstance(class_labels, int):
@@ -153,16 +122,12 @@ class IMFPipeline(DiffusionPipeline):
         guidance_interval_end: float,
         do_classifier_free_guidance: bool,
     ) -> torch.Tensor:
-        dtype = latents.dtype
-        timestep = timestep.to(device=latents.device, dtype=dtype)
-        time_gap = time_gap.to(device=latents.device, dtype=dtype)
         if do_classifier_free_guidance:
             latents_in = torch.cat([latents, latents], dim=0)
             labels = torch.cat([class_labels, class_null], dim=0)
-            omega = torch.tensor([guidance_scale, 1.0], device=latents.device, dtype=dtype)
-            t_min = torch.tensor([guidance_interval_start, 0.0], device=latents.device, dtype=dtype)
-            t_max = torch.tensor([guidance_interval_end, 1.0], device=latents.device, dtype=dtype)
             batch = latents.shape[0]
             timestep_in = timestep.reshape(1).repeat(2 * batch)
             time_gap_in = time_gap.reshape(1).repeat(2 * batch)
@@ -175,9 +140,9 @@ class IMFPipeline(DiffusionPipeline):
             batch = latents.shape[0]
             timestep_in = timestep.reshape(1).repeat(batch)
             time_gap_in = time_gap.reshape(1).repeat(batch)
-            omega = torch.full((batch,), guidance_scale, device=latents.device, dtype=dtype)
-            t_min = torch.full((batch,), guidance_interval_start, device=latents.device, dtype=dtype)
-            t_max = torch.full((batch,), guidance_interval_end, device=latents.device, dtype=dtype)
         outputs = self.transformer(
             sample=latents_in,
@@ -197,17 +162,6 @@ class IMFPipeline(DiffusionPipeline):
         u_cond, u_uncond = velocity_u.chunk(2, dim=0)
         return u_uncond + guidance_scale * (u_cond - u_uncond)
-    def decode_latents(self, latents: torch.Tensor, output_type: str = "pil"):
-        if output_type == "latent":
-            return latents
-        scaling_factor = self.vae.config.scaling_factor
-        latents = latents.to(device=self.vae.device, dtype=self.vae.dtype)
-        image = self.vae.decode(latents / scaling_factor).sample
-        if output_type == "pt":
-            return image
-        return self.image_processor.postprocess(image, output_type=output_type)
     @torch.inference_mode()
     def __call__(
         self,
@@ -218,7 +172,7 @@ class IMFPipeline(DiffusionPipeline):
         guidance_interval_end: float = 0.9,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
-        output_type: str = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
         if output_type not in {"pil", "np", "pt", "latent"}:
@@ -230,7 +184,7 @@ class IMFPipeline(DiffusionPipeline):
         image_size = int(self.transformer.config.sample_size)
         channels = int(self.transformer.config.in_channels)
-        null_class_val = int(self.transformer.config.num_classes)
         if latents is None:
             latents = randn_tensor(
@@ -244,11 +198,14 @@ class IMFPipeline(DiffusionPipeline):
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
         class_null = torch.full_like(class_labels_t, null_class_val)
-        flow_sigmas = _set_imf_timesteps(self.scheduler, num_inference_steps, latents.device)
         for i in self.progress_bar(range(num_inference_steps)):
-            t = flow_sigmas[i]
-            t_next = flow_sigmas[i + 1]
             time_gap = t - t_next
             velocity_u = self._predict_velocity_u(
                 latents,
@@ -261,9 +218,18 @@ class IMFPipeline(DiffusionPipeline):
                 guidance_interval_end,
                 do_classifier_free_guidance,
             )
-            latents = self.scheduler.step(velocity_u, self.scheduler.timesteps[i], latents).prev_sample
-        images = self.decode_latents(latents, output_type=output_type)
         self.maybe_free_model_hooks()
@@ -271,5 +237,4 @@ class IMFPipeline(DiffusionPipeline):
             return (images,)
         return ImagePipelineOutput(images=images)
-IMFPipelineOutput = ImagePipelineOutput

 """Hub custom pipeline: IMFPipeline.
 Load with native Hugging Face diffusers and trust_remote_code=True.
 """
 from __future__ import annotations
+import inspect
 import json
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union, Any
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from diffusers.utils.torch_utils import randn_tensor
 class IMFPipeline(DiffusionPipeline):
+    @staticmethod
+    def prepare_extra_step_kwargs(  # collect the optional kwargs that `scheduler.step` declares
+        scheduler,  # object whose `step` signature is inspected
+        generator=None,  # forwarded whenever `step` has a `generator` parameter, even if None
+        eta: float | None = None,  # forwarded only when not None and `step` has an `eta` parameter
+    ):
+        kwargs = {}
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())  # parameter names of `step`
+        if "generator" in step_params:
+            kwargs["generator"] = generator
+        if eta is not None and "eta" in step_params:
+            kwargs["eta"] = eta
+        return kwargs  # may be empty when `step` declares neither parameter
+    model_cpu_offload_seq = "transformer"
     def __init__(
         self,
         transformer,
         scheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         if scheduler is None:
+            raise ValueError("IMFPipeline requires a scheduler loaded from the checkpoint.")
+        self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
         self._labels_loaded_from_model_index = bool(self._id2label)
         self._ensure_labels_loaded()
         if not self.labels:
             raise ValueError("No labels loaded. Ensure `id2label` exists in model_index.json.")
+        if isinstance(label, str):
+            label = [label]
+        missing = [item for item in label if item not in self.labels]
         if missing:
             preview = ", ".join(list(self.labels.keys())[:8])
             raise ValueError(f"Unknown label(s): {missing}. Example valid labels: {preview}, ...")
+        return [self.labels[item] for item in label]
     def _normalize_class_labels(self, class_labels: Union[int, str, List[Union[int, str]]]) -> List[int]:
         if isinstance(class_labels, int):
         guidance_interval_end: float,
         do_classifier_free_guidance: bool,
     ) -> torch.Tensor:
         if do_classifier_free_guidance:
             latents_in = torch.cat([latents, latents], dim=0)
             labels = torch.cat([class_labels, class_null], dim=0)
+            omega = torch.tensor([guidance_scale, 1.0], device=latents.device, dtype=latents.dtype)
+            t_min = torch.tensor([guidance_interval_start, 0.0], device=latents.device, dtype=latents.dtype)
+            t_max = torch.tensor([guidance_interval_end, 1.0], device=latents.device, dtype=latents.dtype)
             batch = latents.shape[0]
             timestep_in = timestep.reshape(1).repeat(2 * batch)
             time_gap_in = time_gap.reshape(1).repeat(2 * batch)
             batch = latents.shape[0]
             timestep_in = timestep.reshape(1).repeat(batch)
             time_gap_in = time_gap.reshape(1).repeat(batch)
+            omega = torch.full((batch,), guidance_scale, device=latents.device, dtype=latents.dtype)
+            t_min = torch.full((batch,), guidance_interval_start, device=latents.device, dtype=latents.dtype)
+            t_max = torch.full((batch,), guidance_interval_end, device=latents.device, dtype=latents.dtype)
         outputs = self.transformer(
             sample=latents_in,
         u_cond, u_uncond = velocity_u.chunk(2, dim=0)
         return u_uncond + guidance_scale * (u_cond - u_uncond)
     @torch.inference_mode()
     def __call__(
         self,
         guidance_interval_end: float = 0.9,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
         if output_type not in {"pil", "np", "pt", "latent"}:
         image_size = int(self.transformer.config.sample_size)
         channels = int(self.transformer.config.in_channels)
+        null_class_val = int(getattr(self.transformer.config, "num_classes", 1000))
         if latents is None:
             latents = randn_tensor(
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
         class_null = torch.full_like(class_labels_t, null_class_val)
+        self.scheduler.set_timesteps(num_inference_steps, device=latents.device)
+        timesteps = self.scheduler.timesteps
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
         for i in self.progress_bar(range(num_inference_steps)):
+            t = timesteps[i]
+            t_next = timesteps[i + 1]
             time_gap = t - t_next
             velocity_u = self._predict_velocity_u(
                 latents,
                 guidance_interval_end,
                 do_classifier_free_guidance,
             )
+            latents = self.scheduler.step(velocity_u, t, latents, **extra_step_kwargs).prev_sample
+        if output_type == "latent":
+            images = latents
+        else:
+            images_pt = latents.float().clamp(-4, 4)
+            if output_type == "pt":
+                images = images_pt
+            elif output_type == "np":
+                images = images_pt.cpu().permute(0, 2, 3, 1).numpy()
+            else:
+                images = self.numpy_to_pil(images_pt.cpu().permute(0, 2, 3, 1).numpy())
         self.maybe_free_model_hooks()
             return (images,)
         return ImagePipelineOutput(images=images)
+IMFPipelineOutput = ImagePipelineOutput