Fix generator determinism: forward generator through scheduler steps and seeded noise

Browse files

Files changed (6) hide show

JiT-B-16/pipeline.py +59 -151
JiT-B-32/pipeline.py +59 -151
JiT-H-16/pipeline.py +59 -151
JiT-H-32/pipeline.py +59 -151
JiT-L-16/pipeline.py +59 -151
JiT-L-32/pipeline.py +59 -151

JiT-B-16/pipeline.py CHANGED Viewed

@@ -1,36 +1,24 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib
 import json
-import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import FlowMatchHeunDiscreteScheduler, KarrasDiffusionSchedulers
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
@@ -44,100 +32,43 @@ class JiTPipeline(DiffusionPipeline):
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
-    model_cpu_offload_seq = "transformer"
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
-        """Load a self-contained variant folder locally or from the Hub.
-        Examples:
-            JiTPipeline.from_pretrained(".")
-            JiTPipeline.from_pretrained("./JiT-H-32")
-            DiffusionPipeline.from_pretrained("BiliSakura/JiT-diffusers", subfolder="JiT-H-32", trust_remote_code=True)
-        """
-        repo_root = Path(__file__).resolve().parent
-        if pretrained_model_name_or_path in (None, "", "."):
-            variant = repo_root
-        elif (
-            isinstance(pretrained_model_name_or_path, str)
-            and "/" in pretrained_model_name_or_path
-            and not Path(pretrained_model_name_or_path).exists()
-        ):
-            from huggingface_hub import snapshot_download
-            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
-            if subfolder:
-                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
-            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
-            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
-        else:
-            variant = Path(pretrained_model_name_or_path)
-            if not variant.is_absolute():
-                candidate = (Path.cwd() / variant).resolve()
-                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
-            if subfolder:
-                variant = variant / subfolder
-        id2label_override = kwargs.pop("id2label", None)
-        model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        try:
-            transformer = _load_component("transformer", "jit_transformer_2d", "JiTTransformer2DModel")
-            try:
-                scheduler = FlowMatchHeunDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
-            except Exception:
-                scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
-            if transformer is None:
-                raise ValueError(f"No loadable transformer found under {variant}")
-            variant_path = str(variant)
-            model_index_path = variant / "model_index.json"
-            id2label = id2label_override or cls._read_id2label_from_model_index(model_index_path)
-            pipe = cls(
-                transformer=transformer,
-                scheduler=scheduler,
-                id2label=id2label,
-            )
-            if variant_path and hasattr(pipe, "register_to_config"):
-                pipe.register_to_config(_name_or_path=variant_path)
-            return pipe
-        finally:
-            for comp_path in inserted:
-                if comp_path in sys.path:
-                    sys.path.remove(comp_path)
     def __init__(
         self,
         transformer,
-        scheduler: FlowMatchHeunDiscreteScheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         scheduler = scheduler or FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
@@ -146,7 +77,11 @@ class JiTPipeline(DiffusionPipeline):
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
-    def _read_id2label_from_model_index(model_index_path: Path) -> Dict[int, str]:
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
@@ -167,20 +102,16 @@ class JiTPipeline(DiffusionPipeline):
     @property
     def id2label(self) -> Dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-        Map ImageNet label strings to class ids.
-        Args:
-            label (`str` or `list[str]`):
-                One or more English label strings. Each string must match a synonym in `id2label`.
-        """
         label2id = self.labels
         if not label2id:
-            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
         if isinstance(label, str):
             label = [label]
@@ -188,9 +119,7 @@ class JiTPipeline(DiffusionPipeline):
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown English label(s): {missing}. Example valid labels: {preview}, ..."
-            )
         return [label2id[item] for item in label]
     def _normalize_class_labels(
@@ -225,33 +154,10 @@ class JiTPipeline(DiffusionPipeline):
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
-        r"""
-        Generate class-conditional images.
-        Args:
-            class_labels (`int`, `str`, `list[int]`, or `list[str]`):
-                ImageNet class indices or human-readable English label strings.
-            guidance_scale (`float`, *optional*):
-                Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
-            guidance_interval_min (`float`, defaults to `0.1`):
-                Lower bound of the CFG interval in flow time `t in [0, 1]`.
-            guidance_interval_max (`float`, defaults to `1.0`):
-                Upper bound of the CFG interval in flow time.
-            noise_scale (`float`, *optional*):
-                Initial Gaussian noise scale (`1.0` for 256px, `2.0` for 512px by default).
-            t_eps (`float`, defaults to `5e-2`):
-                Epsilon clamp for the `1 - t` denominator, matching JiT source defaults.
-            generator (`torch.Generator`, *optional*):
-                RNG for reproducibility.
-            num_inference_steps (`int`, defaults to `50`):
-                Number of solver steps (at least 2).
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                `"pil"`, `"np"`, or `"pt"`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Return [`ImagePipelineOutput`] if True.
-        """
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
@@ -268,22 +174,21 @@ class JiTPipeline(DiffusionPipeline):
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
-        null_class_val = int(self.transformer.config.num_classes)
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
-        latents = (
-            randn_tensor(
                 shape=(batch_size, channels, height, width),
-                generator=generator,
-                device=self._execution_device,
-                dtype=self.transformer.dtype,
-            )
-            * noise_scale
-        )
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
@@ -295,6 +200,7 @@ class JiTPipeline(DiffusionPipeline):
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
@@ -329,7 +235,7 @@ class JiTPipeline(DiffusionPipeline):
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
-            latents = self.scheduler.step(model_output, t, latents).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
@@ -344,3 +250,5 @@ class JiTPipeline(DiffusionPipeline):
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)

+"""Hub custom pipeline: JiTPipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+import inspect
 import json
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union, Any
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator=None,
+        eta: float | None = None,
+    ):
+        kwargs = {}
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())
+        if "generator" in step_params:
+            kwargs["generator"] = generator
+        if eta is not None and "eta" in step_params:
+            kwargs["eta"] = eta
+        return kwargs
+    model_cpu_offload_seq = "transformer"
     def __init__(
         self,
         transformer,
+        scheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         scheduler = scheduler or FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = bool(self._id2label)
+    def _ensure_labels_loaded(self) -> None:
+        if self._labels_loaded_from_model_index:
+            return
+        loaded = self._read_id2label_from_model_index(getattr(self.config, "_name_or_path", None))
+        if loaded:
+            self._id2label = loaded
+            self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = True
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        variant_dir = Path(variant_path).resolve()
+        model_index_path = variant_dir / "model_index.json"
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
     @property
     def id2label(self) -> Dict[int, str]:
+        self._ensure_labels_loaded()
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        self._ensure_labels_loaded()
         label2id = self.labels
         if not label2id:
+            raise ValueError(
+                "No English labels loaded. Ensure `id2label` exists in model_index.json."
+            )
         if isinstance(label, str):
             label = [label]
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
         return [label2id[item] for item in label]
     def _normalize_class_labels(
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
+        if output_type not in {"pil", "np", "pt"}:
+            raise ValueError("output_type must be one of: 'pil', 'np', 'pt'.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
+        null_class_val = int(
+            getattr(self.transformer.config, "num_classes", getattr(self.transformer.config, "num_class_embeds", 1000))
+        )
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
+        latents = randn_tensor(
                 shape=(batch_size, channels, height, width),
+            generator=generator,
+            device=self._execution_device,
+            dtype=self.transformer.dtype,
+        ) * noise_scale
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
+            latents = self.scheduler.step(model_output, t, latents, **extra_step_kwargs).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)
+JiTPipelineOutput = ImagePipelineOutput

JiT-B-32/pipeline.py CHANGED Viewed

@@ -1,36 +1,24 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib
 import json
-import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import FlowMatchHeunDiscreteScheduler, KarrasDiffusionSchedulers
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
@@ -44,100 +32,43 @@ class JiTPipeline(DiffusionPipeline):
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
-    model_cpu_offload_seq = "transformer"
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
-        """Load a self-contained variant folder locally or from the Hub.
-        Examples:
-            JiTPipeline.from_pretrained(".")
-            JiTPipeline.from_pretrained("./JiT-H-32")
-            DiffusionPipeline.from_pretrained("BiliSakura/JiT-diffusers", subfolder="JiT-H-32", trust_remote_code=True)
-        """
-        repo_root = Path(__file__).resolve().parent
-        if pretrained_model_name_or_path in (None, "", "."):
-            variant = repo_root
-        elif (
-            isinstance(pretrained_model_name_or_path, str)
-            and "/" in pretrained_model_name_or_path
-            and not Path(pretrained_model_name_or_path).exists()
-        ):
-            from huggingface_hub import snapshot_download
-            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
-            if subfolder:
-                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
-            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
-            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
-        else:
-            variant = Path(pretrained_model_name_or_path)
-            if not variant.is_absolute():
-                candidate = (Path.cwd() / variant).resolve()
-                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
-            if subfolder:
-                variant = variant / subfolder
-        id2label_override = kwargs.pop("id2label", None)
-        model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        try:
-            transformer = _load_component("transformer", "jit_transformer_2d", "JiTTransformer2DModel")
-            try:
-                scheduler = FlowMatchHeunDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
-            except Exception:
-                scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
-            if transformer is None:
-                raise ValueError(f"No loadable transformer found under {variant}")
-            variant_path = str(variant)
-            model_index_path = variant / "model_index.json"
-            id2label = id2label_override or cls._read_id2label_from_model_index(model_index_path)
-            pipe = cls(
-                transformer=transformer,
-                scheduler=scheduler,
-                id2label=id2label,
-            )
-            if variant_path and hasattr(pipe, "register_to_config"):
-                pipe.register_to_config(_name_or_path=variant_path)
-            return pipe
-        finally:
-            for comp_path in inserted:
-                if comp_path in sys.path:
-                    sys.path.remove(comp_path)
     def __init__(
         self,
         transformer,
-        scheduler: FlowMatchHeunDiscreteScheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         scheduler = scheduler or FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
@@ -146,7 +77,11 @@ class JiTPipeline(DiffusionPipeline):
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
-    def _read_id2label_from_model_index(model_index_path: Path) -> Dict[int, str]:
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
@@ -167,20 +102,16 @@ class JiTPipeline(DiffusionPipeline):
     @property
     def id2label(self) -> Dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-        Map ImageNet label strings to class ids.
-        Args:
-            label (`str` or `list[str]`):
-                One or more English label strings. Each string must match a synonym in `id2label`.
-        """
         label2id = self.labels
         if not label2id:
-            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
         if isinstance(label, str):
             label = [label]
@@ -188,9 +119,7 @@ class JiTPipeline(DiffusionPipeline):
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown English label(s): {missing}. Example valid labels: {preview}, ..."
-            )
         return [label2id[item] for item in label]
     def _normalize_class_labels(
@@ -225,33 +154,10 @@ class JiTPipeline(DiffusionPipeline):
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
-        r"""
-        Generate class-conditional images.
-        Args:
-            class_labels (`int`, `str`, `list[int]`, or `list[str]`):
-                ImageNet class indices or human-readable English label strings.
-            guidance_scale (`float`, *optional*):
-                Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
-            guidance_interval_min (`float`, defaults to `0.1`):
-                Lower bound of the CFG interval in flow time `t in [0, 1]`.
-            guidance_interval_max (`float`, defaults to `1.0`):
-                Upper bound of the CFG interval in flow time.
-            noise_scale (`float`, *optional*):
-                Initial Gaussian noise scale (`1.0` for 256px, `2.0` for 512px by default).
-            t_eps (`float`, defaults to `5e-2`):
-                Epsilon clamp for the `1 - t` denominator, matching JiT source defaults.
-            generator (`torch.Generator`, *optional*):
-                RNG for reproducibility.
-            num_inference_steps (`int`, defaults to `50`):
-                Number of solver steps (at least 2).
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                `"pil"`, `"np"`, or `"pt"`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Return [`ImagePipelineOutput`] if True.
-        """
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
@@ -268,22 +174,21 @@ class JiTPipeline(DiffusionPipeline):
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
-        null_class_val = int(self.transformer.config.num_classes)
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
-        latents = (
-            randn_tensor(
                 shape=(batch_size, channels, height, width),
-                generator=generator,
-                device=self._execution_device,
-                dtype=self.transformer.dtype,
-            )
-            * noise_scale
-        )
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
@@ -295,6 +200,7 @@ class JiTPipeline(DiffusionPipeline):
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
@@ -329,7 +235,7 @@ class JiTPipeline(DiffusionPipeline):
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
-            latents = self.scheduler.step(model_output, t, latents).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
@@ -344,3 +250,5 @@ class JiTPipeline(DiffusionPipeline):
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)

+"""Hub custom pipeline: JiTPipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+import inspect
 import json
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union, Any
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator=None,
+        eta: float | None = None,
+    ):
+        kwargs = {}
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())
+        if "generator" in step_params:
+            kwargs["generator"] = generator
+        if eta is not None and "eta" in step_params:
+            kwargs["eta"] = eta
+        return kwargs
+    model_cpu_offload_seq = "transformer"
     def __init__(
         self,
         transformer,
+        scheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         scheduler = scheduler or FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = bool(self._id2label)
+    def _ensure_labels_loaded(self) -> None:
+        if self._labels_loaded_from_model_index:
+            return
+        loaded = self._read_id2label_from_model_index(getattr(self.config, "_name_or_path", None))
+        if loaded:
+            self._id2label = loaded
+            self.labels = self._build_label2id(self._id2label)
+        self._labels_loaded_from_model_index = True
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        variant_dir = Path(variant_path).resolve()
+        model_index_path = variant_dir / "model_index.json"
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
     @property
     def id2label(self) -> Dict[int, str]:
+        self._ensure_labels_loaded()
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        self._ensure_labels_loaded()
         label2id = self.labels
         if not label2id:
+            raise ValueError(
+                "No English labels loaded. Ensure `id2label` exists in model_index.json."
+            )
         if isinstance(label, str):
             label = [label]
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
         return [label2id[item] for item in label]
     def _normalize_class_labels(
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
+        if output_type not in {"pil", "np", "pt"}:
+            raise ValueError("output_type must be one of: 'pil', 'np', 'pt'.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
+        null_class_val = int(
+            getattr(self.transformer.config, "num_classes", getattr(self.transformer.config, "num_class_embeds", 1000))
+        )
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
+        latents = randn_tensor(
                 shape=(batch_size, channels, height, width),
+            generator=generator,
+            device=self._execution_device,
+            dtype=self.transformer.dtype,
+        ) * noise_scale
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
+            latents = self.scheduler.step(model_output, t, latents, **extra_step_kwargs).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)
+JiTPipelineOutput = ImagePipelineOutput

JiT-H-16/pipeline.py CHANGED Viewed

@@ -1,36 +1,24 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib
 import json
-import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import FlowMatchHeunDiscreteScheduler, KarrasDiffusionSchedulers
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
@@ -44,100 +32,43 @@ class JiTPipeline(DiffusionPipeline):
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
-    model_cpu_offload_seq = "transformer"
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
-        """Load a self-contained variant folder locally or from the Hub.
-        Examples:
-            JiTPipeline.from_pretrained(".")
-            JiTPipeline.from_pretrained("./JiT-H-32")
-            DiffusionPipeline.from_pretrained("BiliSakura/JiT-diffusers", subfolder="JiT-H-32", trust_remote_code=True)
-        """
-        repo_root = Path(__file__).resolve().parent
-        if pretrained_model_name_or_path in (None, "", "."):
-            variant = repo_root
-        elif (
-            isinstance(pretrained_model_name_or_path, str)
-            and "/" in pretrained_model_name_or_path
-            and not Path(pretrained_model_name_or_path).exists()
-        ):
-            from huggingface_hub import snapshot_download
-            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
-            if subfolder:
-                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
-            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
-            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
-        else:
-            variant = Path(pretrained_model_name_or_path)
-            if not variant.is_absolute():
-                candidate = (Path.cwd() / variant).resolve()
-                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
-            if subfolder:
-                variant = variant / subfolder
-        id2label_override = kwargs.pop("id2label", None)
-        model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        try:
-            transformer = _load_component("transformer", "jit_transformer_2d", "JiTTransformer2DModel")
-            try:
-                scheduler = FlowMatchHeunDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
-            except Exception:
-                scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
-            if transformer is None:
-                raise ValueError(f"No loadable transformer found under {variant}")
-            variant_path = str(variant)
-            model_index_path = variant / "model_index.json"
-            id2label = id2label_override or cls._read_id2label_from_model_index(model_index_path)
-            pipe = cls(
-                transformer=transformer,
-                scheduler=scheduler,
-                id2label=id2label,
-            )
-            if variant_path and hasattr(pipe, "register_to_config"):
-                pipe.register_to_config(_name_or_path=variant_path)
-            return pipe
-        finally:
-            for comp_path in inserted:
-                if comp_path in sys.path:
-                    sys.path.remove(comp_path)
     def __init__(
         self,
         transformer,
-        scheduler: FlowMatchHeunDiscreteScheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         scheduler = scheduler or FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
@@ -146,7 +77,11 @@ class JiTPipeline(DiffusionPipeline):
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
-    def _read_id2label_from_model_index(model_index_path: Path) -> Dict[int, str]:
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
@@ -167,20 +102,16 @@ class JiTPipeline(DiffusionPipeline):
     @property
     def id2label(self) -> Dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-        Map ImageNet label strings to class ids.
-        Args:
-            label (`str` or `list[str]`):
-                One or more English label strings. Each string must match a synonym in `id2label`.
-        """
         label2id = self.labels
         if not label2id:
-            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
         if isinstance(label, str):
             label = [label]
@@ -188,9 +119,7 @@ class JiTPipeline(DiffusionPipeline):
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown English label(s): {missing}. Example valid labels: {preview}, ..."
-            )
         return [label2id[item] for item in label]
     def _normalize_class_labels(
@@ -225,33 +154,10 @@ class JiTPipeline(DiffusionPipeline):
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
-        r"""
-        Generate class-conditional images.
-        Args:
-            class_labels (`int`, `str`, `list[int]`, or `list[str]`):
-                ImageNet class indices or human-readable English label strings.
-            guidance_scale (`float`, *optional*):
-                Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
-            guidance_interval_min (`float`, defaults to `0.1`):
-                Lower bound of the CFG interval in flow time `t in [0, 1]`.
-            guidance_interval_max (`float`, defaults to `1.0`):
-                Upper bound of the CFG interval in flow time.
-            noise_scale (`float`, *optional*):
-                Initial Gaussian noise scale (`1.0` for 256px, `2.0` for 512px by default).
-            t_eps (`float`, defaults to `5e-2`):
-                Epsilon clamp for the `1 - t` denominator, matching JiT source defaults.
-            generator (`torch.Generator`, *optional*):
-                RNG for reproducibility.
-            num_inference_steps (`int`, defaults to `50`):
-                Number of solver steps (at least 2).
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                `"pil"`, `"np"`, or `"pt"`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Return [`ImagePipelineOutput`] if True.
-        """
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
@@ -268,22 +174,21 @@ class JiTPipeline(DiffusionPipeline):
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
-        null_class_val = int(self.transformer.config.num_classes)
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
-        latents = (
-            randn_tensor(
                 shape=(batch_size, channels, height, width),
-                generator=generator,
-                device=self._execution_device,
-                dtype=self.transformer.dtype,
-            )
-            * noise_scale
-        )
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
@@ -295,6 +200,7 @@ class JiTPipeline(DiffusionPipeline):
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
@@ -329,7 +235,7 @@ class JiTPipeline(DiffusionPipeline):
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
-            latents = self.scheduler.step(model_output, t, latents).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
@@ -344,3 +250,5 @@ class JiTPipeline(DiffusionPipeline):
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)

+"""Hub custom pipeline: JiTPipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+import inspect
 import json
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union, Any
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator=None,
+        eta: float | None = None,
+    ):
+        """Collect the optional keyword arguments that `scheduler.step` declares.
+        Args:
+            scheduler: Object whose `step` signature is inspected.
+            generator: Forwarded under the key `"generator"` (even when `None`) if `step`
+                has a parameter of that name.
+            eta: Forwarded under the key `"eta"` only when it is not `None` and `step`
+                has a parameter of that name.
+        Returns:
+            Dict of keyword arguments suitable for `scheduler.step(..., **kwargs)`.
+        """
+        accepted = inspect.signature(scheduler.step).parameters
+        extra = {}
+        if "generator" in accepted:
+            extra["generator"] = generator
+        if "eta" in accepted and eta is not None:
+            extra["eta"] = eta
+        return extra
+    model_cpu_offload_seq = "transformer"
     def __init__(
         self,
         transformer,
+        scheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
+        """Register the pipeline modules and initialise the label lookup tables.
+        Args:
+            transformer: Model registered under the name `transformer`.
+            scheduler: Scheduler registered under the name `scheduler`. When `None`,
+                a `FlowMatchHeunDiscreteScheduler(shift=4.0)` is created instead.
+            id2label: Optional class-id to label mapping; keys may be ints or strings.
+        """
         super().__init__()
+        if scheduler is None:
+            # The scheduler class is not imported at module level in this file, so the
+            # fallback imports it locally; otherwise this branch raises NameError.
+            from diffusers.schedulers import FlowMatchHeunDiscreteScheduler
+            scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
+        # Explicitly supplied labels make the lazy lookup in `_ensure_labels_loaded` a no-op.
+        self._labels_loaded_from_model_index = bool(self._id2label)
+    def _ensure_labels_loaded(self) -> None:
+        """Fill the label tables from `model_index.json` the first time they are needed.
+        The directory is taken from `self.config._name_or_path` when that attribute exists.
+        A non-empty result replaces `self._id2label` and rebuilds `self.labels`.
+        """
+        if not self._labels_loaded_from_model_index:
+            source_dir = getattr(self.config, "_name_or_path", None)
+            id2label = self._read_id2label_from_model_index(source_dir)
+            if id2label:
+                self._id2label = id2label
+                self.labels = self._build_label2id(id2label)
+            # Set the flag even on a miss so the file lookup is attempted only once.
+            self._labels_loaded_from_model_index = True
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        variant_dir = Path(variant_path).resolve()
+        model_index_path = variant_dir / "model_index.json"
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
     @property
     def id2label(self) -> Dict[int, str]:
+        """Class id to label mapping; runs the lazy label load before returning it."""
+        self._ensure_labels_loaded()
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        self._ensure_labels_loaded()
         label2id = self.labels
         if not label2id:
+            raise ValueError(
+                "No English labels loaded. Ensure `id2label` exists in model_index.json."
+            )
         if isinstance(label, str):
             label = [label]
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
         return [label2id[item] for item in label]
     def _normalize_class_labels(
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
+        if output_type not in {"pil", "np", "pt"}:
+            raise ValueError("output_type must be one of: 'pil', 'np', 'pt'.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
+        null_class_val = int(
+            getattr(self.transformer.config, "num_classes", getattr(self.transformer.config, "num_class_embeds", 1000))
+        )
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
+        latents = randn_tensor(
                 shape=(batch_size, channels, height, width),
+            generator=generator,
+            device=self._execution_device,
+            dtype=self.transformer.dtype,
+        ) * noise_scale
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
+            latents = self.scheduler.step(model_output, t, latents, **extra_step_kwargs).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)
+JiTPipelineOutput = ImagePipelineOutput

JiT-H-32/pipeline.py CHANGED Viewed

@@ -1,36 +1,24 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib
 import json
-import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import FlowMatchHeunDiscreteScheduler, KarrasDiffusionSchedulers
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
@@ -44,100 +32,43 @@ class JiTPipeline(DiffusionPipeline):
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
-    model_cpu_offload_seq = "transformer"
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
-        """Load a self-contained variant folder locally or from the Hub.
-        Examples:
-            JiTPipeline.from_pretrained(".")
-            JiTPipeline.from_pretrained("./JiT-H-32")
-            DiffusionPipeline.from_pretrained("BiliSakura/JiT-diffusers", subfolder="JiT-H-32", trust_remote_code=True)
-        """
-        repo_root = Path(__file__).resolve().parent
-        if pretrained_model_name_or_path in (None, "", "."):
-            variant = repo_root
-        elif (
-            isinstance(pretrained_model_name_or_path, str)
-            and "/" in pretrained_model_name_or_path
-            and not Path(pretrained_model_name_or_path).exists()
-        ):
-            from huggingface_hub import snapshot_download
-            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
-            if subfolder:
-                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
-            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
-            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
-        else:
-            variant = Path(pretrained_model_name_or_path)
-            if not variant.is_absolute():
-                candidate = (Path.cwd() / variant).resolve()
-                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
-            if subfolder:
-                variant = variant / subfolder
-        id2label_override = kwargs.pop("id2label", None)
-        model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        try:
-            transformer = _load_component("transformer", "jit_transformer_2d", "JiTTransformer2DModel")
-            try:
-                scheduler = FlowMatchHeunDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
-            except Exception:
-                scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
-            if transformer is None:
-                raise ValueError(f"No loadable transformer found under {variant}")
-            variant_path = str(variant)
-            model_index_path = variant / "model_index.json"
-            id2label = id2label_override or cls._read_id2label_from_model_index(model_index_path)
-            pipe = cls(
-                transformer=transformer,
-                scheduler=scheduler,
-                id2label=id2label,
-            )
-            if variant_path and hasattr(pipe, "register_to_config"):
-                pipe.register_to_config(_name_or_path=variant_path)
-            return pipe
-        finally:
-            for comp_path in inserted:
-                if comp_path in sys.path:
-                    sys.path.remove(comp_path)
     def __init__(
         self,
         transformer,
-        scheduler: FlowMatchHeunDiscreteScheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         scheduler = scheduler or FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
@@ -146,7 +77,11 @@ class JiTPipeline(DiffusionPipeline):
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
-    def _read_id2label_from_model_index(model_index_path: Path) -> Dict[int, str]:
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
@@ -167,20 +102,16 @@ class JiTPipeline(DiffusionPipeline):
     @property
     def id2label(self) -> Dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-        Map ImageNet label strings to class ids.
-        Args:
-            label (`str` or `list[str]`):
-                One or more English label strings. Each string must match a synonym in `id2label`.
-        """
         label2id = self.labels
         if not label2id:
-            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
         if isinstance(label, str):
             label = [label]
@@ -188,9 +119,7 @@ class JiTPipeline(DiffusionPipeline):
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown English label(s): {missing}. Example valid labels: {preview}, ..."
-            )
         return [label2id[item] for item in label]
     def _normalize_class_labels(
@@ -225,33 +154,10 @@ class JiTPipeline(DiffusionPipeline):
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
-        r"""
-        Generate class-conditional images.
-        Args:
-            class_labels (`int`, `str`, `list[int]`, or `list[str]`):
-                ImageNet class indices or human-readable English label strings.
-            guidance_scale (`float`, *optional*):
-                Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
-            guidance_interval_min (`float`, defaults to `0.1`):
-                Lower bound of the CFG interval in flow time `t in [0, 1]`.
-            guidance_interval_max (`float`, defaults to `1.0`):
-                Upper bound of the CFG interval in flow time.
-            noise_scale (`float`, *optional*):
-                Initial Gaussian noise scale (`1.0` for 256px, `2.0` for 512px by default).
-            t_eps (`float`, defaults to `5e-2`):
-                Epsilon clamp for the `1 - t` denominator, matching JiT source defaults.
-            generator (`torch.Generator`, *optional*):
-                RNG for reproducibility.
-            num_inference_steps (`int`, defaults to `50`):
-                Number of solver steps (at least 2).
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                `"pil"`, `"np"`, or `"pt"`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Return [`ImagePipelineOutput`] if True.
-        """
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
@@ -268,22 +174,21 @@ class JiTPipeline(DiffusionPipeline):
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
-        null_class_val = int(self.transformer.config.num_classes)
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
-        latents = (
-            randn_tensor(
                 shape=(batch_size, channels, height, width),
-                generator=generator,
-                device=self._execution_device,
-                dtype=self.transformer.dtype,
-            )
-            * noise_scale
-        )
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
@@ -295,6 +200,7 @@ class JiTPipeline(DiffusionPipeline):
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
@@ -329,7 +235,7 @@ class JiTPipeline(DiffusionPipeline):
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
-            latents = self.scheduler.step(model_output, t, latents).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
@@ -344,3 +250,5 @@ class JiTPipeline(DiffusionPipeline):
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)

+"""Hub custom pipeline: JiTPipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+import inspect
 import json
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union, Any
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator=None,
+        eta: float | None = None,
+    ):
+        """Collect the optional keyword arguments that `scheduler.step` declares.
+        Args:
+            scheduler: Object whose `step` signature is inspected.
+            generator: Forwarded under the key `"generator"` (even when `None`) if `step`
+                has a parameter of that name.
+            eta: Forwarded under the key `"eta"` only when it is not `None` and `step`
+                has a parameter of that name.
+        Returns:
+            Dict of keyword arguments suitable for `scheduler.step(..., **kwargs)`.
+        """
+        accepted = inspect.signature(scheduler.step).parameters
+        extra = {}
+        if "generator" in accepted:
+            extra["generator"] = generator
+        if "eta" in accepted and eta is not None:
+            extra["eta"] = eta
+        return extra
+    model_cpu_offload_seq = "transformer"
     def __init__(
         self,
         transformer,
+        scheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
+        """Register the pipeline modules and initialise the label lookup tables.
+        Args:
+            transformer: Model registered under the name `transformer`.
+            scheduler: Scheduler registered under the name `scheduler`. When `None`,
+                a `FlowMatchHeunDiscreteScheduler(shift=4.0)` is created instead.
+            id2label: Optional class-id to label mapping; keys may be ints or strings.
+        """
         super().__init__()
+        if scheduler is None:
+            # The scheduler class is not imported at module level in this file, so the
+            # fallback imports it locally; otherwise this branch raises NameError.
+            from diffusers.schedulers import FlowMatchHeunDiscreteScheduler
+            scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
+        # Explicitly supplied labels make the lazy lookup in `_ensure_labels_loaded` a no-op.
+        self._labels_loaded_from_model_index = bool(self._id2label)
+    def _ensure_labels_loaded(self) -> None:
+        """Fill the label tables from `model_index.json` the first time they are needed.
+        The directory is taken from `self.config._name_or_path` when that attribute exists.
+        A non-empty result replaces `self._id2label` and rebuilds `self.labels`.
+        """
+        if not self._labels_loaded_from_model_index:
+            source_dir = getattr(self.config, "_name_or_path", None)
+            id2label = self._read_id2label_from_model_index(source_dir)
+            if id2label:
+                self._id2label = id2label
+                self.labels = self._build_label2id(id2label)
+            # Set the flag even on a miss so the file lookup is attempted only once.
+            self._labels_loaded_from_model_index = True
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        variant_dir = Path(variant_path).resolve()
+        model_index_path = variant_dir / "model_index.json"
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
     @property
     def id2label(self) -> Dict[int, str]:
+        """Class id to label mapping; runs the lazy label load before returning it."""
+        self._ensure_labels_loaded()
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        self._ensure_labels_loaded()
         label2id = self.labels
         if not label2id:
+            raise ValueError(
+                "No English labels loaded. Ensure `id2label` exists in model_index.json."
+            )
         if isinstance(label, str):
             label = [label]
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
         return [label2id[item] for item in label]
     def _normalize_class_labels(
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
+        if output_type not in {"pil", "np", "pt"}:
+            raise ValueError("output_type must be one of: 'pil', 'np', 'pt'.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
+        null_class_val = int(
+            getattr(self.transformer.config, "num_classes", getattr(self.transformer.config, "num_class_embeds", 1000))
+        )
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
+        latents = randn_tensor(
                 shape=(batch_size, channels, height, width),
+            generator=generator,
+            device=self._execution_device,
+            dtype=self.transformer.dtype,
+        ) * noise_scale
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
+            latents = self.scheduler.step(model_output, t, latents, **extra_step_kwargs).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)
+JiTPipelineOutput = ImagePipelineOutput

JiT-L-16/pipeline.py CHANGED Viewed

@@ -1,36 +1,24 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib
 import json
-import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import FlowMatchHeunDiscreteScheduler, KarrasDiffusionSchedulers
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
@@ -44,100 +32,43 @@ class JiTPipeline(DiffusionPipeline):
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
-    model_cpu_offload_seq = "transformer"
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
-        """Load a self-contained variant folder locally or from the Hub.
-        Examples:
-            JiTPipeline.from_pretrained(".")
-            JiTPipeline.from_pretrained("./JiT-H-32")
-            DiffusionPipeline.from_pretrained("BiliSakura/JiT-diffusers", subfolder="JiT-H-32", trust_remote_code=True)
-        """
-        repo_root = Path(__file__).resolve().parent
-        if pretrained_model_name_or_path in (None, "", "."):
-            variant = repo_root
-        elif (
-            isinstance(pretrained_model_name_or_path, str)
-            and "/" in pretrained_model_name_or_path
-            and not Path(pretrained_model_name_or_path).exists()
-        ):
-            from huggingface_hub import snapshot_download
-            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
-            if subfolder:
-                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
-            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
-            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
-        else:
-            variant = Path(pretrained_model_name_or_path)
-            if not variant.is_absolute():
-                candidate = (Path.cwd() / variant).resolve()
-                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
-            if subfolder:
-                variant = variant / subfolder
-        id2label_override = kwargs.pop("id2label", None)
-        model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        try:
-            transformer = _load_component("transformer", "jit_transformer_2d", "JiTTransformer2DModel")
-            try:
-                scheduler = FlowMatchHeunDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
-            except Exception:
-                scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
-            if transformer is None:
-                raise ValueError(f"No loadable transformer found under {variant}")
-            variant_path = str(variant)
-            model_index_path = variant / "model_index.json"
-            id2label = id2label_override or cls._read_id2label_from_model_index(model_index_path)
-            pipe = cls(
-                transformer=transformer,
-                scheduler=scheduler,
-                id2label=id2label,
-            )
-            if variant_path and hasattr(pipe, "register_to_config"):
-                pipe.register_to_config(_name_or_path=variant_path)
-            return pipe
-        finally:
-            for comp_path in inserted:
-                if comp_path in sys.path:
-                    sys.path.remove(comp_path)
     def __init__(
         self,
         transformer,
-        scheduler: FlowMatchHeunDiscreteScheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         scheduler = scheduler or FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
@@ -146,7 +77,11 @@ class JiTPipeline(DiffusionPipeline):
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
-    def _read_id2label_from_model_index(model_index_path: Path) -> Dict[int, str]:
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
@@ -167,20 +102,16 @@ class JiTPipeline(DiffusionPipeline):
     @property
     def id2label(self) -> Dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-        Map ImageNet label strings to class ids.
-        Args:
-            label (`str` or `list[str]`):
-                One or more English label strings. Each string must match a synonym in `id2label`.
-        """
         label2id = self.labels
         if not label2id:
-            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
         if isinstance(label, str):
             label = [label]
@@ -188,9 +119,7 @@ class JiTPipeline(DiffusionPipeline):
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown English label(s): {missing}. Example valid labels: {preview}, ..."
-            )
         return [label2id[item] for item in label]
     def _normalize_class_labels(
@@ -225,33 +154,10 @@ class JiTPipeline(DiffusionPipeline):
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
-        r"""
-        Generate class-conditional images.
-        Args:
-            class_labels (`int`, `str`, `list[int]`, or `list[str]`):
-                ImageNet class indices or human-readable English label strings.
-            guidance_scale (`float`, *optional*):
-                Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
-            guidance_interval_min (`float`, defaults to `0.1`):
-                Lower bound of the CFG interval in flow time `t in [0, 1]`.
-            guidance_interval_max (`float`, defaults to `1.0`):
-                Upper bound of the CFG interval in flow time.
-            noise_scale (`float`, *optional*):
-                Initial Gaussian noise scale (`1.0` for 256px, `2.0` for 512px by default).
-            t_eps (`float`, defaults to `5e-2`):
-                Epsilon clamp for the `1 - t` denominator, matching JiT source defaults.
-            generator (`torch.Generator`, *optional*):
-                RNG for reproducibility.
-            num_inference_steps (`int`, defaults to `50`):
-                Number of solver steps (at least 2).
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                `"pil"`, `"np"`, or `"pt"`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Return [`ImagePipelineOutput`] if True.
-        """
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
@@ -268,22 +174,21 @@ class JiTPipeline(DiffusionPipeline):
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
-        null_class_val = int(self.transformer.config.num_classes)
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
-        latents = (
-            randn_tensor(
                 shape=(batch_size, channels, height, width),
-                generator=generator,
-                device=self._execution_device,
-                dtype=self.transformer.dtype,
-            )
-            * noise_scale
-        )
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
@@ -295,6 +200,7 @@ class JiTPipeline(DiffusionPipeline):
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
@@ -329,7 +235,7 @@ class JiTPipeline(DiffusionPipeline):
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
-            latents = self.scheduler.step(model_output, t, latents).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
@@ -344,3 +250,5 @@ class JiTPipeline(DiffusionPipeline):
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)

+"""Hub custom pipeline: JiTPipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+import inspect
 import json
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union, Any
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from diffusers.utils.torch_utils import randn_tensor
+# Default initial-noise scale, looked up by max(height, width) when the caller
+# passes noise_scale=None; sizes not listed here fall back to 1.0 at the call site.
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator=None,
+        eta: float | None = None,
+    ):
+        """Build the optional keyword arguments to pass to ``scheduler.step``.
+
+        Only names present in the signature of ``scheduler.step`` are returned.
+
+        Args:
+            scheduler: Object whose ``step`` signature is inspected.
+            generator: Forwarded under ``"generator"`` (even when ``None``) if ``step`` accepts it.
+            eta: Forwarded under ``"eta"`` only when it is not ``None`` and ``step`` accepts it.
+
+        Returns:
+            A dict holding at most the keys ``"generator"`` and ``"eta"``.
+        """
+        accepted = inspect.signature(scheduler.step).parameters
+        # (value, should_offer) per candidate; insertion order fixes the output key order.
+        offered = {
+            "generator": (generator, True),
+            "eta": (eta, eta is not None),
+        }
+        return {
+            name: value
+            for name, (value, should_offer) in offered.items()
+            if should_offer and name in accepted
+        }
+    model_cpu_offload_seq = "transformer"
     def __init__(
         self,
         transformer,
+        scheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
+        """Register the pipeline components and set up the label lookup tables.
+
+        Args:
+            transformer: Model registered under the name ``transformer``.
+            scheduler: Scheduler registered under the name ``scheduler``. A falsy
+                value is replaced by ``FlowMatchHeunDiscreteScheduler(shift=4.0)``.
+            id2label: Optional class-id to label mapping. When it normalizes to an
+                empty mapping, ``_ensure_labels_loaded`` later tries model_index.json.
+        """
         super().__init__()
+        if not scheduler:
+            # The module no longer imports FlowMatchHeunDiscreteScheduler at top
+            # level, so resolve it here; otherwise this fallback raises NameError.
+            from diffusers.schedulers import FlowMatchHeunDiscreteScheduler
+
+            scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
+        # True when labels were supplied up front, which disables the lazy read.
+        self._labels_loaded_from_model_index = bool(self._id2label)
+    def _ensure_labels_loaded(self) -> None:
+        """Read labels from model_index.json once, if none were supplied at init.
+
+        The attempt is recorded whether or not labels were found, so later calls
+        return immediately without touching the filesystem again.
+        """
+        if not self._labels_loaded_from_model_index:
+            source_path = getattr(self.config, "_name_or_path", None)
+            recovered = self._read_id2label_from_model_index(source_path)
+            if recovered:
+                self._id2label = recovered
+                self.labels = self._build_label2id(recovered)
+            self._labels_loaded_from_model_index = True
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        variant_dir = Path(variant_path).resolve()
+        model_index_path = variant_dir / "model_index.json"
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
     @property
     def id2label(self) -> Dict[int, str]:
+        """ImageNet class id to English label mapping (values may hold comma-separated synonyms)."""
+        self._ensure_labels_loaded()
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        """Map English label strings to class ids.
+
+        Args:
+            label: One label string or a list of label strings; each must be a key
+                of ``self.labels``.
+
+        Returns:
+            The class ids, in the same order as the given labels.
+
+        Raises:
+            ValueError: If no labels are loaded, or a given label is not a key of
+                ``self.labels``.
+        """
+        self._ensure_labels_loaded()
         label2id = self.labels
         if not label2id:
+            raise ValueError(
+                "No English labels loaded. Ensure `id2label` exists in model_index.json."
+            )
+        # A bare string is treated as a one-element list.
         if isinstance(label, str):
             label = [label]
         missing = [item for item in label if item not in label2id]
         if missing:
+            # Show a few valid keys so the caller can see the expected spelling.
             preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
         return [label2id[item] for item in label]
     def _normalize_class_labels(
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
+        if output_type not in {"pil", "np", "pt"}:
+            raise ValueError("output_type must be one of: 'pil', 'np', 'pt'.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
+        null_class_val = int(
+            getattr(self.transformer.config, "num_classes", getattr(self.transformer.config, "num_class_embeds", 1000))
+        )
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
+        latents = randn_tensor(
                 shape=(batch_size, channels, height, width),
+            generator=generator,
+            device=self._execution_device,
+            dtype=self.transformer.dtype,
+        ) * noise_scale
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
+            latents = self.scheduler.step(model_output, t, latents, **extra_step_kwargs).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)
+# Pipeline-specific alias; the pipeline itself returns plain ImagePipelineOutput objects.
+JiTPipelineOutput = ImagePipelineOutput

JiT-L-32/pipeline.py CHANGED Viewed

@@ -1,36 +1,24 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib
 import json
-import sys
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import FlowMatchHeunDiscreteScheduler, KarrasDiffusionSchedulers
 from diffusers.utils.torch_utils import randn_tensor
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
@@ -44,100 +32,43 @@ class JiTPipeline(DiffusionPipeline):
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
-    model_cpu_offload_seq = "transformer"
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
-        """Load a self-contained variant folder locally or from the Hub.
-        Examples:
-            JiTPipeline.from_pretrained(".")
-            JiTPipeline.from_pretrained("./JiT-H-32")
-            DiffusionPipeline.from_pretrained("BiliSakura/JiT-diffusers", subfolder="JiT-H-32", trust_remote_code=True)
-        """
-        repo_root = Path(__file__).resolve().parent
-        if pretrained_model_name_or_path in (None, "", "."):
-            variant = repo_root
-        elif (
-            isinstance(pretrained_model_name_or_path, str)
-            and "/" in pretrained_model_name_or_path
-            and not Path(pretrained_model_name_or_path).exists()
-        ):
-            from huggingface_hub import snapshot_download
-            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
-            if subfolder:
-                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
-            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
-            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
-        else:
-            variant = Path(pretrained_model_name_or_path)
-            if not variant.is_absolute():
-                candidate = (Path.cwd() / variant).resolve()
-                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
-            if subfolder:
-                variant = variant / subfolder
-        id2label_override = kwargs.pop("id2label", None)
-        model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        try:
-            transformer = _load_component("transformer", "jit_transformer_2d", "JiTTransformer2DModel")
-            try:
-                scheduler = FlowMatchHeunDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
-            except Exception:
-                scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
-            if transformer is None:
-                raise ValueError(f"No loadable transformer found under {variant}")
-            variant_path = str(variant)
-            model_index_path = variant / "model_index.json"
-            id2label = id2label_override or cls._read_id2label_from_model_index(model_index_path)
-            pipe = cls(
-                transformer=transformer,
-                scheduler=scheduler,
-                id2label=id2label,
-            )
-            if variant_path and hasattr(pipe, "register_to_config"):
-                pipe.register_to_config(_name_or_path=variant_path)
-            return pipe
-        finally:
-            for comp_path in inserted:
-                if comp_path in sys.path:
-                    sys.path.remove(comp_path)
     def __init__(
         self,
         transformer,
-        scheduler: FlowMatchHeunDiscreteScheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         super().__init__()
         scheduler = scheduler or FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
@@ -146,7 +77,11 @@ class JiTPipeline(DiffusionPipeline):
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
-    def _read_id2label_from_model_index(model_index_path: Path) -> Dict[int, str]:
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
@@ -167,20 +102,16 @@ class JiTPipeline(DiffusionPipeline):
     @property
     def id2label(self) -> Dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-        Map ImageNet label strings to class ids.
-        Args:
-            label (`str` or `list[str]`):
-                One or more English label strings. Each string must match a synonym in `id2label`.
-        """
         label2id = self.labels
         if not label2id:
-            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
         if isinstance(label, str):
             label = [label]
@@ -188,9 +119,7 @@ class JiTPipeline(DiffusionPipeline):
         missing = [item for item in label if item not in label2id]
         if missing:
             preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown English label(s): {missing}. Example valid labels: {preview}, ..."
-            )
         return [label2id[item] for item in label]
     def _normalize_class_labels(
@@ -225,33 +154,10 @@ class JiTPipeline(DiffusionPipeline):
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
-        r"""
-        Generate class-conditional images.
-        Args:
-            class_labels (`int`, `str`, `list[int]`, or `list[str]`):
-                ImageNet class indices or human-readable English label strings.
-            guidance_scale (`float`, *optional*):
-                Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
-            guidance_interval_min (`float`, defaults to `0.1`):
-                Lower bound of the CFG interval in flow time `t in [0, 1]`.
-            guidance_interval_max (`float`, defaults to `1.0`):
-                Upper bound of the CFG interval in flow time.
-            noise_scale (`float`, *optional*):
-                Initial Gaussian noise scale (`1.0` for 256px, `2.0` for 512px by default).
-            t_eps (`float`, defaults to `5e-2`):
-                Epsilon clamp for the `1 - t` denominator, matching JiT source defaults.
-            generator (`torch.Generator`, *optional*):
-                RNG for reproducibility.
-            num_inference_steps (`int`, defaults to `50`):
-                Number of solver steps (at least 2).
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                `"pil"`, `"np"`, or `"pt"`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Return [`ImagePipelineOutput`] if True.
-        """
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
@@ -268,22 +174,21 @@ class JiTPipeline(DiffusionPipeline):
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
-        null_class_val = int(self.transformer.config.num_classes)
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
-        latents = (
-            randn_tensor(
                 shape=(batch_size, channels, height, width),
-                generator=generator,
-                device=self._execution_device,
-                dtype=self.transformer.dtype,
-            )
-            * noise_scale
-        )
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
@@ -295,6 +200,7 @@ class JiTPipeline(DiffusionPipeline):
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
@@ -329,7 +235,7 @@ class JiTPipeline(DiffusionPipeline):
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
-            latents = self.scheduler.step(model_output, t, latents).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
@@ -344,3 +250,5 @@ class JiTPipeline(DiffusionPipeline):
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)

+"""Hub custom pipeline: JiTPipeline.
+Load with native Hugging Face diffusers and trust_remote_code=True.
+"""
+from __future__ import annotations
+import inspect
 import json
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union, Any
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from diffusers.utils.torch_utils import randn_tensor
 # Default noise scale keyed by image size in pixels. The pipeline call looks
 # this up with max(height, width) when the caller passes noise_scale=None;
 # sizes that are not listed fall back to 1.0 at the lookup site.
 RECOMMENDED_NOISE_BY_SIZE = {
     256: 1.0,
     512: 2.0,
 }
 class JiTPipeline(DiffusionPipeline):
     r"""
     Pipeline for image generation using JiT (Just image Transformer).
             ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
     """
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler,
+        generator=None,
+        eta: float | None = None,
+    ):
         # Build the optional keyword arguments to pass to `scheduler.step`.
         #
         # The signature of `scheduler.step` is inspected so that only keywords
         # it actually declares are returned; callers can then splat the result
         # into `step(...)` for any scheduler without a TypeError.
         #
         # Args:
         #     scheduler: object exposing a `step` callable.
         #     generator: value forwarded under the "generator" key.
         #     eta: value forwarded under the "eta" key when it is not None.
         # Returns:
         #     dict with at most the keys "generator" and "eta".
+        kwargs = {}
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())
         # "generator" is forwarded whenever step() accepts it, even if it is None.
+        if "generator" in step_params:
+            kwargs["generator"] = generator
         # "eta" is forwarded only when the caller supplied one and step() accepts it.
+        if eta is not None and "eta" in step_params:
+            kwargs["eta"] = eta
+        return kwargs
+    model_cpu_offload_seq = "transformer"
     def __init__(
         self,
         transformer,
         scheduler,
         id2label: Optional[Dict[Union[int, str], str]] = None,
     ):
         """Register the pipeline components and set up the class-label lookups.
         Args:
             transformer: the model registered under the name ``transformer``.
             scheduler: the scheduler registered under the name ``scheduler``.
                 When ``None``, a ``FlowMatchHeunDiscreteScheduler(shift=4.0)``
                 is created and used instead.
             id2label: optional mapping from class id to label text; it is
                 passed through ``_normalize_id2label`` before being stored.
         """
         super().__init__()
         # Test for None explicitly: a truthiness test (`scheduler or ...`) would
         # also discard a valid scheduler object that happens to evaluate falsy.
         if scheduler is None:
             # Imported here because this module does not import the scheduler
             # class at top level; without it the fallback raises NameError.
             from diffusers.schedulers import FlowMatchHeunDiscreteScheduler
             scheduler = FlowMatchHeunDiscreteScheduler(shift=4.0)
         self.register_modules(transformer=transformer, scheduler=scheduler)
         self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
         # A mapping supplied up front counts as already loaded, so the lazy
         # model_index.json lookup in `_ensure_labels_loaded` is skipped.
         self._labels_loaded_from_model_index = bool(self._id2label)
+    def _ensure_labels_loaded(self) -> None:
         # Lazily populate the label mappings, at most once per instance.
         #
         # If labels were not already available, asks
         # `_read_id2label_from_model_index` for a mapping using the
         # `_name_or_path` attribute of `self.config` (None when absent) and,
         # when one comes back non-empty, replaces `self._id2label` and rebuilds
         # `self.labels` from it.
+        if self._labels_loaded_from_model_index:
+            return
+        loaded = self._read_id2label_from_model_index(getattr(self.config, "_name_or_path", None))
+        if loaded:
+            self._id2label = loaded
+            self.labels = self._build_label2id(self._id2label)
         # Set the flag even when nothing was found so the lookup is not retried
         # on every subsequent access.
+        self._labels_loaded_from_model_index = True
     @staticmethod
     def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
         return {int(key): value for key, value in id2label.items()}
     @staticmethod
+    def _read_id2label_from_model_index(variant_path: Optional[str]) -> Dict[int, str]:
+        if not variant_path:
+            return {}
+        variant_dir = Path(variant_path).resolve()
+        model_index_path = variant_dir / "model_index.json"
         if not model_index_path.exists():
             return {}
         raw = json.loads(model_index_path.read_text(encoding="utf-8"))
     @property
     def id2label(self) -> Dict[int, str]:
         # Read-only access to the stored id -> label mapping. The lazy loader
         # runs first so the mapping is populated before it is returned.
+        self._ensure_labels_loaded()
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
         # Translate one label string, or a list of them, into the values
         # stored for them in `self.labels`.
         #
         # Args:
         #     label: a single label or a list of labels; a bare string is
         #         treated as a one-element list.
         # Returns:
         #     One entry per requested label, in the order given.
         # Raises:
         #     ValueError: if no label mapping is available, or if any
         #         requested label is not a key of the mapping.
+        self._ensure_labels_loaded()
         label2id = self.labels
         if not label2id:
+            raise ValueError(
+                "No English labels loaded. Ensure `id2label` exists in model_index.json."
+            )
         if isinstance(label, str):
             label = [label]
         # Collect every unknown label so the error reports all of them at once.
         missing = [item for item in label if item not in label2id]
         if missing:
             # Show only the first 8 known labels as a hint, not the full list.
             preview = ", ".join(list(label2id.keys())[:8])
+            raise ValueError(f"Unknown English label(s): {missing}. Example valid labels: {preview}, ...")
         return [label2id[item] for item in label]
     def _normalize_class_labels(
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
     ) -> Union[ImagePipelineOutput, Tuple]:
         if num_inference_steps < 2:
             raise ValueError("num_inference_steps must be >= 2.")
+        if output_type not in {"pil", "np", "pt"}:
+            raise ValueError("output_type must be one of: 'pil', 'np', 'pt'.")
         class_label_ids = self._normalize_class_labels(class_labels)
         do_classifier_free_guidance = guidance_scale is not None and guidance_scale > 1.0
                 f"height and width must be divisible by patch_size={patch_size}. Got {(height, width)}."
             )
         channels = int(self.transformer.config.in_channels)
+        null_class_val = int(
+            getattr(self.transformer.config, "num_classes", getattr(self.transformer.config, "num_class_embeds", 1000))
+        )
         if guidance_scale is None:
             guidance_scale = 1.0
         if noise_scale is None:
             noise_scale = RECOMMENDED_NOISE_BY_SIZE.get(max(height, width), 1.0)
+        latents = randn_tensor(
                 shape=(batch_size, channels, height, width),
+            generator=generator,
+            device=self._execution_device,
+            dtype=self.transformer.dtype,
+        ) * noise_scale
         class_labels_t = torch.tensor(class_label_ids, device=self._execution_device, dtype=torch.long).reshape(-1)
         class_labels_t = class_labels_t.clamp(0, null_class_val - 1)
             class_labels_input = class_labels_t
         self.scheduler.set_timesteps(num_inference_steps, device=self._execution_device)
+        extra_step_kwargs = self.prepare_extra_step_kwargs(self.scheduler, generator=generator)
         for t in self.progress_bar(self.scheduler.timesteps):
             step_index = self.scheduler.index_for_timestep(t, self.scheduler.timesteps)
             sigma = self.scheduler.sigmas[step_index].to(device=latents.device, dtype=latents.dtype)
             sigma = sigma.reshape(*([1] * (latents.ndim - 1)))
             # JiT predicts x0; scheduler integrates in sigma space: dz/dsigma = -(x0 - z) / sigma.
             model_output = -(x_pred - latents) / sigma
+            latents = self.scheduler.step(model_output, t, latents, **extra_step_kwargs).prev_sample
         images_pt = ((latents.float().clamp(-1, 1) + 1.0) / 2.0).cpu()
         if output_type == "pt":
         if not return_dict:
             return (images,)
         return ImagePipelineOutput(images=images)
+JiTPipelineOutput = ImagePipelineOutput  # Module-level alias: both names refer to the same class.