Instructions to use BiliSakura/NiT-diffusers with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use BiliSakura/NiT-diffusers with Diffusers:
```bash
pip install -U diffusers transformers accelerate
```

```python
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("BiliSakura/NiT-diffusers", dtype=torch.bfloat16, device_map="cuda")
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
```
- Notebooks
- Google Colab
- Kaggle
Fix generator determinism: forward generator through scheduler steps and seeded noise
Browse files
- NiT-B/pipeline.py +41 -126
- NiT-L/pipeline.py +37 -122
- NiT-S/pipeline.py +41 -126
- NiT-XL/pipeline.py +36 -121
NiT-B/pipeline.py
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
| 2 |
#
|
| 3 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -14,27 +22,24 @@
|
|
| 14 |
|
| 15 |
import json
|
| 16 |
from pathlib import Path
|
| 17 |
-
from typing import Dict, List, Optional, Tuple, Union
|
| 18 |
|
| 19 |
import torch
|
| 20 |
|
| 21 |
from diffusers.image_processor import VaeImageProcessor
|
| 22 |
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
|
| 23 |
-
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
| 24 |
from diffusers.utils.torch_utils import randn_tensor
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
DEFAULT_NATIVE_RESOLUTION = 256
|
| 29 |
|
| 30 |
EXAMPLE_DOC_STRING = """
|
| 31 |
Examples:
|
| 32 |
```py
|
| 33 |
>>> from pathlib import Path
|
| 34 |
-
>>> import torch
|
| 35 |
>>> from diffusers import DiffusionPipeline
|
|
|
|
| 36 |
|
| 37 |
-
>>> model_dir = Path("./NiT-
|
| 38 |
>>> pipe = DiffusionPipeline.from_pretrained(
|
| 39 |
... str(model_dir),
|
| 40 |
... local_files_only=True,
|
|
@@ -50,39 +55,45 @@ EXAMPLE_DOC_STRING = """
|
|
| 50 |
>>> generator = torch.Generator(device="cuda").manual_seed(42)
|
| 51 |
>>> image = pipe(
|
| 52 |
... class_labels="golden retriever",
|
| 53 |
-
... height=
|
| 54 |
-
... width=
|
| 55 |
... num_inference_steps=250,
|
| 56 |
-
... guidance_scale=2.
|
| 57 |
... guidance_interval=(0.0, 0.7),
|
| 58 |
... generator=generator,
|
| 59 |
... ).images[0]
|
| 60 |
-
>>> image.save("demo.png")
|
| 61 |
```
|
| 62 |
"""
|
| 63 |
|
| 64 |
-
|
| 65 |
class NiTPipeline(DiffusionPipeline):
|
| 66 |
r"""
|
| 67 |
Pipeline for native-resolution class-conditional image generation with NiT.
|
| 68 |
|
| 69 |
-
Uses the native [`FlowMatchEulerDiscreteScheduler`] in deterministic (ODE) mode.
|
| 70 |
-
The official NiT repo defaults to an Euler-Maruyama SDE sampler for 512×512; that SDE is
|
| 71 |
-
not the same as the scheduler's `stochastic_sampling` path, so keep
|
| 72 |
-
`scheduler.config.stochastic_sampling=False` and let the scheduler perform the ODE update
|
| 73 |
-
`x_{t+dt} = x_t + dt * v`.
|
| 74 |
-
|
| 75 |
Parameters:
|
| 76 |
transformer ([`NiTTransformer2DModel`]):
|
| 77 |
Class-conditional transformer that predicts flow-matching velocity in packed latent space.
|
| 78 |
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
| 79 |
-
|
| 80 |
vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
|
| 81 |
Variational autoencoder used to decode packed transformer latents to pixels.
|
| 82 |
id2label (`dict[int, str]`, *optional*):
|
| 83 |
ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
|
| 84 |
"""
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
model_cpu_offload_seq = "transformer->vae"
|
| 87 |
_optional_components = ["vae"]
|
| 88 |
|
|
@@ -100,95 +111,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 100 |
self.labels = self._build_label2id(self._id2label)
|
| 101 |
self._labels_loaded_from_model_index = bool(self._id2label)
|
| 102 |
|
| 103 |
-
@classmethod
|
| 104 |
-
def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
|
| 105 |
-
"""Load a self-contained variant folder locally or from the Hub."""
|
| 106 |
-
import importlib
|
| 107 |
-
import sys
|
| 108 |
-
|
| 109 |
-
repo_root = Path(__file__).resolve().parent
|
| 110 |
-
|
| 111 |
-
if pretrained_model_name_or_path in (None, "", "."):
|
| 112 |
-
variant = repo_root
|
| 113 |
-
elif (
|
| 114 |
-
isinstance(pretrained_model_name_or_path, str)
|
| 115 |
-
and "/" in pretrained_model_name_or_path
|
| 116 |
-
and not Path(pretrained_model_name_or_path).exists()
|
| 117 |
-
):
|
| 118 |
-
from huggingface_hub import snapshot_download
|
| 119 |
-
|
| 120 |
-
hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
|
| 121 |
-
if subfolder:
|
| 122 |
-
hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
|
| 123 |
-
cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
|
| 124 |
-
variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
|
| 125 |
-
else:
|
| 126 |
-
variant = Path(pretrained_model_name_or_path)
|
| 127 |
-
if not variant.is_absolute():
|
| 128 |
-
candidate = (Path.cwd() / variant).resolve()
|
| 129 |
-
variant = candidate if candidate.exists() else (repo_root / variant).resolve()
|
| 130 |
-
if subfolder:
|
| 131 |
-
variant = variant / subfolder
|
| 132 |
-
|
| 133 |
-
id2label_override = kwargs.pop("id2label", None)
|
| 134 |
-
model_kwargs = dict(kwargs)
|
| 135 |
-
inserted: List[str] = []
|
| 136 |
-
|
| 137 |
-
def _load_component(folder: str, module_name: str, class_name: str):
|
| 138 |
-
comp_dir = variant / folder
|
| 139 |
-
module_path = comp_dir / f"{module_name}.py"
|
| 140 |
-
has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
|
| 141 |
-
if not module_path.exists() or not has_weights:
|
| 142 |
-
return None
|
| 143 |
-
|
| 144 |
-
comp_path = str(comp_dir)
|
| 145 |
-
if comp_path not in sys.path:
|
| 146 |
-
sys.path.insert(0, comp_path)
|
| 147 |
-
inserted.append(comp_path)
|
| 148 |
-
|
| 149 |
-
module = importlib.import_module(module_name)
|
| 150 |
-
component_cls = getattr(module, class_name)
|
| 151 |
-
return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
|
| 152 |
-
|
| 153 |
-
try:
|
| 154 |
-
transformer = _load_component("transformer", "nit_transformer_2d", "NiTTransformer2DModel")
|
| 155 |
-
try:
|
| 156 |
-
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
|
| 157 |
-
except Exception:
|
| 158 |
-
scheduler = FlowMatchEulerDiscreteScheduler(
|
| 159 |
-
num_train_timesteps=1000,
|
| 160 |
-
shift=1.0,
|
| 161 |
-
stochastic_sampling=False,
|
| 162 |
-
)
|
| 163 |
-
if transformer is None:
|
| 164 |
-
raise ValueError(f"No loadable transformer found under {variant}")
|
| 165 |
-
|
| 166 |
-
vae = None
|
| 167 |
-
vae_dir = variant / "vae"
|
| 168 |
-
if vae_dir.exists() and (vae_dir / "config.json").exists():
|
| 169 |
-
from diffusers import AutoencoderDC, AutoencoderKL
|
| 170 |
-
|
| 171 |
-
vae_class_name = json.loads((vae_dir / "config.json").read_text(encoding="utf-8")).get(
|
| 172 |
-
"_class_name", "AutoencoderDC"
|
| 173 |
-
)
|
| 174 |
-
vae_cls = AutoencoderDC if vae_class_name == "AutoencoderDC" else AutoencoderKL
|
| 175 |
-
vae = vae_cls.from_pretrained(str(vae_dir), **model_kwargs)
|
| 176 |
-
|
| 177 |
-
id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
|
| 178 |
-
pipe = cls(
|
| 179 |
-
transformer=transformer,
|
| 180 |
-
scheduler=scheduler,
|
| 181 |
-
vae=vae,
|
| 182 |
-
id2label=id2label,
|
| 183 |
-
)
|
| 184 |
-
if hasattr(pipe, "register_to_config"):
|
| 185 |
-
pipe.register_to_config(_name_or_path=str(variant))
|
| 186 |
-
return pipe
|
| 187 |
-
finally:
|
| 188 |
-
for comp_path in inserted:
|
| 189 |
-
if comp_path in sys.path:
|
| 190 |
-
sys.path.remove(comp_path)
|
| 191 |
-
|
| 192 |
def _ensure_labels_loaded(self) -> None:
|
| 193 |
if self._labels_loaded_from_model_index:
|
| 194 |
return
|
|
@@ -339,11 +261,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 339 |
)
|
| 340 |
return packed_latents, image_sizes
|
| 341 |
|
| 342 |
-
@staticmethod
|
| 343 |
-
def _flow_time_from_scheduler_timestep(timestep: torch.Tensor, num_train_timesteps: int) -> float:
|
| 344 |
-
"""Map native scheduler timesteps (sigma * num_train_timesteps) to NiT flow time in [0, 1]."""
|
| 345 |
-
return float(timestep) / num_train_timesteps
|
| 346 |
-
|
| 347 |
def _apply_classifier_free_guidance(
|
| 348 |
self,
|
| 349 |
model_output: torch.Tensor,
|
|
@@ -407,8 +324,7 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 407 |
guidance_scale (`float`, defaults to `1.0`):
|
| 408 |
Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
|
| 409 |
guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
|
| 410 |
-
Flow-time interval where CFG is applied.
|
| 411 |
-
`timestep / num_train_timesteps`, matching the official NiT ODE sampler.
|
| 412 |
generator (`torch.Generator`, *optional*):
|
| 413 |
RNG for reproducibility.
|
| 414 |
output_type (`str`, defaults to `"pil"`):
|
|
@@ -421,14 +337,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 421 |
width = int(width or default_size)
|
| 422 |
self.check_inputs(height, width, num_inference_steps, output_type)
|
| 423 |
|
| 424 |
-
if getattr(self.scheduler.config, "stochastic_sampling", False):
|
| 425 |
-
raise ValueError(
|
| 426 |
-
"NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
|
| 427 |
-
"(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
|
| 428 |
-
"path uses a different update rule than the official NiT Euler-Maruyama SDE and "
|
| 429 |
-
"produces salt-and-pepper noise."
|
| 430 |
-
)
|
| 431 |
-
|
| 432 |
device = self._execution_device
|
| 433 |
model_dtype = next(self.transformer.parameters()).dtype
|
| 434 |
class_labels_tensor = self._normalize_class_labels(class_labels)
|
|
@@ -440,11 +348,19 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 440 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 441 |
num_train_timesteps = self.scheduler.config.num_train_timesteps
|
| 442 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
|
| 444 |
guidance_low, guidance_high = guidance_interval
|
| 445 |
|
| 446 |
for t in self.progress_bar(self.scheduler.timesteps):
|
| 447 |
-
flow_time =
|
| 448 |
guidance_active = guidance_low <= flow_time <= guidance_high
|
| 449 |
if guidance_scale > 1.0 and guidance_active:
|
| 450 |
model_input = torch.cat([packed_latents, packed_latents], dim=0)
|
|
@@ -479,5 +395,4 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 479 |
return (image,)
|
| 480 |
return ImagePipelineOutput(images=image)
|
| 481 |
|
| 482 |
-
|
| 483 |
-
NiTPipelineOutput = ImagePipelineOutput
|
|
|
|
| 1 |
+
"""Hub custom pipeline: NiTPipeline.
|
| 2 |
+
Load with native Hugging Face diffusers and trust_remote_code=True.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import inspect
|
| 8 |
+
|
| 9 |
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
| 10 |
#
|
| 11 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
| 22 |
|
| 23 |
import json
|
| 24 |
from pathlib import Path
|
| 25 |
+
from typing import Dict, List, Optional, Tuple, Union, Any
|
| 26 |
|
| 27 |
import torch
|
| 28 |
|
| 29 |
from diffusers.image_processor import VaeImageProcessor
|
| 30 |
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
|
|
|
|
| 31 |
from diffusers.utils.torch_utils import randn_tensor
|
| 32 |
|
| 33 |
+
DEFAULT_NATIVE_RESOLUTION = 512
|
|
|
|
|
|
|
| 34 |
|
| 35 |
EXAMPLE_DOC_STRING = """
|
| 36 |
Examples:
|
| 37 |
```py
|
| 38 |
>>> from pathlib import Path
|
|
|
|
| 39 |
>>> from diffusers import DiffusionPipeline
|
| 40 |
+
>>> import torch
|
| 41 |
|
| 42 |
+
>>> model_dir = Path("./NiT-XL").resolve()
|
| 43 |
>>> pipe = DiffusionPipeline.from_pretrained(
|
| 44 |
... str(model_dir),
|
| 45 |
... local_files_only=True,
|
|
|
|
| 55 |
>>> generator = torch.Generator(device="cuda").manual_seed(42)
|
| 56 |
>>> image = pipe(
|
| 57 |
... class_labels="golden retriever",
|
| 58 |
+
... height=512,
|
| 59 |
+
... width=512,
|
| 60 |
... num_inference_steps=250,
|
| 61 |
+
... guidance_scale=2.05,
|
| 62 |
... guidance_interval=(0.0, 0.7),
|
| 63 |
... generator=generator,
|
| 64 |
... ).images[0]
|
|
|
|
| 65 |
```
|
| 66 |
"""
|
| 67 |
|
|
|
|
| 68 |
class NiTPipeline(DiffusionPipeline):
|
| 69 |
r"""
|
| 70 |
Pipeline for native-resolution class-conditional image generation with NiT.
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
Parameters:
|
| 73 |
transformer ([`NiTTransformer2DModel`]):
|
| 74 |
Class-conditional transformer that predicts flow-matching velocity in packed latent space.
|
| 75 |
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
| 76 |
+
Flow-matching Euler scheduler used by NiT.
|
| 77 |
vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
|
| 78 |
Variational autoencoder used to decode packed transformer latents to pixels.
|
| 79 |
id2label (`dict[int, str]`, *optional*):
|
| 80 |
ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
|
| 81 |
"""
|
| 82 |
|
| 83 |
+
@staticmethod
|
| 84 |
+
def prepare_extra_step_kwargs(
|
| 85 |
+
scheduler,
|
| 86 |
+
generator=None,
|
| 87 |
+
eta: float | None = None,
|
| 88 |
+
):
|
| 89 |
+
kwargs = {}
|
| 90 |
+
step_params = set(inspect.signature(scheduler.step).parameters.keys())
|
| 91 |
+
if "generator" in step_params:
|
| 92 |
+
kwargs["generator"] = generator
|
| 93 |
+
if eta is not None and "eta" in step_params:
|
| 94 |
+
kwargs["eta"] = eta
|
| 95 |
+
return kwargs
|
| 96 |
+
|
| 97 |
model_cpu_offload_seq = "transformer->vae"
|
| 98 |
_optional_components = ["vae"]
|
| 99 |
|
|
|
|
| 111 |
self.labels = self._build_label2id(self._id2label)
|
| 112 |
self._labels_loaded_from_model_index = bool(self._id2label)
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
def _ensure_labels_loaded(self) -> None:
|
| 115 |
if self._labels_loaded_from_model_index:
|
| 116 |
return
|
|
|
|
| 261 |
)
|
| 262 |
return packed_latents, image_sizes
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
def _apply_classifier_free_guidance(
|
| 265 |
self,
|
| 266 |
model_output: torch.Tensor,
|
|
|
|
| 324 |
guidance_scale (`float`, defaults to `1.0`):
|
| 325 |
Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
|
| 326 |
guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
|
| 327 |
+
Flow-time interval where CFG is applied.
|
|
|
|
| 328 |
generator (`torch.Generator`, *optional*):
|
| 329 |
RNG for reproducibility.
|
| 330 |
output_type (`str`, defaults to `"pil"`):
|
|
|
|
| 337 |
width = int(width or default_size)
|
| 338 |
self.check_inputs(height, width, num_inference_steps, output_type)
|
| 339 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
device = self._execution_device
|
| 341 |
model_dtype = next(self.transformer.parameters()).dtype
|
| 342 |
class_labels_tensor = self._normalize_class_labels(class_labels)
|
|
|
|
| 348 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 349 |
num_train_timesteps = self.scheduler.config.num_train_timesteps
|
| 350 |
|
| 351 |
+
if getattr(self.scheduler.config, "stochastic_sampling", False):
|
| 352 |
+
raise ValueError(
|
| 353 |
+
"NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
|
| 354 |
+
"(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
|
| 355 |
+
"path uses a different update rule than the official NiT Euler-Maruyama SDE and "
|
| 356 |
+
"produces salt-and-pepper noise."
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
|
| 360 |
guidance_low, guidance_high = guidance_interval
|
| 361 |
|
| 362 |
for t in self.progress_bar(self.scheduler.timesteps):
|
| 363 |
+
flow_time = float(t) / num_train_timesteps
|
| 364 |
guidance_active = guidance_low <= flow_time <= guidance_high
|
| 365 |
if guidance_scale > 1.0 and guidance_active:
|
| 366 |
model_input = torch.cat([packed_latents, packed_latents], dim=0)
|
|
|
|
| 395 |
return (image,)
|
| 396 |
return ImagePipelineOutput(images=image)
|
| 397 |
|
| 398 |
+
NiTPipelineOutput = ImagePipelineOutput
|
|
|
NiT-L/pipeline.py
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
| 2 |
#
|
| 3 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -14,27 +22,24 @@
|
|
| 14 |
|
| 15 |
import json
|
| 16 |
from pathlib import Path
|
| 17 |
-
from typing import Dict, List, Optional, Tuple, Union
|
| 18 |
|
| 19 |
import torch
|
| 20 |
|
| 21 |
from diffusers.image_processor import VaeImageProcessor
|
| 22 |
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
|
| 23 |
-
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
| 24 |
from diffusers.utils.torch_utils import randn_tensor
|
| 25 |
|
| 26 |
-
# Local component classes are loaded dynamically in from_pretrained.
|
| 27 |
-
|
| 28 |
DEFAULT_NATIVE_RESOLUTION = 512
|
| 29 |
|
| 30 |
EXAMPLE_DOC_STRING = """
|
| 31 |
Examples:
|
| 32 |
```py
|
| 33 |
>>> from pathlib import Path
|
| 34 |
-
>>> import torch
|
| 35 |
>>> from diffusers import DiffusionPipeline
|
|
|
|
| 36 |
|
| 37 |
-
>>> model_dir = Path("./NiT-
|
| 38 |
>>> pipe = DiffusionPipeline.from_pretrained(
|
| 39 |
... str(model_dir),
|
| 40 |
... local_files_only=True,
|
|
@@ -57,32 +62,38 @@ EXAMPLE_DOC_STRING = """
|
|
| 57 |
... guidance_interval=(0.0, 0.7),
|
| 58 |
... generator=generator,
|
| 59 |
... ).images[0]
|
| 60 |
-
>>> image.save("demo.png")
|
| 61 |
```
|
| 62 |
"""
|
| 63 |
|
| 64 |
-
|
| 65 |
class NiTPipeline(DiffusionPipeline):
|
| 66 |
r"""
|
| 67 |
Pipeline for native-resolution class-conditional image generation with NiT.
|
| 68 |
|
| 69 |
-
Uses the native [`FlowMatchEulerDiscreteScheduler`] in deterministic (ODE) mode.
|
| 70 |
-
The official NiT repo defaults to an Euler-Maruyama SDE sampler for 512×512; that SDE is
|
| 71 |
-
not the same as the scheduler's `stochastic_sampling` path, so keep
|
| 72 |
-
`scheduler.config.stochastic_sampling=False` and let the scheduler perform the ODE update
|
| 73 |
-
`x_{t+dt} = x_t + dt * v`.
|
| 74 |
-
|
| 75 |
Parameters:
|
| 76 |
transformer ([`NiTTransformer2DModel`]):
|
| 77 |
Class-conditional transformer that predicts flow-matching velocity in packed latent space.
|
| 78 |
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
| 79 |
-
|
| 80 |
vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
|
| 81 |
Variational autoencoder used to decode packed transformer latents to pixels.
|
| 82 |
id2label (`dict[int, str]`, *optional*):
|
| 83 |
ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
|
| 84 |
"""
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
model_cpu_offload_seq = "transformer->vae"
|
| 87 |
_optional_components = ["vae"]
|
| 88 |
|
|
@@ -100,95 +111,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 100 |
self.labels = self._build_label2id(self._id2label)
|
| 101 |
self._labels_loaded_from_model_index = bool(self._id2label)
|
| 102 |
|
| 103 |
-
@classmethod
|
| 104 |
-
def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
|
| 105 |
-
"""Load a self-contained variant folder locally or from the Hub."""
|
| 106 |
-
import importlib
|
| 107 |
-
import sys
|
| 108 |
-
|
| 109 |
-
repo_root = Path(__file__).resolve().parent
|
| 110 |
-
|
| 111 |
-
if pretrained_model_name_or_path in (None, "", "."):
|
| 112 |
-
variant = repo_root
|
| 113 |
-
elif (
|
| 114 |
-
isinstance(pretrained_model_name_or_path, str)
|
| 115 |
-
and "/" in pretrained_model_name_or_path
|
| 116 |
-
and not Path(pretrained_model_name_or_path).exists()
|
| 117 |
-
):
|
| 118 |
-
from huggingface_hub import snapshot_download
|
| 119 |
-
|
| 120 |
-
hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
|
| 121 |
-
if subfolder:
|
| 122 |
-
hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
|
| 123 |
-
cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
|
| 124 |
-
variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
|
| 125 |
-
else:
|
| 126 |
-
variant = Path(pretrained_model_name_or_path)
|
| 127 |
-
if not variant.is_absolute():
|
| 128 |
-
candidate = (Path.cwd() / variant).resolve()
|
| 129 |
-
variant = candidate if candidate.exists() else (repo_root / variant).resolve()
|
| 130 |
-
if subfolder:
|
| 131 |
-
variant = variant / subfolder
|
| 132 |
-
|
| 133 |
-
id2label_override = kwargs.pop("id2label", None)
|
| 134 |
-
model_kwargs = dict(kwargs)
|
| 135 |
-
inserted: List[str] = []
|
| 136 |
-
|
| 137 |
-
def _load_component(folder: str, module_name: str, class_name: str):
|
| 138 |
-
comp_dir = variant / folder
|
| 139 |
-
module_path = comp_dir / f"{module_name}.py"
|
| 140 |
-
has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
|
| 141 |
-
if not module_path.exists() or not has_weights:
|
| 142 |
-
return None
|
| 143 |
-
|
| 144 |
-
comp_path = str(comp_dir)
|
| 145 |
-
if comp_path not in sys.path:
|
| 146 |
-
sys.path.insert(0, comp_path)
|
| 147 |
-
inserted.append(comp_path)
|
| 148 |
-
|
| 149 |
-
module = importlib.import_module(module_name)
|
| 150 |
-
component_cls = getattr(module, class_name)
|
| 151 |
-
return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
|
| 152 |
-
|
| 153 |
-
try:
|
| 154 |
-
transformer = _load_component("transformer", "nit_transformer_2d", "NiTTransformer2DModel")
|
| 155 |
-
try:
|
| 156 |
-
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
|
| 157 |
-
except Exception:
|
| 158 |
-
scheduler = FlowMatchEulerDiscreteScheduler(
|
| 159 |
-
num_train_timesteps=1000,
|
| 160 |
-
shift=1.0,
|
| 161 |
-
stochastic_sampling=False,
|
| 162 |
-
)
|
| 163 |
-
if transformer is None:
|
| 164 |
-
raise ValueError(f"No loadable transformer found under {variant}")
|
| 165 |
-
|
| 166 |
-
vae = None
|
| 167 |
-
vae_dir = variant / "vae"
|
| 168 |
-
if vae_dir.exists() and (vae_dir / "config.json").exists():
|
| 169 |
-
from diffusers import AutoencoderDC, AutoencoderKL
|
| 170 |
-
|
| 171 |
-
vae_class_name = json.loads((vae_dir / "config.json").read_text(encoding="utf-8")).get(
|
| 172 |
-
"_class_name", "AutoencoderDC"
|
| 173 |
-
)
|
| 174 |
-
vae_cls = AutoencoderDC if vae_class_name == "AutoencoderDC" else AutoencoderKL
|
| 175 |
-
vae = vae_cls.from_pretrained(str(vae_dir), **model_kwargs)
|
| 176 |
-
|
| 177 |
-
id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
|
| 178 |
-
pipe = cls(
|
| 179 |
-
transformer=transformer,
|
| 180 |
-
scheduler=scheduler,
|
| 181 |
-
vae=vae,
|
| 182 |
-
id2label=id2label,
|
| 183 |
-
)
|
| 184 |
-
if hasattr(pipe, "register_to_config"):
|
| 185 |
-
pipe.register_to_config(_name_or_path=str(variant))
|
| 186 |
-
return pipe
|
| 187 |
-
finally:
|
| 188 |
-
for comp_path in inserted:
|
| 189 |
-
if comp_path in sys.path:
|
| 190 |
-
sys.path.remove(comp_path)
|
| 191 |
-
|
| 192 |
def _ensure_labels_loaded(self) -> None:
|
| 193 |
if self._labels_loaded_from_model_index:
|
| 194 |
return
|
|
@@ -339,11 +261,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 339 |
)
|
| 340 |
return packed_latents, image_sizes
|
| 341 |
|
| 342 |
-
@staticmethod
|
| 343 |
-
def _flow_time_from_scheduler_timestep(timestep: torch.Tensor, num_train_timesteps: int) -> float:
|
| 344 |
-
"""Map native scheduler timesteps (sigma * num_train_timesteps) to NiT flow time in [0, 1]."""
|
| 345 |
-
return float(timestep) / num_train_timesteps
|
| 346 |
-
|
| 347 |
def _apply_classifier_free_guidance(
|
| 348 |
self,
|
| 349 |
model_output: torch.Tensor,
|
|
@@ -407,8 +324,7 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 407 |
guidance_scale (`float`, defaults to `1.0`):
|
| 408 |
Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
|
| 409 |
guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
|
| 410 |
-
Flow-time interval where CFG is applied.
|
| 411 |
-
`timestep / num_train_timesteps`, matching the official NiT ODE sampler.
|
| 412 |
generator (`torch.Generator`, *optional*):
|
| 413 |
RNG for reproducibility.
|
| 414 |
output_type (`str`, defaults to `"pil"`):
|
|
@@ -421,14 +337,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 421 |
width = int(width or default_size)
|
| 422 |
self.check_inputs(height, width, num_inference_steps, output_type)
|
| 423 |
|
| 424 |
-
if getattr(self.scheduler.config, "stochastic_sampling", False):
|
| 425 |
-
raise ValueError(
|
| 426 |
-
"NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
|
| 427 |
-
"(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
|
| 428 |
-
"path uses a different update rule than the official NiT Euler-Maruyama SDE and "
|
| 429 |
-
"produces salt-and-pepper noise."
|
| 430 |
-
)
|
| 431 |
-
|
| 432 |
device = self._execution_device
|
| 433 |
model_dtype = next(self.transformer.parameters()).dtype
|
| 434 |
class_labels_tensor = self._normalize_class_labels(class_labels)
|
|
@@ -440,11 +348,19 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 440 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 441 |
num_train_timesteps = self.scheduler.config.num_train_timesteps
|
| 442 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
|
| 444 |
guidance_low, guidance_high = guidance_interval
|
| 445 |
|
| 446 |
for t in self.progress_bar(self.scheduler.timesteps):
|
| 447 |
-
flow_time =
|
| 448 |
guidance_active = guidance_low <= flow_time <= guidance_high
|
| 449 |
if guidance_scale > 1.0 and guidance_active:
|
| 450 |
model_input = torch.cat([packed_latents, packed_latents], dim=0)
|
|
@@ -479,5 +395,4 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 479 |
return (image,)
|
| 480 |
return ImagePipelineOutput(images=image)
|
| 481 |
|
| 482 |
-
|
| 483 |
-
NiTPipelineOutput = ImagePipelineOutput
|
|
|
|
| 1 |
+
"""Hub custom pipeline: NiTPipeline.
|
| 2 |
+
Load with native Hugging Face diffusers and trust_remote_code=True.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import inspect
|
| 8 |
+
|
| 9 |
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
| 10 |
#
|
| 11 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
| 22 |
|
| 23 |
import json
|
| 24 |
from pathlib import Path
|
| 25 |
+
from typing import Dict, List, Optional, Tuple, Union, Any
|
| 26 |
|
| 27 |
import torch
|
| 28 |
|
| 29 |
from diffusers.image_processor import VaeImageProcessor
|
| 30 |
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
|
|
|
|
| 31 |
from diffusers.utils.torch_utils import randn_tensor
|
| 32 |
|
|
|
|
|
|
|
| 33 |
DEFAULT_NATIVE_RESOLUTION = 512
|
| 34 |
|
| 35 |
EXAMPLE_DOC_STRING = """
|
| 36 |
Examples:
|
| 37 |
```py
|
| 38 |
>>> from pathlib import Path
|
|
|
|
| 39 |
>>> from diffusers import DiffusionPipeline
|
| 40 |
+
>>> import torch
|
| 41 |
|
| 42 |
+
>>> model_dir = Path("./NiT-XL").resolve()
|
| 43 |
>>> pipe = DiffusionPipeline.from_pretrained(
|
| 44 |
... str(model_dir),
|
| 45 |
... local_files_only=True,
|
|
|
|
| 62 |
... guidance_interval=(0.0, 0.7),
|
| 63 |
... generator=generator,
|
| 64 |
... ).images[0]
|
|
|
|
| 65 |
```
|
| 66 |
"""
|
| 67 |
|
|
|
|
| 68 |
class NiTPipeline(DiffusionPipeline):
|
| 69 |
r"""
|
| 70 |
Pipeline for native-resolution class-conditional image generation with NiT.
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
Parameters:
|
| 73 |
transformer ([`NiTTransformer2DModel`]):
|
| 74 |
Class-conditional transformer that predicts flow-matching velocity in packed latent space.
|
| 75 |
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
| 76 |
+
Flow-matching Euler scheduler used by NiT.
|
| 77 |
vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
|
| 78 |
Variational autoencoder used to decode packed transformer latents to pixels.
|
| 79 |
id2label (`dict[int, str]`, *optional*):
|
| 80 |
ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
|
| 81 |
"""
|
| 82 |
|
| 83 |
+
@staticmethod
|
| 84 |
+
def prepare_extra_step_kwargs(
|
| 85 |
+
scheduler,
|
| 86 |
+
generator=None,
|
| 87 |
+
eta: float | None = None,
|
| 88 |
+
):
|
| 89 |
+
kwargs = {}
|
| 90 |
+
step_params = set(inspect.signature(scheduler.step).parameters.keys())
|
| 91 |
+
if "generator" in step_params:
|
| 92 |
+
kwargs["generator"] = generator
|
| 93 |
+
if eta is not None and "eta" in step_params:
|
| 94 |
+
kwargs["eta"] = eta
|
| 95 |
+
return kwargs
|
| 96 |
+
|
| 97 |
model_cpu_offload_seq = "transformer->vae"
|
| 98 |
_optional_components = ["vae"]
|
| 99 |
|
|
|
|
| 111 |
self.labels = self._build_label2id(self._id2label)
|
| 112 |
self._labels_loaded_from_model_index = bool(self._id2label)
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
def _ensure_labels_loaded(self) -> None:
|
| 115 |
if self._labels_loaded_from_model_index:
|
| 116 |
return
|
|
|
|
| 261 |
)
|
| 262 |
return packed_latents, image_sizes
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
def _apply_classifier_free_guidance(
|
| 265 |
self,
|
| 266 |
model_output: torch.Tensor,
|
|
|
|
| 324 |
guidance_scale (`float`, defaults to `1.0`):
|
| 325 |
Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
|
| 326 |
guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
|
| 327 |
+
Flow-time interval where CFG is applied.
|
|
|
|
| 328 |
generator (`torch.Generator`, *optional*):
|
| 329 |
RNG for reproducibility.
|
| 330 |
output_type (`str`, defaults to `"pil"`):
|
|
|
|
| 337 |
width = int(width or default_size)
|
| 338 |
self.check_inputs(height, width, num_inference_steps, output_type)
|
| 339 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
device = self._execution_device
|
| 341 |
model_dtype = next(self.transformer.parameters()).dtype
|
| 342 |
class_labels_tensor = self._normalize_class_labels(class_labels)
|
|
|
|
| 348 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 349 |
num_train_timesteps = self.scheduler.config.num_train_timesteps
|
| 350 |
|
| 351 |
+
if getattr(self.scheduler.config, "stochastic_sampling", False):
|
| 352 |
+
raise ValueError(
|
| 353 |
+
"NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
|
| 354 |
+
"(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
|
| 355 |
+
"path uses a different update rule than the official NiT Euler-Maruyama SDE and "
|
| 356 |
+
"produces salt-and-pepper noise."
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
|
| 360 |
guidance_low, guidance_high = guidance_interval
|
| 361 |
|
| 362 |
for t in self.progress_bar(self.scheduler.timesteps):
|
| 363 |
+
flow_time = float(t) / num_train_timesteps
|
| 364 |
guidance_active = guidance_low <= flow_time <= guidance_high
|
| 365 |
if guidance_scale > 1.0 and guidance_active:
|
| 366 |
model_input = torch.cat([packed_latents, packed_latents], dim=0)
|
|
|
|
| 395 |
return (image,)
|
| 396 |
return ImagePipelineOutput(images=image)
|
| 397 |
|
| 398 |
+
NiTPipelineOutput = ImagePipelineOutput
|
|
|
NiT-S/pipeline.py
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
| 2 |
#
|
| 3 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -14,27 +22,24 @@
|
|
| 14 |
|
| 15 |
import json
|
| 16 |
from pathlib import Path
|
| 17 |
-
from typing import Dict, List, Optional, Tuple, Union
|
| 18 |
|
| 19 |
import torch
|
| 20 |
|
| 21 |
from diffusers.image_processor import VaeImageProcessor
|
| 22 |
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
|
| 23 |
-
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
| 24 |
from diffusers.utils.torch_utils import randn_tensor
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
DEFAULT_NATIVE_RESOLUTION = 256
|
| 29 |
|
| 30 |
EXAMPLE_DOC_STRING = """
|
| 31 |
Examples:
|
| 32 |
```py
|
| 33 |
>>> from pathlib import Path
|
| 34 |
-
>>> import torch
|
| 35 |
>>> from diffusers import DiffusionPipeline
|
|
|
|
| 36 |
|
| 37 |
-
>>> model_dir = Path("./NiT-
|
| 38 |
>>> pipe = DiffusionPipeline.from_pretrained(
|
| 39 |
... str(model_dir),
|
| 40 |
... local_files_only=True,
|
|
@@ -50,39 +55,45 @@ EXAMPLE_DOC_STRING = """
|
|
| 50 |
>>> generator = torch.Generator(device="cuda").manual_seed(42)
|
| 51 |
>>> image = pipe(
|
| 52 |
... class_labels="golden retriever",
|
| 53 |
-
... height=
|
| 54 |
-
... width=
|
| 55 |
... num_inference_steps=250,
|
| 56 |
-
... guidance_scale=2.
|
| 57 |
... guidance_interval=(0.0, 0.7),
|
| 58 |
... generator=generator,
|
| 59 |
... ).images[0]
|
| 60 |
-
>>> image.save("demo.png")
|
| 61 |
```
|
| 62 |
"""
|
| 63 |
|
| 64 |
-
|
| 65 |
class NiTPipeline(DiffusionPipeline):
|
| 66 |
r"""
|
| 67 |
Pipeline for native-resolution class-conditional image generation with NiT.
|
| 68 |
|
| 69 |
-
Uses the native [`FlowMatchEulerDiscreteScheduler`] in deterministic (ODE) mode.
|
| 70 |
-
The official NiT repo defaults to an Euler-Maruyama SDE sampler for 512×512; that SDE is
|
| 71 |
-
not the same as the scheduler's `stochastic_sampling` path, so keep
|
| 72 |
-
`scheduler.config.stochastic_sampling=False` and let the scheduler perform the ODE update
|
| 73 |
-
`x_{t+dt} = x_t + dt * v`.
|
| 74 |
-
|
| 75 |
Parameters:
|
| 76 |
transformer ([`NiTTransformer2DModel`]):
|
| 77 |
Class-conditional transformer that predicts flow-matching velocity in packed latent space.
|
| 78 |
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
| 79 |
-
|
| 80 |
vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
|
| 81 |
Variational autoencoder used to decode packed transformer latents to pixels.
|
| 82 |
id2label (`dict[int, str]`, *optional*):
|
| 83 |
ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
|
| 84 |
"""
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
model_cpu_offload_seq = "transformer->vae"
|
| 87 |
_optional_components = ["vae"]
|
| 88 |
|
|
@@ -100,95 +111,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 100 |
self.labels = self._build_label2id(self._id2label)
|
| 101 |
self._labels_loaded_from_model_index = bool(self._id2label)
|
| 102 |
|
| 103 |
-
@classmethod
|
| 104 |
-
def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
|
| 105 |
-
"""Load a self-contained variant folder locally or from the Hub."""
|
| 106 |
-
import importlib
|
| 107 |
-
import sys
|
| 108 |
-
|
| 109 |
-
repo_root = Path(__file__).resolve().parent
|
| 110 |
-
|
| 111 |
-
if pretrained_model_name_or_path in (None, "", "."):
|
| 112 |
-
variant = repo_root
|
| 113 |
-
elif (
|
| 114 |
-
isinstance(pretrained_model_name_or_path, str)
|
| 115 |
-
and "/" in pretrained_model_name_or_path
|
| 116 |
-
and not Path(pretrained_model_name_or_path).exists()
|
| 117 |
-
):
|
| 118 |
-
from huggingface_hub import snapshot_download
|
| 119 |
-
|
| 120 |
-
hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
|
| 121 |
-
if subfolder:
|
| 122 |
-
hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
|
| 123 |
-
cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
|
| 124 |
-
variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
|
| 125 |
-
else:
|
| 126 |
-
variant = Path(pretrained_model_name_or_path)
|
| 127 |
-
if not variant.is_absolute():
|
| 128 |
-
candidate = (Path.cwd() / variant).resolve()
|
| 129 |
-
variant = candidate if candidate.exists() else (repo_root / variant).resolve()
|
| 130 |
-
if subfolder:
|
| 131 |
-
variant = variant / subfolder
|
| 132 |
-
|
| 133 |
-
id2label_override = kwargs.pop("id2label", None)
|
| 134 |
-
model_kwargs = dict(kwargs)
|
| 135 |
-
inserted: List[str] = []
|
| 136 |
-
|
| 137 |
-
def _load_component(folder: str, module_name: str, class_name: str):
|
| 138 |
-
comp_dir = variant / folder
|
| 139 |
-
module_path = comp_dir / f"{module_name}.py"
|
| 140 |
-
has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
|
| 141 |
-
if not module_path.exists() or not has_weights:
|
| 142 |
-
return None
|
| 143 |
-
|
| 144 |
-
comp_path = str(comp_dir)
|
| 145 |
-
if comp_path not in sys.path:
|
| 146 |
-
sys.path.insert(0, comp_path)
|
| 147 |
-
inserted.append(comp_path)
|
| 148 |
-
|
| 149 |
-
module = importlib.import_module(module_name)
|
| 150 |
-
component_cls = getattr(module, class_name)
|
| 151 |
-
return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
|
| 152 |
-
|
| 153 |
-
try:
|
| 154 |
-
transformer = _load_component("transformer", "nit_transformer_2d", "NiTTransformer2DModel")
|
| 155 |
-
try:
|
| 156 |
-
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
|
| 157 |
-
except Exception:
|
| 158 |
-
scheduler = FlowMatchEulerDiscreteScheduler(
|
| 159 |
-
num_train_timesteps=1000,
|
| 160 |
-
shift=1.0,
|
| 161 |
-
stochastic_sampling=False,
|
| 162 |
-
)
|
| 163 |
-
if transformer is None:
|
| 164 |
-
raise ValueError(f"No loadable transformer found under {variant}")
|
| 165 |
-
|
| 166 |
-
vae = None
|
| 167 |
-
vae_dir = variant / "vae"
|
| 168 |
-
if vae_dir.exists() and (vae_dir / "config.json").exists():
|
| 169 |
-
from diffusers import AutoencoderDC, AutoencoderKL
|
| 170 |
-
|
| 171 |
-
vae_class_name = json.loads((vae_dir / "config.json").read_text(encoding="utf-8")).get(
|
| 172 |
-
"_class_name", "AutoencoderDC"
|
| 173 |
-
)
|
| 174 |
-
vae_cls = AutoencoderDC if vae_class_name == "AutoencoderDC" else AutoencoderKL
|
| 175 |
-
vae = vae_cls.from_pretrained(str(vae_dir), **model_kwargs)
|
| 176 |
-
|
| 177 |
-
id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
|
| 178 |
-
pipe = cls(
|
| 179 |
-
transformer=transformer,
|
| 180 |
-
scheduler=scheduler,
|
| 181 |
-
vae=vae,
|
| 182 |
-
id2label=id2label,
|
| 183 |
-
)
|
| 184 |
-
if hasattr(pipe, "register_to_config"):
|
| 185 |
-
pipe.register_to_config(_name_or_path=str(variant))
|
| 186 |
-
return pipe
|
| 187 |
-
finally:
|
| 188 |
-
for comp_path in inserted:
|
| 189 |
-
if comp_path in sys.path:
|
| 190 |
-
sys.path.remove(comp_path)
|
| 191 |
-
|
| 192 |
def _ensure_labels_loaded(self) -> None:
|
| 193 |
if self._labels_loaded_from_model_index:
|
| 194 |
return
|
|
@@ -339,11 +261,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 339 |
)
|
| 340 |
return packed_latents, image_sizes
|
| 341 |
|
| 342 |
-
@staticmethod
|
| 343 |
-
def _flow_time_from_scheduler_timestep(timestep: torch.Tensor, num_train_timesteps: int) -> float:
|
| 344 |
-
"""Map native scheduler timesteps (sigma * num_train_timesteps) to NiT flow time in [0, 1]."""
|
| 345 |
-
return float(timestep) / num_train_timesteps
|
| 346 |
-
|
| 347 |
def _apply_classifier_free_guidance(
|
| 348 |
self,
|
| 349 |
model_output: torch.Tensor,
|
|
@@ -407,8 +324,7 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 407 |
guidance_scale (`float`, defaults to `1.0`):
|
| 408 |
Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
|
| 409 |
guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
|
| 410 |
-
Flow-time interval where CFG is applied.
|
| 411 |
-
`timestep / num_train_timesteps`, matching the official NiT ODE sampler.
|
| 412 |
generator (`torch.Generator`, *optional*):
|
| 413 |
RNG for reproducibility.
|
| 414 |
output_type (`str`, defaults to `"pil"`):
|
|
@@ -421,14 +337,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 421 |
width = int(width or default_size)
|
| 422 |
self.check_inputs(height, width, num_inference_steps, output_type)
|
| 423 |
|
| 424 |
-
if getattr(self.scheduler.config, "stochastic_sampling", False):
|
| 425 |
-
raise ValueError(
|
| 426 |
-
"NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
|
| 427 |
-
"(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
|
| 428 |
-
"path uses a different update rule than the official NiT Euler-Maruyama SDE and "
|
| 429 |
-
"produces salt-and-pepper noise."
|
| 430 |
-
)
|
| 431 |
-
|
| 432 |
device = self._execution_device
|
| 433 |
model_dtype = next(self.transformer.parameters()).dtype
|
| 434 |
class_labels_tensor = self._normalize_class_labels(class_labels)
|
|
@@ -440,11 +348,19 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 440 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 441 |
num_train_timesteps = self.scheduler.config.num_train_timesteps
|
| 442 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
|
| 444 |
guidance_low, guidance_high = guidance_interval
|
| 445 |
|
| 446 |
for t in self.progress_bar(self.scheduler.timesteps):
|
| 447 |
-
flow_time =
|
| 448 |
guidance_active = guidance_low <= flow_time <= guidance_high
|
| 449 |
if guidance_scale > 1.0 and guidance_active:
|
| 450 |
model_input = torch.cat([packed_latents, packed_latents], dim=0)
|
|
@@ -479,5 +395,4 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 479 |
return (image,)
|
| 480 |
return ImagePipelineOutput(images=image)
|
| 481 |
|
| 482 |
-
|
| 483 |
-
NiTPipelineOutput = ImagePipelineOutput
|
|
|
|
| 1 |
+
"""Hub custom pipeline: NiTPipeline.
|
| 2 |
+
Load with native Hugging Face diffusers and trust_remote_code=True.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import inspect
|
| 8 |
+
|
| 9 |
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
| 10 |
#
|
| 11 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
| 22 |
|
| 23 |
import json
|
| 24 |
from pathlib import Path
|
| 25 |
+
from typing import Dict, List, Optional, Tuple, Union, Any
|
| 26 |
|
| 27 |
import torch
|
| 28 |
|
| 29 |
from diffusers.image_processor import VaeImageProcessor
|
| 30 |
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
|
|
|
|
| 31 |
from diffusers.utils.torch_utils import randn_tensor
|
| 32 |
|
| 33 |
+
DEFAULT_NATIVE_RESOLUTION = 512
|
|
|
|
|
|
|
| 34 |
|
| 35 |
EXAMPLE_DOC_STRING = """
|
| 36 |
Examples:
|
| 37 |
```py
|
| 38 |
>>> from pathlib import Path
|
|
|
|
| 39 |
>>> from diffusers import DiffusionPipeline
|
| 40 |
+
>>> import torch
|
| 41 |
|
| 42 |
+
>>> model_dir = Path("./NiT-XL").resolve()
|
| 43 |
>>> pipe = DiffusionPipeline.from_pretrained(
|
| 44 |
... str(model_dir),
|
| 45 |
... local_files_only=True,
|
|
|
|
| 55 |
>>> generator = torch.Generator(device="cuda").manual_seed(42)
|
| 56 |
>>> image = pipe(
|
| 57 |
... class_labels="golden retriever",
|
| 58 |
+
... height=512,
|
| 59 |
+
... width=512,
|
| 60 |
... num_inference_steps=250,
|
| 61 |
+
... guidance_scale=2.05,
|
| 62 |
... guidance_interval=(0.0, 0.7),
|
| 63 |
... generator=generator,
|
| 64 |
... ).images[0]
|
|
|
|
| 65 |
```
|
| 66 |
"""
|
| 67 |
|
|
|
|
| 68 |
class NiTPipeline(DiffusionPipeline):
|
| 69 |
r"""
|
| 70 |
Pipeline for native-resolution class-conditional image generation with NiT.
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
Parameters:
|
| 73 |
transformer ([`NiTTransformer2DModel`]):
|
| 74 |
Class-conditional transformer that predicts flow-matching velocity in packed latent space.
|
| 75 |
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
| 76 |
+
Flow-matching Euler scheduler used by NiT.
|
| 77 |
vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
|
| 78 |
Variational autoencoder used to decode packed transformer latents to pixels.
|
| 79 |
id2label (`dict[int, str]`, *optional*):
|
| 80 |
ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
|
| 81 |
"""
|
| 82 |
|
| 83 |
+
@staticmethod
|
| 84 |
+
def prepare_extra_step_kwargs(
|
| 85 |
+
scheduler,
|
| 86 |
+
generator=None,
|
| 87 |
+
eta: float | None = None,
|
| 88 |
+
):
|
| 89 |
+
kwargs = {}
|
| 90 |
+
step_params = set(inspect.signature(scheduler.step).parameters.keys())
|
| 91 |
+
if "generator" in step_params:
|
| 92 |
+
kwargs["generator"] = generator
|
| 93 |
+
if eta is not None and "eta" in step_params:
|
| 94 |
+
kwargs["eta"] = eta
|
| 95 |
+
return kwargs
|
| 96 |
+
|
| 97 |
model_cpu_offload_seq = "transformer->vae"
|
| 98 |
_optional_components = ["vae"]
|
| 99 |
|
|
|
|
| 111 |
self.labels = self._build_label2id(self._id2label)
|
| 112 |
self._labels_loaded_from_model_index = bool(self._id2label)
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
def _ensure_labels_loaded(self) -> None:
|
| 115 |
if self._labels_loaded_from_model_index:
|
| 116 |
return
|
|
|
|
| 261 |
)
|
| 262 |
return packed_latents, image_sizes
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
def _apply_classifier_free_guidance(
|
| 265 |
self,
|
| 266 |
model_output: torch.Tensor,
|
|
|
|
| 324 |
guidance_scale (`float`, defaults to `1.0`):
|
| 325 |
Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
|
| 326 |
guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
|
| 327 |
+
Flow-time interval where CFG is applied.
|
|
|
|
| 328 |
generator (`torch.Generator`, *optional*):
|
| 329 |
RNG for reproducibility.
|
| 330 |
output_type (`str`, defaults to `"pil"`):
|
|
|
|
| 337 |
width = int(width or default_size)
|
| 338 |
self.check_inputs(height, width, num_inference_steps, output_type)
|
| 339 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
device = self._execution_device
|
| 341 |
model_dtype = next(self.transformer.parameters()).dtype
|
| 342 |
class_labels_tensor = self._normalize_class_labels(class_labels)
|
|
|
|
| 348 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 349 |
num_train_timesteps = self.scheduler.config.num_train_timesteps
|
| 350 |
|
| 351 |
+
if getattr(self.scheduler.config, "stochastic_sampling", False):
|
| 352 |
+
raise ValueError(
|
| 353 |
+
"NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
|
| 354 |
+
"(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
|
| 355 |
+
"path uses a different update rule than the official NiT Euler-Maruyama SDE and "
|
| 356 |
+
"produces salt-and-pepper noise."
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
|
| 360 |
guidance_low, guidance_high = guidance_interval
|
| 361 |
|
| 362 |
for t in self.progress_bar(self.scheduler.timesteps):
|
| 363 |
+
flow_time = float(t) / num_train_timesteps
|
| 364 |
guidance_active = guidance_low <= flow_time <= guidance_high
|
| 365 |
if guidance_scale > 1.0 and guidance_active:
|
| 366 |
model_input = torch.cat([packed_latents, packed_latents], dim=0)
|
|
|
|
| 395 |
return (image,)
|
| 396 |
return ImagePipelineOutput(images=image)
|
| 397 |
|
| 398 |
+
NiTPipelineOutput = ImagePipelineOutput
|
|
|
NiT-XL/pipeline.py
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
| 2 |
#
|
| 3 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
@@ -14,25 +22,22 @@
|
|
| 14 |
|
| 15 |
import json
|
| 16 |
from pathlib import Path
|
| 17 |
-
from typing import Dict, List, Optional, Tuple, Union
|
| 18 |
|
| 19 |
import torch
|
| 20 |
|
| 21 |
from diffusers.image_processor import VaeImageProcessor
|
| 22 |
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
|
| 23 |
-
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
| 24 |
from diffusers.utils.torch_utils import randn_tensor
|
| 25 |
|
| 26 |
-
# Local component classes are loaded dynamically in from_pretrained.
|
| 27 |
-
|
| 28 |
DEFAULT_NATIVE_RESOLUTION = 512
|
| 29 |
|
| 30 |
EXAMPLE_DOC_STRING = """
|
| 31 |
Examples:
|
| 32 |
```py
|
| 33 |
>>> from pathlib import Path
|
| 34 |
-
>>> import torch
|
| 35 |
>>> from diffusers import DiffusionPipeline
|
|
|
|
| 36 |
|
| 37 |
>>> model_dir = Path("./NiT-XL").resolve()
|
| 38 |
>>> pipe = DiffusionPipeline.from_pretrained(
|
|
@@ -57,32 +62,38 @@ EXAMPLE_DOC_STRING = """
|
|
| 57 |
... guidance_interval=(0.0, 0.7),
|
| 58 |
... generator=generator,
|
| 59 |
... ).images[0]
|
| 60 |
-
>>> image.save("demo.png")
|
| 61 |
```
|
| 62 |
"""
|
| 63 |
|
| 64 |
-
|
| 65 |
class NiTPipeline(DiffusionPipeline):
|
| 66 |
r"""
|
| 67 |
Pipeline for native-resolution class-conditional image generation with NiT.
|
| 68 |
|
| 69 |
-
Uses the native [`FlowMatchEulerDiscreteScheduler`] in deterministic (ODE) mode.
|
| 70 |
-
The official NiT repo defaults to an Euler-Maruyama SDE sampler for 512×512; that SDE is
|
| 71 |
-
not the same as the scheduler's `stochastic_sampling` path, so keep
|
| 72 |
-
`scheduler.config.stochastic_sampling=False` and let the scheduler perform the ODE update
|
| 73 |
-
`x_{t+dt} = x_t + dt * v`.
|
| 74 |
-
|
| 75 |
Parameters:
|
| 76 |
transformer ([`NiTTransformer2DModel`]):
|
| 77 |
Class-conditional transformer that predicts flow-matching velocity in packed latent space.
|
| 78 |
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
| 79 |
-
|
| 80 |
vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
|
| 81 |
Variational autoencoder used to decode packed transformer latents to pixels.
|
| 82 |
id2label (`dict[int, str]`, *optional*):
|
| 83 |
ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
|
| 84 |
"""
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
model_cpu_offload_seq = "transformer->vae"
|
| 87 |
_optional_components = ["vae"]
|
| 88 |
|
|
@@ -100,95 +111,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 100 |
self.labels = self._build_label2id(self._id2label)
|
| 101 |
self._labels_loaded_from_model_index = bool(self._id2label)
|
| 102 |
|
| 103 |
-
@classmethod
|
| 104 |
-
def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
|
| 105 |
-
"""Load a self-contained variant folder locally or from the Hub."""
|
| 106 |
-
import importlib
|
| 107 |
-
import sys
|
| 108 |
-
|
| 109 |
-
repo_root = Path(__file__).resolve().parent
|
| 110 |
-
|
| 111 |
-
if pretrained_model_name_or_path in (None, "", "."):
|
| 112 |
-
variant = repo_root
|
| 113 |
-
elif (
|
| 114 |
-
isinstance(pretrained_model_name_or_path, str)
|
| 115 |
-
and "/" in pretrained_model_name_or_path
|
| 116 |
-
and not Path(pretrained_model_name_or_path).exists()
|
| 117 |
-
):
|
| 118 |
-
from huggingface_hub import snapshot_download
|
| 119 |
-
|
| 120 |
-
hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
|
| 121 |
-
if subfolder:
|
| 122 |
-
hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
|
| 123 |
-
cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
|
| 124 |
-
variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
|
| 125 |
-
else:
|
| 126 |
-
variant = Path(pretrained_model_name_or_path)
|
| 127 |
-
if not variant.is_absolute():
|
| 128 |
-
candidate = (Path.cwd() / variant).resolve()
|
| 129 |
-
variant = candidate if candidate.exists() else (repo_root / variant).resolve()
|
| 130 |
-
if subfolder:
|
| 131 |
-
variant = variant / subfolder
|
| 132 |
-
|
| 133 |
-
id2label_override = kwargs.pop("id2label", None)
|
| 134 |
-
model_kwargs = dict(kwargs)
|
| 135 |
-
inserted: List[str] = []
|
| 136 |
-
|
| 137 |
-
def _load_component(folder: str, module_name: str, class_name: str):
|
| 138 |
-
comp_dir = variant / folder
|
| 139 |
-
module_path = comp_dir / f"{module_name}.py"
|
| 140 |
-
has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
|
| 141 |
-
if not module_path.exists() or not has_weights:
|
| 142 |
-
return None
|
| 143 |
-
|
| 144 |
-
comp_path = str(comp_dir)
|
| 145 |
-
if comp_path not in sys.path:
|
| 146 |
-
sys.path.insert(0, comp_path)
|
| 147 |
-
inserted.append(comp_path)
|
| 148 |
-
|
| 149 |
-
module = importlib.import_module(module_name)
|
| 150 |
-
component_cls = getattr(module, class_name)
|
| 151 |
-
return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
|
| 152 |
-
|
| 153 |
-
try:
|
| 154 |
-
transformer = _load_component("transformer", "nit_transformer_2d", "NiTTransformer2DModel")
|
| 155 |
-
try:
|
| 156 |
-
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
|
| 157 |
-
except Exception:
|
| 158 |
-
scheduler = FlowMatchEulerDiscreteScheduler(
|
| 159 |
-
num_train_timesteps=1000,
|
| 160 |
-
shift=1.0,
|
| 161 |
-
stochastic_sampling=False,
|
| 162 |
-
)
|
| 163 |
-
if transformer is None:
|
| 164 |
-
raise ValueError(f"No loadable transformer found under {variant}")
|
| 165 |
-
|
| 166 |
-
vae = None
|
| 167 |
-
vae_dir = variant / "vae"
|
| 168 |
-
if vae_dir.exists() and (vae_dir / "config.json").exists():
|
| 169 |
-
from diffusers import AutoencoderDC, AutoencoderKL
|
| 170 |
-
|
| 171 |
-
vae_class_name = json.loads((vae_dir / "config.json").read_text(encoding="utf-8")).get(
|
| 172 |
-
"_class_name", "AutoencoderDC"
|
| 173 |
-
)
|
| 174 |
-
vae_cls = AutoencoderDC if vae_class_name == "AutoencoderDC" else AutoencoderKL
|
| 175 |
-
vae = vae_cls.from_pretrained(str(vae_dir), **model_kwargs)
|
| 176 |
-
|
| 177 |
-
id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
|
| 178 |
-
pipe = cls(
|
| 179 |
-
transformer=transformer,
|
| 180 |
-
scheduler=scheduler,
|
| 181 |
-
vae=vae,
|
| 182 |
-
id2label=id2label,
|
| 183 |
-
)
|
| 184 |
-
if hasattr(pipe, "register_to_config"):
|
| 185 |
-
pipe.register_to_config(_name_or_path=str(variant))
|
| 186 |
-
return pipe
|
| 187 |
-
finally:
|
| 188 |
-
for comp_path in inserted:
|
| 189 |
-
if comp_path in sys.path:
|
| 190 |
-
sys.path.remove(comp_path)
|
| 191 |
-
|
| 192 |
def _ensure_labels_loaded(self) -> None:
|
| 193 |
if self._labels_loaded_from_model_index:
|
| 194 |
return
|
|
@@ -339,11 +261,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 339 |
)
|
| 340 |
return packed_latents, image_sizes
|
| 341 |
|
| 342 |
-
@staticmethod
|
| 343 |
-
def _flow_time_from_scheduler_timestep(timestep: torch.Tensor, num_train_timesteps: int) -> float:
|
| 344 |
-
"""Map native scheduler timesteps (sigma * num_train_timesteps) to NiT flow time in [0, 1]."""
|
| 345 |
-
return float(timestep) / num_train_timesteps
|
| 346 |
-
|
| 347 |
def _apply_classifier_free_guidance(
|
| 348 |
self,
|
| 349 |
model_output: torch.Tensor,
|
|
@@ -407,8 +324,7 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 407 |
guidance_scale (`float`, defaults to `1.0`):
|
| 408 |
Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
|
| 409 |
guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
|
| 410 |
-
Flow-time interval where CFG is applied.
|
| 411 |
-
`timestep / num_train_timesteps`, matching the official NiT ODE sampler.
|
| 412 |
generator (`torch.Generator`, *optional*):
|
| 413 |
RNG for reproducibility.
|
| 414 |
output_type (`str`, defaults to `"pil"`):
|
|
@@ -421,14 +337,6 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 421 |
width = int(width or default_size)
|
| 422 |
self.check_inputs(height, width, num_inference_steps, output_type)
|
| 423 |
|
| 424 |
-
if getattr(self.scheduler.config, "stochastic_sampling", False):
|
| 425 |
-
raise ValueError(
|
| 426 |
-
"NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
|
| 427 |
-
"(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
|
| 428 |
-
"path uses a different update rule than the official NiT Euler-Maruyama SDE and "
|
| 429 |
-
"produces salt-and-pepper noise."
|
| 430 |
-
)
|
| 431 |
-
|
| 432 |
device = self._execution_device
|
| 433 |
model_dtype = next(self.transformer.parameters()).dtype
|
| 434 |
class_labels_tensor = self._normalize_class_labels(class_labels)
|
|
@@ -440,11 +348,19 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 440 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 441 |
num_train_timesteps = self.scheduler.config.num_train_timesteps
|
| 442 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
|
| 444 |
guidance_low, guidance_high = guidance_interval
|
| 445 |
|
| 446 |
for t in self.progress_bar(self.scheduler.timesteps):
|
| 447 |
-
flow_time =
|
| 448 |
guidance_active = guidance_low <= flow_time <= guidance_high
|
| 449 |
if guidance_scale > 1.0 and guidance_active:
|
| 450 |
model_input = torch.cat([packed_latents, packed_latents], dim=0)
|
|
@@ -479,5 +395,4 @@ class NiTPipeline(DiffusionPipeline):
|
|
| 479 |
return (image,)
|
| 480 |
return ImagePipelineOutput(images=image)
|
| 481 |
|
| 482 |
-
|
| 483 |
-
NiTPipelineOutput = ImagePipelineOutput
|
|
|
|
| 1 |
+
"""Hub custom pipeline: NiTPipeline.
|
| 2 |
+
Load with native Hugging Face diffusers and trust_remote_code=True.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import inspect
|
| 8 |
+
|
| 9 |
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
| 10 |
#
|
| 11 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
| 22 |
|
| 23 |
import json
|
| 24 |
from pathlib import Path
|
| 25 |
+
from typing import Dict, List, Optional, Tuple, Union, Any
|
| 26 |
|
| 27 |
import torch
|
| 28 |
|
| 29 |
from diffusers.image_processor import VaeImageProcessor
|
| 30 |
from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
|
|
|
|
| 31 |
from diffusers.utils.torch_utils import randn_tensor
|
| 32 |
|
|
|
|
|
|
|
| 33 |
DEFAULT_NATIVE_RESOLUTION = 512
|
| 34 |
|
| 35 |
EXAMPLE_DOC_STRING = """
|
| 36 |
Examples:
|
| 37 |
```py
|
| 38 |
>>> from pathlib import Path
|
|
|
|
| 39 |
>>> from diffusers import DiffusionPipeline
|
| 40 |
+
>>> import torch
|
| 41 |
|
| 42 |
>>> model_dir = Path("./NiT-XL").resolve()
|
| 43 |
>>> pipe = DiffusionPipeline.from_pretrained(
|
|
|
|
| 62 |
... guidance_interval=(0.0, 0.7),
|
| 63 |
... generator=generator,
|
| 64 |
... ).images[0]
|
|
|
|
| 65 |
```
|
| 66 |
"""
|
| 67 |
|
|
|
|
| 68 |
class NiTPipeline(DiffusionPipeline):
|
| 69 |
r"""
|
| 70 |
Pipeline for native-resolution class-conditional image generation with NiT.
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
Parameters:
|
| 73 |
transformer ([`NiTTransformer2DModel`]):
|
| 74 |
Class-conditional transformer that predicts flow-matching velocity in packed latent space.
|
| 75 |
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
| 76 |
+
Flow-matching Euler scheduler used by NiT.
|
| 77 |
vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
|
| 78 |
Variational autoencoder used to decode packed transformer latents to pixels.
|
| 79 |
id2label (`dict[int, str]`, *optional*):
|
| 80 |
ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
|
| 81 |
"""
|
| 82 |
|
| 83 |
+
@staticmethod
|
| 84 |
+
def prepare_extra_step_kwargs(
|
| 85 |
+
scheduler,
|
| 86 |
+
generator=None,
|
| 87 |
+
eta: float | None = None,
|
| 88 |
+
):
|
| 89 |
+
kwargs = {}
|
| 90 |
+
step_params = set(inspect.signature(scheduler.step).parameters.keys())
|
| 91 |
+
if "generator" in step_params:
|
| 92 |
+
kwargs["generator"] = generator
|
| 93 |
+
if eta is not None and "eta" in step_params:
|
| 94 |
+
kwargs["eta"] = eta
|
| 95 |
+
return kwargs
|
| 96 |
+
|
| 97 |
model_cpu_offload_seq = "transformer->vae"
|
| 98 |
_optional_components = ["vae"]
|
| 99 |
|
|
|
|
| 111 |
self.labels = self._build_label2id(self._id2label)
|
| 112 |
self._labels_loaded_from_model_index = bool(self._id2label)
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
def _ensure_labels_loaded(self) -> None:
|
| 115 |
if self._labels_loaded_from_model_index:
|
| 116 |
return
|
|
|
|
| 261 |
)
|
| 262 |
return packed_latents, image_sizes
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
def _apply_classifier_free_guidance(
|
| 265 |
self,
|
| 266 |
model_output: torch.Tensor,
|
|
|
|
| 324 |
guidance_scale (`float`, defaults to `1.0`):
|
| 325 |
Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
|
| 326 |
guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
|
| 327 |
+
Flow-time interval where CFG is applied.
|
|
|
|
| 328 |
generator (`torch.Generator`, *optional*):
|
| 329 |
RNG for reproducibility.
|
| 330 |
output_type (`str`, defaults to `"pil"`):
|
|
|
|
| 337 |
width = int(width or default_size)
|
| 338 |
self.check_inputs(height, width, num_inference_steps, output_type)
|
| 339 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
device = self._execution_device
|
| 341 |
model_dtype = next(self.transformer.parameters()).dtype
|
| 342 |
class_labels_tensor = self._normalize_class_labels(class_labels)
|
|
|
|
| 348 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 349 |
num_train_timesteps = self.scheduler.config.num_train_timesteps
|
| 350 |
|
| 351 |
+
if getattr(self.scheduler.config, "stochastic_sampling", False):
|
| 352 |
+
raise ValueError(
|
| 353 |
+
"NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
|
| 354 |
+
"(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
|
| 355 |
+
"path uses a different update rule than the official NiT Euler-Maruyama SDE and "
|
| 356 |
+
"produces salt-and-pepper noise."
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
|
| 360 |
guidance_low, guidance_high = guidance_interval
|
| 361 |
|
| 362 |
for t in self.progress_bar(self.scheduler.timesteps):
|
| 363 |
+
flow_time = float(t) / num_train_timesteps
|
| 364 |
guidance_active = guidance_low <= flow_time <= guidance_high
|
| 365 |
if guidance_scale > 1.0 and guidance_active:
|
| 366 |
model_input = torch.cat([packed_latents, packed_latents], dim=0)
|
|
|
|
| 395 |
return (image,)
|
| 396 |
return ImagePipelineOutput(images=image)
|
| 397 |
|
| 398 |
+
NiTPipelineOutput = ImagePipelineOutput
|
|
|