BiliSakura committed on
Commit
5423eac
·
verified ·
1 Parent(s): 3d7e8b9

Fix generator determinism: forward generator through scheduler steps and seeded noise

Browse files
Files changed (4) hide show
  1. NiT-B/pipeline.py +41 -126
  2. NiT-L/pipeline.py +37 -122
  3. NiT-S/pipeline.py +41 -126
  4. NiT-XL/pipeline.py +36 -121
NiT-B/pipeline.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  # Copyright 2026 The HuggingFace Team. All rights reserved.
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,27 +22,24 @@
14
 
15
  import json
16
  from pathlib import Path
17
- from typing import Dict, List, Optional, Tuple, Union
18
 
19
  import torch
20
 
21
  from diffusers.image_processor import VaeImageProcessor
22
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
23
- from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
24
  from diffusers.utils.torch_utils import randn_tensor
25
 
26
- # Local component classes are loaded dynamically in from_pretrained.
27
-
28
- DEFAULT_NATIVE_RESOLUTION = 256
29
 
30
  EXAMPLE_DOC_STRING = """
31
  Examples:
32
  ```py
33
  >>> from pathlib import Path
34
- >>> import torch
35
  >>> from diffusers import DiffusionPipeline
 
36
 
37
- >>> model_dir = Path("./NiT-B").resolve()
38
  >>> pipe = DiffusionPipeline.from_pretrained(
39
  ... str(model_dir),
40
  ... local_files_only=True,
@@ -50,39 +55,45 @@ EXAMPLE_DOC_STRING = """
50
  >>> generator = torch.Generator(device="cuda").manual_seed(42)
51
  >>> image = pipe(
52
  ... class_labels="golden retriever",
53
- ... height=256,
54
- ... width=256,
55
  ... num_inference_steps=250,
56
- ... guidance_scale=2.25,
57
  ... guidance_interval=(0.0, 0.7),
58
  ... generator=generator,
59
  ... ).images[0]
60
- >>> image.save("demo.png")
61
  ```
62
  """
63
 
64
-
65
  class NiTPipeline(DiffusionPipeline):
66
  r"""
67
  Pipeline for native-resolution class-conditional image generation with NiT.
68
 
69
- Uses the native [`FlowMatchEulerDiscreteScheduler`] in deterministic (ODE) mode.
70
- The official NiT repo defaults to an Euler-Maruyama SDE sampler for 512×512; that SDE is
71
- not the same as the scheduler's `stochastic_sampling` path, so keep
72
- `scheduler.config.stochastic_sampling=False` and let the scheduler perform the ODE update
73
- `x_{t+dt} = x_t + dt * v`.
74
-
75
  Parameters:
76
  transformer ([`NiTTransformer2DModel`]):
77
  Class-conditional transformer that predicts flow-matching velocity in packed latent space.
78
  scheduler ([`FlowMatchEulerDiscreteScheduler`]):
79
- Native diffusers flow-matching Euler scheduler (`stochastic_sampling=False`).
80
  vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
81
  Variational autoencoder used to decode packed transformer latents to pixels.
82
  id2label (`dict[int, str]`, *optional*):
83
  ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
84
  """
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  model_cpu_offload_seq = "transformer->vae"
87
  _optional_components = ["vae"]
88
 
@@ -100,95 +111,6 @@ class NiTPipeline(DiffusionPipeline):
100
  self.labels = self._build_label2id(self._id2label)
101
  self._labels_loaded_from_model_index = bool(self._id2label)
102
 
103
- @classmethod
104
- def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
105
- """Load a self-contained variant folder locally or from the Hub."""
106
- import importlib
107
- import sys
108
-
109
- repo_root = Path(__file__).resolve().parent
110
-
111
- if pretrained_model_name_or_path in (None, "", "."):
112
- variant = repo_root
113
- elif (
114
- isinstance(pretrained_model_name_or_path, str)
115
- and "/" in pretrained_model_name_or_path
116
- and not Path(pretrained_model_name_or_path).exists()
117
- ):
118
- from huggingface_hub import snapshot_download
119
-
120
- hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
121
- if subfolder:
122
- hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
123
- cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
124
- variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
125
- else:
126
- variant = Path(pretrained_model_name_or_path)
127
- if not variant.is_absolute():
128
- candidate = (Path.cwd() / variant).resolve()
129
- variant = candidate if candidate.exists() else (repo_root / variant).resolve()
130
- if subfolder:
131
- variant = variant / subfolder
132
-
133
- id2label_override = kwargs.pop("id2label", None)
134
- model_kwargs = dict(kwargs)
135
- inserted: List[str] = []
136
-
137
- def _load_component(folder: str, module_name: str, class_name: str):
138
- comp_dir = variant / folder
139
- module_path = comp_dir / f"{module_name}.py"
140
- has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
141
- if not module_path.exists() or not has_weights:
142
- return None
143
-
144
- comp_path = str(comp_dir)
145
- if comp_path not in sys.path:
146
- sys.path.insert(0, comp_path)
147
- inserted.append(comp_path)
148
-
149
- module = importlib.import_module(module_name)
150
- component_cls = getattr(module, class_name)
151
- return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
152
-
153
- try:
154
- transformer = _load_component("transformer", "nit_transformer_2d", "NiTTransformer2DModel")
155
- try:
156
- scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
157
- except Exception:
158
- scheduler = FlowMatchEulerDiscreteScheduler(
159
- num_train_timesteps=1000,
160
- shift=1.0,
161
- stochastic_sampling=False,
162
- )
163
- if transformer is None:
164
- raise ValueError(f"No loadable transformer found under {variant}")
165
-
166
- vae = None
167
- vae_dir = variant / "vae"
168
- if vae_dir.exists() and (vae_dir / "config.json").exists():
169
- from diffusers import AutoencoderDC, AutoencoderKL
170
-
171
- vae_class_name = json.loads((vae_dir / "config.json").read_text(encoding="utf-8")).get(
172
- "_class_name", "AutoencoderDC"
173
- )
174
- vae_cls = AutoencoderDC if vae_class_name == "AutoencoderDC" else AutoencoderKL
175
- vae = vae_cls.from_pretrained(str(vae_dir), **model_kwargs)
176
-
177
- id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
178
- pipe = cls(
179
- transformer=transformer,
180
- scheduler=scheduler,
181
- vae=vae,
182
- id2label=id2label,
183
- )
184
- if hasattr(pipe, "register_to_config"):
185
- pipe.register_to_config(_name_or_path=str(variant))
186
- return pipe
187
- finally:
188
- for comp_path in inserted:
189
- if comp_path in sys.path:
190
- sys.path.remove(comp_path)
191
-
192
  def _ensure_labels_loaded(self) -> None:
193
  if self._labels_loaded_from_model_index:
194
  return
@@ -339,11 +261,6 @@ class NiTPipeline(DiffusionPipeline):
339
  )
340
  return packed_latents, image_sizes
341
 
342
- @staticmethod
343
- def _flow_time_from_scheduler_timestep(timestep: torch.Tensor, num_train_timesteps: int) -> float:
344
- """Map native scheduler timesteps (sigma * num_train_timesteps) to NiT flow time in [0, 1]."""
345
- return float(timestep) / num_train_timesteps
346
-
347
  def _apply_classifier_free_guidance(
348
  self,
349
  model_output: torch.Tensor,
@@ -407,8 +324,7 @@ class NiTPipeline(DiffusionPipeline):
407
  guidance_scale (`float`, defaults to `1.0`):
408
  Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
409
  guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
410
- Flow-time interval where CFG is applied. Uses continuous flow time
411
- `timestep / num_train_timesteps`, matching the official NiT ODE sampler.
412
  generator (`torch.Generator`, *optional*):
413
  RNG for reproducibility.
414
  output_type (`str`, defaults to `"pil"`):
@@ -421,14 +337,6 @@ class NiTPipeline(DiffusionPipeline):
421
  width = int(width or default_size)
422
  self.check_inputs(height, width, num_inference_steps, output_type)
423
 
424
- if getattr(self.scheduler.config, "stochastic_sampling", False):
425
- raise ValueError(
426
- "NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
427
- "(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
428
- "path uses a different update rule than the official NiT Euler-Maruyama SDE and "
429
- "produces salt-and-pepper noise."
430
- )
431
-
432
  device = self._execution_device
433
  model_dtype = next(self.transformer.parameters()).dtype
434
  class_labels_tensor = self._normalize_class_labels(class_labels)
@@ -440,11 +348,19 @@ class NiTPipeline(DiffusionPipeline):
440
  self.scheduler.set_timesteps(num_inference_steps, device=device)
441
  num_train_timesteps = self.scheduler.config.num_train_timesteps
442
 
 
 
 
 
 
 
 
 
443
  null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
444
  guidance_low, guidance_high = guidance_interval
445
 
446
  for t in self.progress_bar(self.scheduler.timesteps):
447
- flow_time = self._flow_time_from_scheduler_timestep(t, num_train_timesteps)
448
  guidance_active = guidance_low <= flow_time <= guidance_high
449
  if guidance_scale > 1.0 and guidance_active:
450
  model_input = torch.cat([packed_latents, packed_latents], dim=0)
@@ -479,5 +395,4 @@ class NiTPipeline(DiffusionPipeline):
479
  return (image,)
480
  return ImagePipelineOutput(images=image)
481
 
482
-
483
- NiTPipelineOutput = ImagePipelineOutput
 
1
+ """Hub custom pipeline: NiTPipeline.
2
+ Load with native Hugging Face diffusers and trust_remote_code=True.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import inspect
8
+
9
  # Copyright 2026 The HuggingFace Team. All rights reserved.
10
  #
11
  # Licensed under the Apache License, Version 2.0 (the "License");
 
22
 
23
  import json
24
  from pathlib import Path
25
+ from typing import Dict, List, Optional, Tuple, Union, Any
26
 
27
  import torch
28
 
29
  from diffusers.image_processor import VaeImageProcessor
30
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
31
  from diffusers.utils.torch_utils import randn_tensor
32
 
33
+ DEFAULT_NATIVE_RESOLUTION = 512
 
 
34
 
35
  EXAMPLE_DOC_STRING = """
36
  Examples:
37
  ```py
38
  >>> from pathlib import Path
 
39
  >>> from diffusers import DiffusionPipeline
40
+ >>> import torch
41
 
42
+ >>> model_dir = Path("./NiT-XL").resolve()
43
  >>> pipe = DiffusionPipeline.from_pretrained(
44
  ... str(model_dir),
45
  ... local_files_only=True,
 
55
  >>> generator = torch.Generator(device="cuda").manual_seed(42)
56
  >>> image = pipe(
57
  ... class_labels="golden retriever",
58
+ ... height=512,
59
+ ... width=512,
60
  ... num_inference_steps=250,
61
+ ... guidance_scale=2.05,
62
  ... guidance_interval=(0.0, 0.7),
63
  ... generator=generator,
64
  ... ).images[0]
 
65
  ```
66
  """
67
 
 
68
  class NiTPipeline(DiffusionPipeline):
69
  r"""
70
  Pipeline for native-resolution class-conditional image generation with NiT.
71
 
 
 
 
 
 
 
72
  Parameters:
73
  transformer ([`NiTTransformer2DModel`]):
74
  Class-conditional transformer that predicts flow-matching velocity in packed latent space.
75
  scheduler ([`FlowMatchEulerDiscreteScheduler`]):
76
+ Flow-matching Euler scheduler used by NiT.
77
  vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
78
  Variational autoencoder used to decode packed transformer latents to pixels.
79
  id2label (`dict[int, str]`, *optional*):
80
  ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
81
  """
82
 
83
+ @staticmethod
84
+ def prepare_extra_step_kwargs(
85
+ scheduler,
86
+ generator=None,
87
+ eta: float | None = None,
88
+ ):
89
+ kwargs = {}
90
+ step_params = set(inspect.signature(scheduler.step).parameters.keys())
91
+ if "generator" in step_params:
92
+ kwargs["generator"] = generator
93
+ if eta is not None and "eta" in step_params:
94
+ kwargs["eta"] = eta
95
+ return kwargs
96
+
97
  model_cpu_offload_seq = "transformer->vae"
98
  _optional_components = ["vae"]
99
 
 
111
  self.labels = self._build_label2id(self._id2label)
112
  self._labels_loaded_from_model_index = bool(self._id2label)
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def _ensure_labels_loaded(self) -> None:
115
  if self._labels_loaded_from_model_index:
116
  return
 
261
  )
262
  return packed_latents, image_sizes
263
 
 
 
 
 
 
264
  def _apply_classifier_free_guidance(
265
  self,
266
  model_output: torch.Tensor,
 
324
  guidance_scale (`float`, defaults to `1.0`):
325
  Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
326
  guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
327
+ Flow-time interval where CFG is applied.
 
328
  generator (`torch.Generator`, *optional*):
329
  RNG for reproducibility.
330
  output_type (`str`, defaults to `"pil"`):
 
337
  width = int(width or default_size)
338
  self.check_inputs(height, width, num_inference_steps, output_type)
339
 
 
 
 
 
 
 
 
 
340
  device = self._execution_device
341
  model_dtype = next(self.transformer.parameters()).dtype
342
  class_labels_tensor = self._normalize_class_labels(class_labels)
 
348
  self.scheduler.set_timesteps(num_inference_steps, device=device)
349
  num_train_timesteps = self.scheduler.config.num_train_timesteps
350
 
351
+ if getattr(self.scheduler.config, "stochastic_sampling", False):
352
+ raise ValueError(
353
+ "NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
354
+ "(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
355
+ "path uses a different update rule than the official NiT Euler-Maruyama SDE and "
356
+ "produces salt-and-pepper noise."
357
+ )
358
+
359
  null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
360
  guidance_low, guidance_high = guidance_interval
361
 
362
  for t in self.progress_bar(self.scheduler.timesteps):
363
+ flow_time = float(t) / num_train_timesteps
364
  guidance_active = guidance_low <= flow_time <= guidance_high
365
  if guidance_scale > 1.0 and guidance_active:
366
  model_input = torch.cat([packed_latents, packed_latents], dim=0)
 
395
  return (image,)
396
  return ImagePipelineOutput(images=image)
397
 
398
+ NiTPipelineOutput = ImagePipelineOutput
 
NiT-L/pipeline.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  # Copyright 2026 The HuggingFace Team. All rights reserved.
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,27 +22,24 @@
14
 
15
  import json
16
  from pathlib import Path
17
- from typing import Dict, List, Optional, Tuple, Union
18
 
19
  import torch
20
 
21
  from diffusers.image_processor import VaeImageProcessor
22
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
23
- from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
24
  from diffusers.utils.torch_utils import randn_tensor
25
 
26
- # Local component classes are loaded dynamically in from_pretrained.
27
-
28
  DEFAULT_NATIVE_RESOLUTION = 512
29
 
30
  EXAMPLE_DOC_STRING = """
31
  Examples:
32
  ```py
33
  >>> from pathlib import Path
34
- >>> import torch
35
  >>> from diffusers import DiffusionPipeline
 
36
 
37
- >>> model_dir = Path("./NiT-L").resolve()
38
  >>> pipe = DiffusionPipeline.from_pretrained(
39
  ... str(model_dir),
40
  ... local_files_only=True,
@@ -57,32 +62,38 @@ EXAMPLE_DOC_STRING = """
57
  ... guidance_interval=(0.0, 0.7),
58
  ... generator=generator,
59
  ... ).images[0]
60
- >>> image.save("demo.png")
61
  ```
62
  """
63
 
64
-
65
  class NiTPipeline(DiffusionPipeline):
66
  r"""
67
  Pipeline for native-resolution class-conditional image generation with NiT.
68
 
69
- Uses the native [`FlowMatchEulerDiscreteScheduler`] in deterministic (ODE) mode.
70
- The official NiT repo defaults to an Euler-Maruyama SDE sampler for 512×512; that SDE is
71
- not the same as the scheduler's `stochastic_sampling` path, so keep
72
- `scheduler.config.stochastic_sampling=False` and let the scheduler perform the ODE update
73
- `x_{t+dt} = x_t + dt * v`.
74
-
75
  Parameters:
76
  transformer ([`NiTTransformer2DModel`]):
77
  Class-conditional transformer that predicts flow-matching velocity in packed latent space.
78
  scheduler ([`FlowMatchEulerDiscreteScheduler`]):
79
- Native diffusers flow-matching Euler scheduler (`stochastic_sampling=False`).
80
  vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
81
  Variational autoencoder used to decode packed transformer latents to pixels.
82
  id2label (`dict[int, str]`, *optional*):
83
  ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
84
  """
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  model_cpu_offload_seq = "transformer->vae"
87
  _optional_components = ["vae"]
88
 
@@ -100,95 +111,6 @@ class NiTPipeline(DiffusionPipeline):
100
  self.labels = self._build_label2id(self._id2label)
101
  self._labels_loaded_from_model_index = bool(self._id2label)
102
 
103
- @classmethod
104
- def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
105
- """Load a self-contained variant folder locally or from the Hub."""
106
- import importlib
107
- import sys
108
-
109
- repo_root = Path(__file__).resolve().parent
110
-
111
- if pretrained_model_name_or_path in (None, "", "."):
112
- variant = repo_root
113
- elif (
114
- isinstance(pretrained_model_name_or_path, str)
115
- and "/" in pretrained_model_name_or_path
116
- and not Path(pretrained_model_name_or_path).exists()
117
- ):
118
- from huggingface_hub import snapshot_download
119
-
120
- hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
121
- if subfolder:
122
- hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
123
- cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
124
- variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
125
- else:
126
- variant = Path(pretrained_model_name_or_path)
127
- if not variant.is_absolute():
128
- candidate = (Path.cwd() / variant).resolve()
129
- variant = candidate if candidate.exists() else (repo_root / variant).resolve()
130
- if subfolder:
131
- variant = variant / subfolder
132
-
133
- id2label_override = kwargs.pop("id2label", None)
134
- model_kwargs = dict(kwargs)
135
- inserted: List[str] = []
136
-
137
- def _load_component(folder: str, module_name: str, class_name: str):
138
- comp_dir = variant / folder
139
- module_path = comp_dir / f"{module_name}.py"
140
- has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
141
- if not module_path.exists() or not has_weights:
142
- return None
143
-
144
- comp_path = str(comp_dir)
145
- if comp_path not in sys.path:
146
- sys.path.insert(0, comp_path)
147
- inserted.append(comp_path)
148
-
149
- module = importlib.import_module(module_name)
150
- component_cls = getattr(module, class_name)
151
- return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
152
-
153
- try:
154
- transformer = _load_component("transformer", "nit_transformer_2d", "NiTTransformer2DModel")
155
- try:
156
- scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
157
- except Exception:
158
- scheduler = FlowMatchEulerDiscreteScheduler(
159
- num_train_timesteps=1000,
160
- shift=1.0,
161
- stochastic_sampling=False,
162
- )
163
- if transformer is None:
164
- raise ValueError(f"No loadable transformer found under {variant}")
165
-
166
- vae = None
167
- vae_dir = variant / "vae"
168
- if vae_dir.exists() and (vae_dir / "config.json").exists():
169
- from diffusers import AutoencoderDC, AutoencoderKL
170
-
171
- vae_class_name = json.loads((vae_dir / "config.json").read_text(encoding="utf-8")).get(
172
- "_class_name", "AutoencoderDC"
173
- )
174
- vae_cls = AutoencoderDC if vae_class_name == "AutoencoderDC" else AutoencoderKL
175
- vae = vae_cls.from_pretrained(str(vae_dir), **model_kwargs)
176
-
177
- id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
178
- pipe = cls(
179
- transformer=transformer,
180
- scheduler=scheduler,
181
- vae=vae,
182
- id2label=id2label,
183
- )
184
- if hasattr(pipe, "register_to_config"):
185
- pipe.register_to_config(_name_or_path=str(variant))
186
- return pipe
187
- finally:
188
- for comp_path in inserted:
189
- if comp_path in sys.path:
190
- sys.path.remove(comp_path)
191
-
192
  def _ensure_labels_loaded(self) -> None:
193
  if self._labels_loaded_from_model_index:
194
  return
@@ -339,11 +261,6 @@ class NiTPipeline(DiffusionPipeline):
339
  )
340
  return packed_latents, image_sizes
341
 
342
- @staticmethod
343
- def _flow_time_from_scheduler_timestep(timestep: torch.Tensor, num_train_timesteps: int) -> float:
344
- """Map native scheduler timesteps (sigma * num_train_timesteps) to NiT flow time in [0, 1]."""
345
- return float(timestep) / num_train_timesteps
346
-
347
  def _apply_classifier_free_guidance(
348
  self,
349
  model_output: torch.Tensor,
@@ -407,8 +324,7 @@ class NiTPipeline(DiffusionPipeline):
407
  guidance_scale (`float`, defaults to `1.0`):
408
  Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
409
  guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
410
- Flow-time interval where CFG is applied. Uses continuous flow time
411
- `timestep / num_train_timesteps`, matching the official NiT ODE sampler.
412
  generator (`torch.Generator`, *optional*):
413
  RNG for reproducibility.
414
  output_type (`str`, defaults to `"pil"`):
@@ -421,14 +337,6 @@ class NiTPipeline(DiffusionPipeline):
421
  width = int(width or default_size)
422
  self.check_inputs(height, width, num_inference_steps, output_type)
423
 
424
- if getattr(self.scheduler.config, "stochastic_sampling", False):
425
- raise ValueError(
426
- "NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
427
- "(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
428
- "path uses a different update rule than the official NiT Euler-Maruyama SDE and "
429
- "produces salt-and-pepper noise."
430
- )
431
-
432
  device = self._execution_device
433
  model_dtype = next(self.transformer.parameters()).dtype
434
  class_labels_tensor = self._normalize_class_labels(class_labels)
@@ -440,11 +348,19 @@ class NiTPipeline(DiffusionPipeline):
440
  self.scheduler.set_timesteps(num_inference_steps, device=device)
441
  num_train_timesteps = self.scheduler.config.num_train_timesteps
442
 
 
 
 
 
 
 
 
 
443
  null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
444
  guidance_low, guidance_high = guidance_interval
445
 
446
  for t in self.progress_bar(self.scheduler.timesteps):
447
- flow_time = self._flow_time_from_scheduler_timestep(t, num_train_timesteps)
448
  guidance_active = guidance_low <= flow_time <= guidance_high
449
  if guidance_scale > 1.0 and guidance_active:
450
  model_input = torch.cat([packed_latents, packed_latents], dim=0)
@@ -479,5 +395,4 @@ class NiTPipeline(DiffusionPipeline):
479
  return (image,)
480
  return ImagePipelineOutput(images=image)
481
 
482
-
483
- NiTPipelineOutput = ImagePipelineOutput
 
1
+ """Hub custom pipeline: NiTPipeline.
2
+ Load with native Hugging Face diffusers and trust_remote_code=True.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import inspect
8
+
9
  # Copyright 2026 The HuggingFace Team. All rights reserved.
10
  #
11
  # Licensed under the Apache License, Version 2.0 (the "License");
 
22
 
23
  import json
24
  from pathlib import Path
25
+ from typing import Dict, List, Optional, Tuple, Union, Any
26
 
27
  import torch
28
 
29
  from diffusers.image_processor import VaeImageProcessor
30
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
31
  from diffusers.utils.torch_utils import randn_tensor
32
 
 
 
33
  DEFAULT_NATIVE_RESOLUTION = 512
34
 
35
  EXAMPLE_DOC_STRING = """
36
  Examples:
37
  ```py
38
  >>> from pathlib import Path
 
39
  >>> from diffusers import DiffusionPipeline
40
+ >>> import torch
41
 
42
+ >>> model_dir = Path("./NiT-XL").resolve()
43
  >>> pipe = DiffusionPipeline.from_pretrained(
44
  ... str(model_dir),
45
  ... local_files_only=True,
 
62
  ... guidance_interval=(0.0, 0.7),
63
  ... generator=generator,
64
  ... ).images[0]
 
65
  ```
66
  """
67
 
 
68
  class NiTPipeline(DiffusionPipeline):
69
  r"""
70
  Pipeline for native-resolution class-conditional image generation with NiT.
71
 
 
 
 
 
 
 
72
  Parameters:
73
  transformer ([`NiTTransformer2DModel`]):
74
  Class-conditional transformer that predicts flow-matching velocity in packed latent space.
75
  scheduler ([`FlowMatchEulerDiscreteScheduler`]):
76
+ Flow-matching Euler scheduler used by NiT.
77
  vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
78
  Variational autoencoder used to decode packed transformer latents to pixels.
79
  id2label (`dict[int, str]`, *optional*):
80
  ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
81
  """
82
 
83
+ @staticmethod
84
+ def prepare_extra_step_kwargs(
85
+ scheduler,
86
+ generator=None,
87
+ eta: float | None = None,
88
+ ):
89
+ kwargs = {}
90
+ step_params = set(inspect.signature(scheduler.step).parameters.keys())
91
+ if "generator" in step_params:
92
+ kwargs["generator"] = generator
93
+ if eta is not None and "eta" in step_params:
94
+ kwargs["eta"] = eta
95
+ return kwargs
96
+
97
  model_cpu_offload_seq = "transformer->vae"
98
  _optional_components = ["vae"]
99
 
 
111
  self.labels = self._build_label2id(self._id2label)
112
  self._labels_loaded_from_model_index = bool(self._id2label)
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def _ensure_labels_loaded(self) -> None:
115
  if self._labels_loaded_from_model_index:
116
  return
 
261
  )
262
  return packed_latents, image_sizes
263
 
 
 
 
 
 
264
  def _apply_classifier_free_guidance(
265
  self,
266
  model_output: torch.Tensor,
 
324
  guidance_scale (`float`, defaults to `1.0`):
325
  Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
326
  guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
327
+ Flow-time interval where CFG is applied.
 
328
  generator (`torch.Generator`, *optional*):
329
  RNG for reproducibility.
330
  output_type (`str`, defaults to `"pil"`):
 
337
  width = int(width or default_size)
338
  self.check_inputs(height, width, num_inference_steps, output_type)
339
 
 
 
 
 
 
 
 
 
340
  device = self._execution_device
341
  model_dtype = next(self.transformer.parameters()).dtype
342
  class_labels_tensor = self._normalize_class_labels(class_labels)
 
348
  self.scheduler.set_timesteps(num_inference_steps, device=device)
349
  num_train_timesteps = self.scheduler.config.num_train_timesteps
350
 
351
+ if getattr(self.scheduler.config, "stochastic_sampling", False):
352
+ raise ValueError(
353
+ "NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
354
+ "(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
355
+ "path uses a different update rule than the official NiT Euler-Maruyama SDE and "
356
+ "produces salt-and-pepper noise."
357
+ )
358
+
359
  null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
360
  guidance_low, guidance_high = guidance_interval
361
 
362
  for t in self.progress_bar(self.scheduler.timesteps):
363
+ flow_time = float(t) / num_train_timesteps
364
  guidance_active = guidance_low <= flow_time <= guidance_high
365
  if guidance_scale > 1.0 and guidance_active:
366
  model_input = torch.cat([packed_latents, packed_latents], dim=0)
 
395
  return (image,)
396
  return ImagePipelineOutput(images=image)
397
 
398
+ NiTPipelineOutput = ImagePipelineOutput
 
NiT-S/pipeline.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  # Copyright 2026 The HuggingFace Team. All rights reserved.
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,27 +22,24 @@
14
 
15
  import json
16
  from pathlib import Path
17
- from typing import Dict, List, Optional, Tuple, Union
18
 
19
  import torch
20
 
21
  from diffusers.image_processor import VaeImageProcessor
22
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
23
- from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
24
  from diffusers.utils.torch_utils import randn_tensor
25
 
26
- # Local component classes are loaded dynamically in from_pretrained.
27
-
28
- DEFAULT_NATIVE_RESOLUTION = 256
29
 
30
  EXAMPLE_DOC_STRING = """
31
  Examples:
32
  ```py
33
  >>> from pathlib import Path
34
- >>> import torch
35
  >>> from diffusers import DiffusionPipeline
 
36
 
37
- >>> model_dir = Path("./NiT-S").resolve()
38
  >>> pipe = DiffusionPipeline.from_pretrained(
39
  ... str(model_dir),
40
  ... local_files_only=True,
@@ -50,39 +55,45 @@ EXAMPLE_DOC_STRING = """
50
  >>> generator = torch.Generator(device="cuda").manual_seed(42)
51
  >>> image = pipe(
52
  ... class_labels="golden retriever",
53
- ... height=256,
54
- ... width=256,
55
  ... num_inference_steps=250,
56
- ... guidance_scale=2.25,
57
  ... guidance_interval=(0.0, 0.7),
58
  ... generator=generator,
59
  ... ).images[0]
60
- >>> image.save("demo.png")
61
  ```
62
  """
63
 
64
-
65
  class NiTPipeline(DiffusionPipeline):
66
  r"""
67
  Pipeline for native-resolution class-conditional image generation with NiT.
68
 
69
- Uses the native [`FlowMatchEulerDiscreteScheduler`] in deterministic (ODE) mode.
70
- The official NiT repo defaults to an Euler-Maruyama SDE sampler for 512×512; that SDE is
71
- not the same as the scheduler's `stochastic_sampling` path, so keep
72
- `scheduler.config.stochastic_sampling=False` and let the scheduler perform the ODE update
73
- `x_{t+dt} = x_t + dt * v`.
74
-
75
  Parameters:
76
  transformer ([`NiTTransformer2DModel`]):
77
  Class-conditional transformer that predicts flow-matching velocity in packed latent space.
78
  scheduler ([`FlowMatchEulerDiscreteScheduler`]):
79
- Native diffusers flow-matching Euler scheduler (`stochastic_sampling=False`).
80
  vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
81
  Variational autoencoder used to decode packed transformer latents to pixels.
82
  id2label (`dict[int, str]`, *optional*):
83
  ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
84
  """
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  model_cpu_offload_seq = "transformer->vae"
87
  _optional_components = ["vae"]
88
 
@@ -100,95 +111,6 @@ class NiTPipeline(DiffusionPipeline):
100
  self.labels = self._build_label2id(self._id2label)
101
  self._labels_loaded_from_model_index = bool(self._id2label)
102
 
103
- @classmethod
104
- def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
105
- """Load a self-contained variant folder locally or from the Hub."""
106
- import importlib
107
- import sys
108
-
109
- repo_root = Path(__file__).resolve().parent
110
-
111
- if pretrained_model_name_or_path in (None, "", "."):
112
- variant = repo_root
113
- elif (
114
- isinstance(pretrained_model_name_or_path, str)
115
- and "/" in pretrained_model_name_or_path
116
- and not Path(pretrained_model_name_or_path).exists()
117
- ):
118
- from huggingface_hub import snapshot_download
119
-
120
- hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
121
- if subfolder:
122
- hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
123
- cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
124
- variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
125
- else:
126
- variant = Path(pretrained_model_name_or_path)
127
- if not variant.is_absolute():
128
- candidate = (Path.cwd() / variant).resolve()
129
- variant = candidate if candidate.exists() else (repo_root / variant).resolve()
130
- if subfolder:
131
- variant = variant / subfolder
132
-
133
- id2label_override = kwargs.pop("id2label", None)
134
- model_kwargs = dict(kwargs)
135
- inserted: List[str] = []
136
-
137
- def _load_component(folder: str, module_name: str, class_name: str):
138
- comp_dir = variant / folder
139
- module_path = comp_dir / f"{module_name}.py"
140
- has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
141
- if not module_path.exists() or not has_weights:
142
- return None
143
-
144
- comp_path = str(comp_dir)
145
- if comp_path not in sys.path:
146
- sys.path.insert(0, comp_path)
147
- inserted.append(comp_path)
148
-
149
- module = importlib.import_module(module_name)
150
- component_cls = getattr(module, class_name)
151
- return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
152
-
153
- try:
154
- transformer = _load_component("transformer", "nit_transformer_2d", "NiTTransformer2DModel")
155
- try:
156
- scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
157
- except Exception:
158
- scheduler = FlowMatchEulerDiscreteScheduler(
159
- num_train_timesteps=1000,
160
- shift=1.0,
161
- stochastic_sampling=False,
162
- )
163
- if transformer is None:
164
- raise ValueError(f"No loadable transformer found under {variant}")
165
-
166
- vae = None
167
- vae_dir = variant / "vae"
168
- if vae_dir.exists() and (vae_dir / "config.json").exists():
169
- from diffusers import AutoencoderDC, AutoencoderKL
170
-
171
- vae_class_name = json.loads((vae_dir / "config.json").read_text(encoding="utf-8")).get(
172
- "_class_name", "AutoencoderDC"
173
- )
174
- vae_cls = AutoencoderDC if vae_class_name == "AutoencoderDC" else AutoencoderKL
175
- vae = vae_cls.from_pretrained(str(vae_dir), **model_kwargs)
176
-
177
- id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
178
- pipe = cls(
179
- transformer=transformer,
180
- scheduler=scheduler,
181
- vae=vae,
182
- id2label=id2label,
183
- )
184
- if hasattr(pipe, "register_to_config"):
185
- pipe.register_to_config(_name_or_path=str(variant))
186
- return pipe
187
- finally:
188
- for comp_path in inserted:
189
- if comp_path in sys.path:
190
- sys.path.remove(comp_path)
191
-
192
  def _ensure_labels_loaded(self) -> None:
193
  if self._labels_loaded_from_model_index:
194
  return
@@ -339,11 +261,6 @@ class NiTPipeline(DiffusionPipeline):
339
  )
340
  return packed_latents, image_sizes
341
 
342
- @staticmethod
343
- def _flow_time_from_scheduler_timestep(timestep: torch.Tensor, num_train_timesteps: int) -> float:
344
- """Map native scheduler timesteps (sigma * num_train_timesteps) to NiT flow time in [0, 1]."""
345
- return float(timestep) / num_train_timesteps
346
-
347
  def _apply_classifier_free_guidance(
348
  self,
349
  model_output: torch.Tensor,
@@ -407,8 +324,7 @@ class NiTPipeline(DiffusionPipeline):
407
  guidance_scale (`float`, defaults to `1.0`):
408
  Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
409
  guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
410
- Flow-time interval where CFG is applied. Uses continuous flow time
411
- `timestep / num_train_timesteps`, matching the official NiT ODE sampler.
412
  generator (`torch.Generator`, *optional*):
413
  RNG for reproducibility.
414
  output_type (`str`, defaults to `"pil"`):
@@ -421,14 +337,6 @@ class NiTPipeline(DiffusionPipeline):
421
  width = int(width or default_size)
422
  self.check_inputs(height, width, num_inference_steps, output_type)
423
 
424
- if getattr(self.scheduler.config, "stochastic_sampling", False):
425
- raise ValueError(
426
- "NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
427
- "(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
428
- "path uses a different update rule than the official NiT Euler-Maruyama SDE and "
429
- "produces salt-and-pepper noise."
430
- )
431
-
432
  device = self._execution_device
433
  model_dtype = next(self.transformer.parameters()).dtype
434
  class_labels_tensor = self._normalize_class_labels(class_labels)
@@ -440,11 +348,19 @@ class NiTPipeline(DiffusionPipeline):
440
  self.scheduler.set_timesteps(num_inference_steps, device=device)
441
  num_train_timesteps = self.scheduler.config.num_train_timesteps
442
 
 
 
 
 
 
 
 
 
443
  null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
444
  guidance_low, guidance_high = guidance_interval
445
 
446
  for t in self.progress_bar(self.scheduler.timesteps):
447
- flow_time = self._flow_time_from_scheduler_timestep(t, num_train_timesteps)
448
  guidance_active = guidance_low <= flow_time <= guidance_high
449
  if guidance_scale > 1.0 and guidance_active:
450
  model_input = torch.cat([packed_latents, packed_latents], dim=0)
@@ -479,5 +395,4 @@ class NiTPipeline(DiffusionPipeline):
479
  return (image,)
480
  return ImagePipelineOutput(images=image)
481
 
482
-
483
- NiTPipelineOutput = ImagePipelineOutput
 
1
+ """Hub custom pipeline: NiTPipeline.
2
+ Load with native Hugging Face diffusers and trust_remote_code=True.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import inspect
8
+
9
  # Copyright 2026 The HuggingFace Team. All rights reserved.
10
  #
11
  # Licensed under the Apache License, Version 2.0 (the "License");
 
22
 
23
  import json
24
  from pathlib import Path
25
+ from typing import Dict, List, Optional, Tuple, Union, Any
26
 
27
  import torch
28
 
29
  from diffusers.image_processor import VaeImageProcessor
30
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
31
  from diffusers.utils.torch_utils import randn_tensor
32
 
33
+ DEFAULT_NATIVE_RESOLUTION = 512
 
 
34
 
35
  EXAMPLE_DOC_STRING = """
36
  Examples:
37
  ```py
38
  >>> from pathlib import Path
 
39
  >>> from diffusers import DiffusionPipeline
40
+ >>> import torch
41
 
42
+ >>> model_dir = Path("./NiT-XL").resolve()
43
  >>> pipe = DiffusionPipeline.from_pretrained(
44
  ... str(model_dir),
45
  ... local_files_only=True,
 
55
  >>> generator = torch.Generator(device="cuda").manual_seed(42)
56
  >>> image = pipe(
57
  ... class_labels="golden retriever",
58
+ ... height=512,
59
+ ... width=512,
60
  ... num_inference_steps=250,
61
+ ... guidance_scale=2.05,
62
  ... guidance_interval=(0.0, 0.7),
63
  ... generator=generator,
64
  ... ).images[0]
 
65
  ```
66
  """
67
 
 
68
  class NiTPipeline(DiffusionPipeline):
69
  r"""
70
  Pipeline for native-resolution class-conditional image generation with NiT.
71
 
 
 
 
 
 
 
72
  Parameters:
73
  transformer ([`NiTTransformer2DModel`]):
74
  Class-conditional transformer that predicts flow-matching velocity in packed latent space.
75
  scheduler ([`FlowMatchEulerDiscreteScheduler`]):
76
+ Flow-matching Euler scheduler used by NiT.
77
  vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
78
  Variational autoencoder used to decode packed transformer latents to pixels.
79
  id2label (`dict[int, str]`, *optional*):
80
  ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
81
  """
82
 
83
+ @staticmethod
84
+ def prepare_extra_step_kwargs(
85
+ scheduler,
86
+ generator=None,
87
+ eta: float | None = None,
88
+ ):
89
+ kwargs = {}
90
+ step_params = set(inspect.signature(scheduler.step).parameters.keys())
91
+ if "generator" in step_params:
92
+ kwargs["generator"] = generator
93
+ if eta is not None and "eta" in step_params:
94
+ kwargs["eta"] = eta
95
+ return kwargs
96
+
97
  model_cpu_offload_seq = "transformer->vae"
98
  _optional_components = ["vae"]
99
 
 
111
  self.labels = self._build_label2id(self._id2label)
112
  self._labels_loaded_from_model_index = bool(self._id2label)
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def _ensure_labels_loaded(self) -> None:
115
  if self._labels_loaded_from_model_index:
116
  return
 
261
  )
262
  return packed_latents, image_sizes
263
 
 
 
 
 
 
264
  def _apply_classifier_free_guidance(
265
  self,
266
  model_output: torch.Tensor,
 
324
  guidance_scale (`float`, defaults to `1.0`):
325
  Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
326
  guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
327
+ Flow-time interval where CFG is applied.
 
328
  generator (`torch.Generator`, *optional*):
329
  RNG for reproducibility.
330
  output_type (`str`, defaults to `"pil"`):
 
337
  width = int(width or default_size)
338
  self.check_inputs(height, width, num_inference_steps, output_type)
339
 
 
 
 
 
 
 
 
 
340
  device = self._execution_device
341
  model_dtype = next(self.transformer.parameters()).dtype
342
  class_labels_tensor = self._normalize_class_labels(class_labels)
 
348
  self.scheduler.set_timesteps(num_inference_steps, device=device)
349
  num_train_timesteps = self.scheduler.config.num_train_timesteps
350
 
351
+ if getattr(self.scheduler.config, "stochastic_sampling", False):
352
+ raise ValueError(
353
+ "NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
354
+ "(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
355
+ "path uses a different update rule than the official NiT Euler-Maruyama SDE and "
356
+ "produces salt-and-pepper noise."
357
+ )
358
+
359
  null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
360
  guidance_low, guidance_high = guidance_interval
361
 
362
  for t in self.progress_bar(self.scheduler.timesteps):
363
+ flow_time = float(t) / num_train_timesteps
364
  guidance_active = guidance_low <= flow_time <= guidance_high
365
  if guidance_scale > 1.0 and guidance_active:
366
  model_input = torch.cat([packed_latents, packed_latents], dim=0)
 
395
  return (image,)
396
  return ImagePipelineOutput(images=image)
397
 
398
+ NiTPipelineOutput = ImagePipelineOutput
 
NiT-XL/pipeline.py CHANGED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  # Copyright 2026 The HuggingFace Team. All rights reserved.
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,25 +22,22 @@
14
 
15
  import json
16
  from pathlib import Path
17
- from typing import Dict, List, Optional, Tuple, Union
18
 
19
  import torch
20
 
21
  from diffusers.image_processor import VaeImageProcessor
22
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
23
- from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
24
  from diffusers.utils.torch_utils import randn_tensor
25
 
26
- # Local component classes are loaded dynamically in from_pretrained.
27
-
28
  DEFAULT_NATIVE_RESOLUTION = 512
29
 
30
  EXAMPLE_DOC_STRING = """
31
  Examples:
32
  ```py
33
  >>> from pathlib import Path
34
- >>> import torch
35
  >>> from diffusers import DiffusionPipeline
 
36
 
37
  >>> model_dir = Path("./NiT-XL").resolve()
38
  >>> pipe = DiffusionPipeline.from_pretrained(
@@ -57,32 +62,38 @@ EXAMPLE_DOC_STRING = """
57
  ... guidance_interval=(0.0, 0.7),
58
  ... generator=generator,
59
  ... ).images[0]
60
- >>> image.save("demo.png")
61
  ```
62
  """
63
 
64
-
65
  class NiTPipeline(DiffusionPipeline):
66
  r"""
67
  Pipeline for native-resolution class-conditional image generation with NiT.
68
 
69
- Uses the native [`FlowMatchEulerDiscreteScheduler`] in deterministic (ODE) mode.
70
- The official NiT repo defaults to an Euler-Maruyama SDE sampler for 512×512; that SDE is
71
- not the same as the scheduler's `stochastic_sampling` path, so keep
72
- `scheduler.config.stochastic_sampling=False` and let the scheduler perform the ODE update
73
- `x_{t+dt} = x_t + dt * v`.
74
-
75
  Parameters:
76
  transformer ([`NiTTransformer2DModel`]):
77
  Class-conditional transformer that predicts flow-matching velocity in packed latent space.
78
  scheduler ([`FlowMatchEulerDiscreteScheduler`]):
79
- Native diffusers flow-matching Euler scheduler (`stochastic_sampling=False`).
80
  vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
81
  Variational autoencoder used to decode packed transformer latents to pixels.
82
  id2label (`dict[int, str]`, *optional*):
83
  ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
84
  """
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  model_cpu_offload_seq = "transformer->vae"
87
  _optional_components = ["vae"]
88
 
@@ -100,95 +111,6 @@ class NiTPipeline(DiffusionPipeline):
100
  self.labels = self._build_label2id(self._id2label)
101
  self._labels_loaded_from_model_index = bool(self._id2label)
102
 
103
- @classmethod
104
- def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
105
- """Load a self-contained variant folder locally or from the Hub."""
106
- import importlib
107
- import sys
108
-
109
- repo_root = Path(__file__).resolve().parent
110
-
111
- if pretrained_model_name_or_path in (None, "", "."):
112
- variant = repo_root
113
- elif (
114
- isinstance(pretrained_model_name_or_path, str)
115
- and "/" in pretrained_model_name_or_path
116
- and not Path(pretrained_model_name_or_path).exists()
117
- ):
118
- from huggingface_hub import snapshot_download
119
-
120
- hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
121
- if subfolder:
122
- hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
123
- cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
124
- variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
125
- else:
126
- variant = Path(pretrained_model_name_or_path)
127
- if not variant.is_absolute():
128
- candidate = (Path.cwd() / variant).resolve()
129
- variant = candidate if candidate.exists() else (repo_root / variant).resolve()
130
- if subfolder:
131
- variant = variant / subfolder
132
-
133
- id2label_override = kwargs.pop("id2label", None)
134
- model_kwargs = dict(kwargs)
135
- inserted: List[str] = []
136
-
137
- def _load_component(folder: str, module_name: str, class_name: str):
138
- comp_dir = variant / folder
139
- module_path = comp_dir / f"{module_name}.py"
140
- has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
141
- if not module_path.exists() or not has_weights:
142
- return None
143
-
144
- comp_path = str(comp_dir)
145
- if comp_path not in sys.path:
146
- sys.path.insert(0, comp_path)
147
- inserted.append(comp_path)
148
-
149
- module = importlib.import_module(module_name)
150
- component_cls = getattr(module, class_name)
151
- return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
152
-
153
- try:
154
- transformer = _load_component("transformer", "nit_transformer_2d", "NiTTransformer2DModel")
155
- try:
156
- scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(str(variant), subfolder="scheduler")
157
- except Exception:
158
- scheduler = FlowMatchEulerDiscreteScheduler(
159
- num_train_timesteps=1000,
160
- shift=1.0,
161
- stochastic_sampling=False,
162
- )
163
- if transformer is None:
164
- raise ValueError(f"No loadable transformer found under {variant}")
165
-
166
- vae = None
167
- vae_dir = variant / "vae"
168
- if vae_dir.exists() and (vae_dir / "config.json").exists():
169
- from diffusers import AutoencoderDC, AutoencoderKL
170
-
171
- vae_class_name = json.loads((vae_dir / "config.json").read_text(encoding="utf-8")).get(
172
- "_class_name", "AutoencoderDC"
173
- )
174
- vae_cls = AutoencoderDC if vae_class_name == "AutoencoderDC" else AutoencoderKL
175
- vae = vae_cls.from_pretrained(str(vae_dir), **model_kwargs)
176
-
177
- id2label = id2label_override or cls._read_id2label_from_model_index(str(variant))
178
- pipe = cls(
179
- transformer=transformer,
180
- scheduler=scheduler,
181
- vae=vae,
182
- id2label=id2label,
183
- )
184
- if hasattr(pipe, "register_to_config"):
185
- pipe.register_to_config(_name_or_path=str(variant))
186
- return pipe
187
- finally:
188
- for comp_path in inserted:
189
- if comp_path in sys.path:
190
- sys.path.remove(comp_path)
191
-
192
  def _ensure_labels_loaded(self) -> None:
193
  if self._labels_loaded_from_model_index:
194
  return
@@ -339,11 +261,6 @@ class NiTPipeline(DiffusionPipeline):
339
  )
340
  return packed_latents, image_sizes
341
 
342
- @staticmethod
343
- def _flow_time_from_scheduler_timestep(timestep: torch.Tensor, num_train_timesteps: int) -> float:
344
- """Map native scheduler timesteps (sigma * num_train_timesteps) to NiT flow time in [0, 1]."""
345
- return float(timestep) / num_train_timesteps
346
-
347
  def _apply_classifier_free_guidance(
348
  self,
349
  model_output: torch.Tensor,
@@ -407,8 +324,7 @@ class NiTPipeline(DiffusionPipeline):
407
  guidance_scale (`float`, defaults to `1.0`):
408
  Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
409
  guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
410
- Flow-time interval where CFG is applied. Uses continuous flow time
411
- `timestep / num_train_timesteps`, matching the official NiT ODE sampler.
412
  generator (`torch.Generator`, *optional*):
413
  RNG for reproducibility.
414
  output_type (`str`, defaults to `"pil"`):
@@ -421,14 +337,6 @@ class NiTPipeline(DiffusionPipeline):
421
  width = int(width or default_size)
422
  self.check_inputs(height, width, num_inference_steps, output_type)
423
 
424
- if getattr(self.scheduler.config, "stochastic_sampling", False):
425
- raise ValueError(
426
- "NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
427
- "(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
428
- "path uses a different update rule than the official NiT Euler-Maruyama SDE and "
429
- "produces salt-and-pepper noise."
430
- )
431
-
432
  device = self._execution_device
433
  model_dtype = next(self.transformer.parameters()).dtype
434
  class_labels_tensor = self._normalize_class_labels(class_labels)
@@ -440,11 +348,19 @@ class NiTPipeline(DiffusionPipeline):
440
  self.scheduler.set_timesteps(num_inference_steps, device=device)
441
  num_train_timesteps = self.scheduler.config.num_train_timesteps
442
 
 
 
 
 
 
 
 
 
443
  null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
444
  guidance_low, guidance_high = guidance_interval
445
 
446
  for t in self.progress_bar(self.scheduler.timesteps):
447
- flow_time = self._flow_time_from_scheduler_timestep(t, num_train_timesteps)
448
  guidance_active = guidance_low <= flow_time <= guidance_high
449
  if guidance_scale > 1.0 and guidance_active:
450
  model_input = torch.cat([packed_latents, packed_latents], dim=0)
@@ -479,5 +395,4 @@ class NiTPipeline(DiffusionPipeline):
479
  return (image,)
480
  return ImagePipelineOutput(images=image)
481
 
482
-
483
- NiTPipelineOutput = ImagePipelineOutput
 
1
+ """Hub custom pipeline: NiTPipeline.
2
+ Load with native Hugging Face diffusers and trust_remote_code=True.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import inspect
8
+
9
  # Copyright 2026 The HuggingFace Team. All rights reserved.
10
  #
11
  # Licensed under the Apache License, Version 2.0 (the "License");
 
22
 
23
  import json
24
  from pathlib import Path
25
+ from typing import Dict, List, Optional, Tuple, Union, Any
26
 
27
  import torch
28
 
29
  from diffusers.image_processor import VaeImageProcessor
30
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
31
  from diffusers.utils.torch_utils import randn_tensor
32
 
 
 
33
  DEFAULT_NATIVE_RESOLUTION = 512
34
 
35
  EXAMPLE_DOC_STRING = """
36
  Examples:
37
  ```py
38
  >>> from pathlib import Path
 
39
  >>> from diffusers import DiffusionPipeline
40
+ >>> import torch
41
 
42
  >>> model_dir = Path("./NiT-XL").resolve()
43
  >>> pipe = DiffusionPipeline.from_pretrained(
 
62
  ... guidance_interval=(0.0, 0.7),
63
  ... generator=generator,
64
  ... ).images[0]
 
65
  ```
66
  """
67
 
 
68
  class NiTPipeline(DiffusionPipeline):
69
  r"""
70
  Pipeline for native-resolution class-conditional image generation with NiT.
71
 
 
 
 
 
 
 
72
  Parameters:
73
  transformer ([`NiTTransformer2DModel`]):
74
  Class-conditional transformer that predicts flow-matching velocity in packed latent space.
75
  scheduler ([`FlowMatchEulerDiscreteScheduler`]):
76
+ Flow-matching Euler scheduler used by NiT.
77
  vae ([`AutoencoderDC`] or [`AutoencoderKL`], *optional*):
78
  Variational autoencoder used to decode packed transformer latents to pixels.
79
  id2label (`dict[int, str]`, *optional*):
80
  ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
81
  """
82
 
83
+ @staticmethod
84
+ def prepare_extra_step_kwargs(
85
+ scheduler,
86
+ generator=None,
87
+ eta: float | None = None,
88
+ ):
89
+ kwargs = {}
90
+ step_params = set(inspect.signature(scheduler.step).parameters.keys())
91
+ if "generator" in step_params:
92
+ kwargs["generator"] = generator
93
+ if eta is not None and "eta" in step_params:
94
+ kwargs["eta"] = eta
95
+ return kwargs
96
+
97
  model_cpu_offload_seq = "transformer->vae"
98
  _optional_components = ["vae"]
99
 
 
111
  self.labels = self._build_label2id(self._id2label)
112
  self._labels_loaded_from_model_index = bool(self._id2label)
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def _ensure_labels_loaded(self) -> None:
115
  if self._labels_loaded_from_model_index:
116
  return
 
261
  )
262
  return packed_latents, image_sizes
263
 
 
 
 
 
 
264
  def _apply_classifier_free_guidance(
265
  self,
266
  model_output: torch.Tensor,
 
324
  guidance_scale (`float`, defaults to `1.0`):
325
  Classifier-free guidance scale. CFG is active when `guidance_scale > 1.0`.
326
  guidance_interval (`tuple[float, float]`, defaults to `(0.0, 1.0)`):
327
+ Flow-time interval where CFG is applied.
 
328
  generator (`torch.Generator`, *optional*):
329
  RNG for reproducibility.
330
  output_type (`str`, defaults to `"pil"`):
 
337
  width = int(width or default_size)
338
  self.check_inputs(height, width, num_inference_steps, output_type)
339
 
 
 
 
 
 
 
 
 
340
  device = self._execution_device
341
  model_dtype = next(self.transformer.parameters()).dtype
342
  class_labels_tensor = self._normalize_class_labels(class_labels)
 
348
  self.scheduler.set_timesteps(num_inference_steps, device=device)
349
  num_train_timesteps = self.scheduler.config.num_train_timesteps
350
 
351
+ if getattr(self.scheduler.config, "stochastic_sampling", False):
352
+ raise ValueError(
353
+ "NiT expects deterministic FlowMatchEulerDiscreteScheduler stepping "
354
+ "(scheduler.config.stochastic_sampling=False). The scheduler's stochastic_sampling "
355
+ "path uses a different update rule than the official NiT Euler-Maruyama SDE and "
356
+ "produces salt-and-pepper noise."
357
+ )
358
+
359
  null_labels = torch.full_like(class_labels_tensor, self.transformer.config.num_classes)
360
  guidance_low, guidance_high = guidance_interval
361
 
362
  for t in self.progress_bar(self.scheduler.timesteps):
363
+ flow_time = float(t) / num_train_timesteps
364
  guidance_active = guidance_low <= flow_time <= guidance_high
365
  if guidance_scale > 1.0 and guidance_active:
366
  model_input = torch.cat([packed_latents, packed_latents], dim=0)
 
395
  return (image,)
396
  return ImagePipelineOutput(images=image)
397
 
398
+ NiTPipelineOutput = ImagePipelineOutput