Delete custom_pipeline

Browse files

Files changed (8) hide show

custom_pipeline/__init__.py +0 -30
custom_pipeline/__pycache__/__init__.cpython-312.pyc +0 -0
custom_pipeline/__pycache__/pipeline_nit.cpython-312.pyc +0 -0
custom_pipeline/__pycache__/scheduling_flow_match_nit.cpython-312.pyc +0 -0
custom_pipeline/__pycache__/transformer_nit.cpython-312.pyc +0 -0
custom_pipeline/pipeline_nit.py +0 -237
custom_pipeline/scheduling_flow_match_nit.py +0 -187
custom_pipeline/transformer_nit.py +0 -471

custom_pipeline/__init__.py DELETED Viewed

@@ -1,30 +0,0 @@
-from .pipeline_nit import NiTPipeline, NiTPipelineOutput
-from .transformer_nit import NiTTransformer2DModel, NiTTransformer2DModelOutput
-from .scheduling_flow_match_nit import NiTFlowMatchScheduler, NiTFlowMatchSchedulerOutput
-def _register_with_diffusers():
-    """
-    Make the NiT classes reachable as attributes of the `diffusers` module so that
-    `from_pretrained()` can resolve the entries declared in model_index.json.
-    Returns silently when `diffusers` cannot be imported.
-    """
-    try:
-        import diffusers as diffusers_module
-    except Exception:
-        # Best effort: nothing to patch when diffusers is unavailable.
-        return
-    # Each class is registered under its own class name.
-    for nit_class in (NiTPipeline, NiTTransformer2DModel, NiTFlowMatchScheduler):
-        setattr(diffusers_module, nit_class.__name__, nit_class)
-# Runs at import time: importing this package is what patches the diffusers namespace.
-_register_with_diffusers()
-# Names re-exported from the three submodules imported above.
-__all__ = [
-    "NiTPipeline",
-    "NiTPipelineOutput",
-    "NiTTransformer2DModel",
-    "NiTTransformer2DModelOutput",
-    "NiTFlowMatchScheduler",
-    "NiTFlowMatchSchedulerOutput",
-]

custom_pipeline/__pycache__/__init__.cpython-312.pyc DELETED Viewed

Binary file (1.11 kB)

custom_pipeline/__pycache__/pipeline_nit.cpython-312.pyc DELETED Viewed

Binary file (12.3 kB)

custom_pipeline/__pycache__/scheduling_flow_match_nit.cpython-312.pyc DELETED Viewed

Binary file (11.3 kB)

custom_pipeline/__pycache__/transformer_nit.cpython-312.pyc DELETED Viewed

Binary file (31.4 kB)

custom_pipeline/pipeline_nit.py DELETED Viewed

@@ -1,237 +0,0 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
-import torch
-try:
-    from diffusers.image_processor import VaeImageProcessor
-    from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-    from diffusers.utils import BaseOutput
-except Exception:  # pragma: no cover - importable without a full diffusers install.
-    # Minimal stand-ins covering only what this module uses from diffusers.
-    class BaseOutput(dict):
-        """Dict-backed output; copies the dataclass fields into the dict after init."""
-        def __post_init__(self):
-            self.update(self.__dict__)
-    class DiffusionPipeline:
-        """Stub pipeline base: stores modules as attributes and reports the CPU device."""
-        def register_modules(self, **kwargs):
-            for name, module in kwargs.items():
-                setattr(self, name, module)
-        @property
-        def _execution_device(self):
-            return torch.device("cpu")
-        def maybe_free_model_hooks(self):
-            pass
-    class VaeImageProcessor:
-        """Stub image processor that returns the image unchanged."""
-        def postprocess(self, image, output_type="pil", do_denormalize=None):
-            # `do_denormalize` is accepted and ignored: NiTPipeline.__call__ passes it as a
-            # keyword, and the stub previously raised TypeError on that call.
-            return image
-@dataclass
-class NiTPipelineOutput(BaseOutput):
-    """Return value of `NiTPipeline.__call__` when `return_dict=True`."""
-    # Post-processed images, or the raw unpacked latents when the pipeline has no VAE.
-    images: Union[torch.FloatTensor, List]
-class NiTPipeline(DiffusionPipeline):
-    r"""
-    Native-resolution Image Synthesis pipeline using a class-conditional NiT transformer.
-    This pipeline follows Diffusers conventions: transformer, scheduler, and VAE are
-    saved as separate subfolders and restored with `DiffusionPipeline.from_pretrained`.
-    The transformer predicts flow-matching velocity in latent space.
-    """
-    model_cpu_offload_seq = "transformer->vae"
-    _optional_components = ["vae"]
-    def __init__(self, transformer, scheduler, vae=None):
-        """Register the three components; `vae` may be None (it is listed in `_optional_components`)."""
-        super().__init__()
-        self.register_modules(transformer=transformer, scheduler=scheduler, vae=vae)
-        # Used in `__call__` to convert decoded images to the requested `output_type`.
-        self.image_processor = VaeImageProcessor()
-    def _prepare_latents(
-        self,
-        batch_size: int,
-        height: int,
-        width: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
-    ) -> Tuple[torch.Tensor, torch.LongTensor]:
-        """
-        Sample the starting noise in packed-token layout and build the token-grid sizes.
-        Returns `(packed_latents, image_sizes)`. `packed_latents` has shape
-        `(batch_size * token_height * token_width, in_channels, patch_size, patch_size)`;
-        `image_sizes` is a `(batch_size, 2)` long tensor of `[token_height, token_width]` rows.
-        Raises `ValueError` when `height`/`width` are not divisible by the VAE downsample
-        factor, or the latent size is not divisible by the transformer's patch size.
-        """
-        vae = self.vae
-        if vae is None:
-            downsample = 1
-        elif vae.__class__.__name__ == "AutoencoderDC" or "dc-ae" in getattr(vae.config, "_name_or_path", ""):
-            downsample = 32
-        else:
-            # Factor is 2 ** (number of blocks - 1); the four-entry default yields 8.
-            block_channels = getattr(vae.config, "block_out_channels", [0, 0, 0, 0])
-            downsample = 2 ** (len(block_channels) - 1)
-        if height % downsample != 0 or width % downsample != 0:
-            raise ValueError(f"height and width must be divisible by the VAE downsample factor {downsample}.")
-        latent_height, latent_width = height // downsample, width // downsample
-        patch_size = int(self.transformer.config.patch_size)
-        if latent_height % patch_size != 0 or latent_width % patch_size != 0:
-            raise ValueError("Latent height and width must be divisible by transformer's patch_size.")
-        token_height, token_width = latent_height // patch_size, latent_width // patch_size
-        image_sizes = torch.tensor([[token_height, token_width]] * batch_size, device=device, dtype=torch.long)
-        # Noise is drawn per token rather than per image, matching the native NiT sampler.
-        total_tokens = batch_size * token_height * token_width
-        noise_shape = (total_tokens, self.transformer.config.in_channels, patch_size, patch_size)
-        packed_latents = torch.randn(noise_shape, generator=generator, device=device, dtype=dtype)
-        return packed_latents, image_sizes
-    def _apply_classifier_free_guidance(
-        self,
-        model_output: torch.Tensor,
-        guidance_scale: float,
-        guidance_active: bool,
-    ) -> torch.Tensor:
-        """
-        Combine the conditional and unconditional halves of `model_output`.
-        The input is returned untouched when guidance is inactive or `guidance_scale <= 1.0`;
-        otherwise it is split in two along dim 0 (conditional first) and blended.
-        """
-        skip_guidance = (not guidance_active) or guidance_scale <= 1.0
-        if skip_guidance:
-            return model_output
-        conditional, unconditional = model_output.chunk(2)
-        return unconditional + guidance_scale * (conditional - unconditional)
-    def _get_vae_dtype(self, latents: torch.Tensor) -> torch.dtype:
-        """
-        Pick the dtype to decode in: the VAE's `dtype` attribute if set, else the dtype of
-        its first parameter, else the dtype of `latents`.
-        """
-        declared = getattr(self.vae, "dtype", None)
-        if declared is None:
-            first_param = next(self.vae.parameters(), None)
-            return latents.dtype if first_param is None else first_param.dtype
-        return declared
-    def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
-        """
-        Decode latents with the VAE, or return them unchanged when there is no VAE.
-        Latents are cast to the VAE dtype and divided by `vae.config.scaling_factor`
-        (1.0 when absent) before decoding.
-        """
-        vae = self.vae
-        if vae is None:
-            return latents
-        target_dtype = self._get_vae_dtype(latents)
-        scaled = latents.to(dtype=target_dtype) / getattr(vae.config, "scaling_factor", 1.0)
-        if vae.__class__.__name__ == "AutoencoderDC":
-            # This VAE class is decoded through its private `_decode` entry point.
-            return vae._decode(scaled)
-        decoded = vae.decode(scaled)
-        # `decode` may return either an output object carrying `.sample` or the tensor itself.
-        return decoded.sample if hasattr(decoded, "sample") else decoded
-    @torch.no_grad()
-    def __call__(
-        self,
-        class_labels: Union[int, List[int], torch.LongTensor],
-        height: int = 256,
-        width: int = 256,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 1.0,
-        guidance_interval: Tuple[float, float] = (0.0, 1.0),
-        mode: str = "ode",
-        heun: bool = False,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-    ) -> Union[NiTPipelineOutput, Tuple]:
-        """
-        Generate one image per class label.
-        Args:
-            class_labels: A single label, a list of labels, or a label tensor.
-            height, width: Requested size, validated in `_prepare_latents`.
-            num_inference_steps: Forwarded to `scheduler.set_timesteps`.
-            guidance_scale: Classifier-free guidance weight; guidance only runs when > 1.0.
-            guidance_interval: Inclusive `(low, high)` timestep range in which guidance is applied.
-            mode: Sampler mode forwarded to `scheduler.set_timesteps`.
-            heun: Use the two-evaluation Heun update; only takes effect when `mode == "ode"`
-                and is skipped on the final step.
-            generator: Passed to the initial noise draw and to `scheduler.step`.
-            output_type: Forwarded to `image_processor.postprocess` when a VAE is present.
-            return_dict: Return `NiTPipelineOutput` when True, otherwise a one-element tuple.
-        Returns:
-            `NiTPipelineOutput(images=...)` or `(images,)`. Without a VAE the unpacked latents
-            are returned without post-processing.
-        """
-        device = self._execution_device
-        model_dtype = next(self.transformer.parameters()).dtype
-        # Normalise the labels to a long tensor on the execution device.
-        if isinstance(class_labels, int):
-            class_labels = [class_labels]
-        if not torch.is_tensor(class_labels):
-            class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
-        else:
-            class_labels = class_labels.to(device=device, dtype=torch.long)
-        batch_size = class_labels.numel()
-        packed_latents, image_sizes = self._prepare_latents(batch_size, height, width, model_dtype, device, generator)
-        # The sampler state is kept in float64; it is cast to the model dtype per forward pass.
-        packed_latents = packed_latents.to(dtype=torch.float64)
-        timesteps = self.scheduler.set_timesteps(num_inference_steps, device=device, mode=mode)
-        # The unconditional branch uses the index `num_classes` as its label.
-        null_labels = torch.full_like(class_labels, self.transformer.config.num_classes)
-        # Iterate over consecutive (timestep, next_timestep) pairs.
-        for index, timestep in enumerate(timesteps[:-1]):
-            next_timestep = timesteps[index + 1]
-            guidance_active = guidance_interval[0] <= float(timestep) <= guidance_interval[1]
-            if guidance_scale > 1.0 and guidance_active:
-                # Duplicate the batch: conditional half first, unconditional half second.
-                model_input = torch.cat([packed_latents, packed_latents], dim=0)
-                labels = torch.cat([class_labels, null_labels], dim=0)
-                model_image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
-            else:
-                model_input = packed_latents
-                labels = class_labels
-                model_image_sizes = image_sizes
-            timestep_batch = torch.full((labels.numel(),), float(timestep), device=device, dtype=model_dtype)
-            model_output = self.transformer(
-                model_input.to(dtype=model_dtype),
-                timestep_batch,
-                labels,
-                image_sizes=model_image_sizes,
-                return_dict=True,
-            ).sample
-            model_output = self._apply_classifier_free_guidance(model_output, guidance_scale, guidance_active)
-            # Heun: take a provisional step, re-evaluate the model there, then average the two outputs.
-            if heun and mode == "ode" and index < len(timesteps) - 2:
-                provisional = self.scheduler.step(
-                    model_output,
-                    timestep[None],
-                    packed_latents,
-                    next_timestep[None],
-                    image_sizes=image_sizes,
-                ).prev_sample
-                # NOTE(review): `guidance_active` was computed from the current timestep and is
-                # reused for the evaluation at `next_timestep` - confirm this is intended.
-                if guidance_scale > 1.0 and guidance_active:
-                    prime_input = torch.cat([provisional, provisional], dim=0)
-                    labels = torch.cat([class_labels, null_labels], dim=0)
-                    model_image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
-                else:
-                    prime_input = provisional
-                    labels = class_labels
-                    model_image_sizes = image_sizes
-                next_timestep_batch = torch.full((labels.numel(),), float(next_timestep), device=device, dtype=model_dtype)
-                next_model_output = self.transformer(
-                    prime_input.to(dtype=model_dtype),
-                    next_timestep_batch,
-                    labels,
-                    image_sizes=model_image_sizes,
-                    return_dict=True,
-                ).sample
-                next_model_output = self._apply_classifier_free_guidance(
-                    next_model_output, guidance_scale, guidance_active
-                )
-                packed_latents = self.scheduler.step_heun(
-                    model_output, next_model_output, timestep[None], packed_latents, next_timestep[None]
-                ).prev_sample
-            else:
-                # Single-evaluation update; `generator` is only passed on this path.
-                packed_latents = self.scheduler.step(
-                    model_output,
-                    timestep[None],
-                    packed_latents,
-                    next_timestep[None],
-                    image_sizes=image_sizes,
-                    generator=generator,
-                ).prev_sample
-        latents = self.transformer._unpack_latents(packed_latents, image_sizes)
-        image = self._decode_latents(latents)
-        if self.vae is not None:
-            # Map the decoder output from [-1, 1] to [0, 1] here, so postprocess is told not to denormalize again.
-            image = (image / 2 + 0.5).clamp(0, 1)
-            image = self.image_processor.postprocess(
-                image,
-                output_type=output_type,
-                do_denormalize=[False] * image.shape[0],
-            )
-        self.maybe_free_model_hooks()
-        if not return_dict:
-            return (image,)
-        return NiTPipelineOutput(images=image)

custom_pipeline/scheduling_flow_match_nit.py DELETED Viewed

@@ -1,187 +0,0 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-from dataclasses import dataclass
-from typing import Optional, Tuple
-import numpy as np
-import torch
-try:
-    from diffusers.configuration_utils import ConfigMixin, register_to_config
-    from diffusers.schedulers.scheduling_utils import SchedulerMixin
-    from diffusers.utils import BaseOutput
-except Exception:  # pragma: no cover - importable without an installed diffusers checkout.
-    # Minimal stand-ins so the scheduler can be imported and used without diffusers.
-    class BaseOutput(dict):
-        # Copies the dataclass fields into the dict once the dataclass __init__ has run.
-        def __post_init__(self):
-            self.update(self.__dict__)
-    class ConfigMixin:
-        config_name = "scheduler_config.json"
-    class SchedulerMixin:
-        pass
-    # No-op decorator: this stub does not record the init arguments anywhere.
-    def register_to_config(init):
-        return init
-@dataclass
-class NiTFlowMatchSchedulerOutput(BaseOutput):
-    """Return value of `NiTFlowMatchScheduler.step` and `step_heun` when `return_dict=True`."""
-    # Sample advanced from `timestep` to `next_timestep`.
-    prev_sample: torch.FloatTensor
-class NiTFlowMatchScheduler(SchedulerMixin, ConfigMixin):
-    """
-    Flow-matching ODE/SDE scheduler used by Native-resolution Image Synthesis (NiT).
-    The model predicts velocity with a linear path by default. Timesteps run from 1 to 0,
-    matching the original sampler while exposing the standard Diffusers `set_timesteps`
-    and `step` API.
-    """
-    config_name = "scheduler_config.json"
-    order = 1
-    @register_to_config
-    def __init__(
-        self,
-        mode: str = "ode",
-        path_type: str = "linear",
-        num_train_timesteps: int = 1000,
-    ):
-        """
-        Validate and store the sampler settings and build the default timestep grid.
-        Raises `ValueError` for a `mode` other than "ode"/"sde" or a `path_type` other than
-        "linear"/"cosine".
-        """
-        if mode not in {"ode", "sde"}:
-            raise ValueError("mode must be either 'ode' or 'sde'.")
-        if path_type not in {"linear", "cosine"}:
-            raise ValueError("path_type must be either 'linear' or 'cosine'.")
-        self.mode, self.path_type = mode, path_type
-        self.num_train_timesteps = num_train_timesteps
-        # float64 grid from 1 down to 0: native NiT integrates in float64 for numerical stability.
-        default_grid = np.linspace(1.0, 0.0, num_train_timesteps + 1)
-        self.timesteps = torch.from_numpy(default_grid).to(dtype=torch.float64)
-    def set_timesteps(
-        self,
-        num_inference_steps: int,
-        device: Optional[torch.device] = None,
-        mode: Optional[str] = None,
-    ):
-        """
-        Build, store and return the inference timestep grid.
-        "ode" yields `num_inference_steps + 1` evenly spaced values from 1 to 0. "sde" yields
-        `num_inference_steps` values from 1 to 0.04 followed by a final 0. When `mode` is
-        falsy the scheduler's current mode is used; the resolved mode is stored on `self`.
-        Raises `ValueError` for any other mode.
-        """
-        resolved_mode = mode or self.mode
-        grid_dtype = self.timesteps.dtype
-        if resolved_mode == "ode":
-            schedule = torch.linspace(1.0, 0.0, num_inference_steps + 1, dtype=grid_dtype)
-        elif resolved_mode == "sde":
-            # The stochastic part stops at 0.04; the trailing zero is the last jump.
-            stochastic_part = torch.linspace(1.0, 0.04, num_inference_steps, dtype=grid_dtype)
-            schedule = torch.cat([stochastic_part, torch.zeros(1, dtype=grid_dtype)])
-        else:
-            raise ValueError("mode must be either 'ode' or 'sde'.")
-        self.mode = resolved_mode
-        self.timesteps = schedule.to(device=device)
-        return self.timesteps
-    @staticmethod
-    def _expand_t_like_sample(timestep: torch.Tensor, sample: torch.Tensor, image_sizes: torch.LongTensor):
-        """
-        Repeat each image's timestep once per token so it lines up with the packed `sample`.
-        A single-element `timestep` is shared by all images. The result has one row per token
-        and a size-1 axis for every remaining dimension of `sample`.
-        """
-        trailing_ones = [1] * (sample.ndim - 1)
-        num_images = image_sizes.shape[0]
-        tokens_per_image = image_sizes[:, 0] * image_sizes[:, 1]
-        if timestep.numel() == 1:
-            timestep = timestep.repeat(num_images)
-        pieces = []
-        for image_index in range(num_images):
-            token_count = int(tokens_per_image[image_index])
-            per_image = timestep[image_index].reshape(1, *trailing_ones)
-            pieces.append(per_image.repeat(token_count, *trailing_ones))
-        return torch.cat(pieces)
-    def _get_score_from_velocity(
-        self,
-        model_output: torch.Tensor,
-        sample: torch.Tensor,
-        timestep: torch.Tensor,
-        image_sizes: torch.LongTensor,
-    ):
-        """
-        Convert a velocity prediction into a score estimate for the configured path.
-        `timestep` is first expanded to one value per token. Raises `ValueError` when
-        `self.path_type` is neither "linear" nor "cosine".
-        """
-        timestep = self._expand_t_like_sample(timestep, sample, image_sizes)
-        path_type = self.path_type
-        if path_type == "cosine":
-            alpha_t = torch.cos(timestep * np.pi / 2)
-            sigma_t = torch.sin(timestep * np.pi / 2)
-            d_alpha_t = -np.pi / 2 * torch.sin(timestep * np.pi / 2)
-            d_sigma_t = np.pi / 2 * torch.cos(timestep * np.pi / 2)
-        elif path_type == "linear":
-            alpha_t = 1 - timestep
-            sigma_t = timestep
-            d_alpha_t = torch.ones_like(timestep) * -1
-            d_sigma_t = torch.ones_like(timestep)
-        else:
-            raise ValueError(f"Unsupported path_type: {self.path_type}")
-        # alpha / d_alpha appears in both the numerator and the variance term.
-        reverse_alpha_ratio = alpha_t / d_alpha_t
-        variance = sigma_t**2 - reverse_alpha_ratio * d_sigma_t * sigma_t
-        numerator = reverse_alpha_ratio * model_output - sample
-        return numerator / variance
-    @staticmethod
-    def _compute_diffusion(timestep: torch.Tensor):
-        """Return the diffusion coefficient used by the SDE step, which is twice the timestep."""
-        diffusion = 2 * timestep
-        return diffusion
-    @staticmethod
-    def _promote_dtypes(*tensors: torch.Tensor) -> torch.dtype:
-        """
-        Return the promoted dtype of all floating-point or complex tensors given.
-        Other tensors are ignored; with none left, the torch default dtype is returned.
-        """
-        candidates = [tensor.dtype for tensor in tensors if tensor.is_floating_point() or tensor.is_complex()]
-        if not candidates:
-            return torch.get_default_dtype()
-        promoted = candidates[0]
-        for candidate in candidates[1:]:
-            promoted = torch.promote_types(promoted, candidate)
-        return promoted
-    def step(
-        self,
-        model_output: torch.Tensor,
-        timestep: torch.Tensor,
-        sample: torch.Tensor,
-        next_timestep: torch.Tensor,
-        image_sizes: Optional[torch.LongTensor] = None,
-        generator: Optional[torch.Generator] = None,
-        return_dict: bool = True,
-    ) -> NiTFlowMatchSchedulerOutput:
-        """
-        Advance `sample` from `timestep` to `next_timestep` with one first-order update.
-        Args:
-            model_output: Velocity predicted by the model for `sample`.
-            timestep, next_timestep: Current and target times; only their first element is used
-                for the step size.
-            sample: Current packed sample.
-            image_sizes: Per-image token-grid sizes; required in "sde" mode.
-            generator: Optional RNG for the noise drawn in "sde" mode.
-            return_dict: Return `NiTFlowMatchSchedulerOutput` when True, else a one-element tuple.
-        Raises:
-            ValueError: In "sde" mode when `image_sizes` is None.
-        """
-        # All arithmetic is done in float64 regardless of the input dtypes.
-        compute_dtype = torch.float64
-        sample = sample.to(dtype=compute_dtype)
-        model_output = model_output.to(dtype=compute_dtype)
-        timestep = timestep.to(device=sample.device, dtype=compute_dtype).flatten()
-        next_timestep = next_timestep.to(device=sample.device, dtype=compute_dtype).flatten()
-        if self.mode == "ode":
-            # Euler step along the predicted velocity.
-            prev_sample = sample + (next_timestep[0] - timestep[0]) * model_output
-        else:
-            if image_sizes is None:
-                raise ValueError("image_sizes are required for SDE sampling.")
-            image_sizes = image_sizes.to(device=sample.device, dtype=torch.long)
-            diffusion = self._compute_diffusion(timestep[0])
-            score = self._get_score_from_velocity(model_output, sample, timestep, image_sizes)
-            drift = model_output - 0.5 * diffusion * score
-            dt = next_timestep[0] - timestep[0]
-            if torch.allclose(next_timestep[0], torch.zeros_like(next_timestep[0])):
-                # Final jump to t = 0: deterministic, no noise is added.
-                prev_sample = sample + drift * dt
-            else:
-                if generator is not None:
-                    noise = torch.randn(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype)
-                else:
-                    noise = torch.randn_like(sample)
-                # Drift term plus noise scaled by sqrt(diffusion) * sqrt(|dt|).
-                prev_sample = sample + drift * dt + torch.sqrt(diffusion) * noise * torch.sqrt(torch.abs(dt))
-        if not return_dict:
-            return (prev_sample,)
-        return NiTFlowMatchSchedulerOutput(prev_sample=prev_sample)
-    def step_heun(
-        self,
-        model_output: torch.Tensor,
-        next_model_output: torch.Tensor,
-        timestep: torch.Tensor,
-        sample: torch.Tensor,
-        next_timestep: torch.Tensor,
-        return_dict: bool = True,
-    ) -> NiTFlowMatchSchedulerOutput:
-        """
-        Advance `sample` using the average of the velocities at `timestep` and `next_timestep`.
-        All arithmetic runs in float64. Raises `ValueError` unless the scheduler is in "ode"
-        mode. Returns `NiTFlowMatchSchedulerOutput`, or a one-element tuple when
-        `return_dict` is False.
-        """
-        if self.mode != "ode":
-            raise ValueError("Heun correction is only defined for ODE sampling.")
-        compute_dtype = torch.float64
-        sample = sample.to(dtype=compute_dtype)
-        velocity_now = model_output.to(dtype=compute_dtype)
-        velocity_next = next_model_output.to(dtype=compute_dtype)
-        time_now = timestep.to(device=sample.device, dtype=compute_dtype).flatten()
-        time_next = next_timestep.to(device=sample.device, dtype=compute_dtype).flatten()
-        step_size = time_next[0] - time_now[0]
-        prev_sample = sample + step_size * (0.5 * velocity_now + 0.5 * velocity_next)
-        if return_dict:
-            return NiTFlowMatchSchedulerOutput(prev_sample=prev_sample)
-        return (prev_sample,)

custom_pipeline/transformer_nit.py DELETED Viewed

@@ -1,471 +0,0 @@
-# Copyright 2026 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-from dataclasses import dataclass
-import math
-from typing import List, Optional, Tuple, Union
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-try:
-    from diffusers.configuration_utils import ConfigMixin, register_to_config
-    from diffusers.models.modeling_utils import ModelMixin
-    from diffusers.utils import BaseOutput
-except Exception:  # pragma: no cover - lets this subtree be tested outside diffusers.
-    # Minimal stand-ins so the model classes can be built without diffusers.
-    class BaseOutput(dict):
-        # Copies the dataclass fields into the dict once the dataclass __init__ has run.
-        def __post_init__(self):
-            self.update(self.__dict__)
-    class _Config(dict):
-        # Dict whose keys can also be read as attributes (e.g. `config.patch_size`).
-        def __getattr__(self, key):
-            try:
-                return self[key]
-            except KeyError as error:
-                raise AttributeError(key) from error
-    class ConfigMixin:
-        config_name = "config.json"
-    class ModelMixin(nn.Module):
-        pass
-    def register_to_config(init):
-        # Decorator: binds the call's arguments (with defaults applied) and stores them on
-        # `self.config` before running the wrapped `__init__`.
-        def wrapper(self, *args, **kwargs):
-            import inspect
-            signature = inspect.signature(init)
-            bound = signature.bind(self, *args, **kwargs)
-            bound.apply_defaults()
-            self.config = _Config({key: value for key, value in bound.arguments.items() if key != "self"})
-            init(self, *args, **kwargs)
-        return wrapper
-# flash-attn is optional; `None` makes NiTAttention fall back to scaled_dot_product_attention.
-try:
-    from flash_attn import flash_attn_varlen_func
-except Exception:  # pragma: no cover - optional acceleration.
-    flash_attn_varlen_func = None
-@dataclass
-class NiTTransformer2DModelOutput(BaseOutput):
-    """Output container of the NiT transformer."""
-    # Model prediction; the pipeline reads this field after every transformer call.
-    sample: torch.FloatTensor
-    # Optional extra hidden states; presumably produced by the projector MLPs - TODO confirm.
-    projection_states: Optional[Tuple[torch.FloatTensor, ...]] = None
-def _modulate(hidden_states: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
-    """Apply the affine modulation `hidden_states * (1 + scale) + shift`."""
-    scaled = hidden_states * (1 + scale)
-    return scaled + shift
-def _rotate_half(hidden_states: torch.Tensor) -> torch.Tensor:
-    """Map each adjacent pair `(a, b)` along the last dimension to `(-b, a)`."""
-    pairs = hidden_states.reshape(*hidden_states.shape[:-1], -1, 2)
-    first, second = pairs.unbind(dim=-1)
-    rotated = torch.stack((-second, first), dim=-1)
-    return rotated.flatten(-2)
-def _get_float_dtype_or_default(tensor: Optional[torch.Tensor] = None) -> torch.dtype:
-    """Return `tensor.dtype` for a floating-point tensor, otherwise the torch default dtype."""
-    if tensor is None or not tensor.is_floating_point():
-        return torch.get_default_dtype()
-    return tensor.dtype
-class NiTPatchEmbed(nn.Module):
-    """Project non-overlapping square patches to `hidden_size` features with a strided conv."""
-    def __init__(self, patch_size: int, in_channels: int, hidden_size: int):
-        super().__init__()
-        self.patch_size = (patch_size, patch_size)
-        # kernel == stride, so each output position covers exactly one patch.
-        self.proj = nn.Conv2d(in_channels, hidden_size, kernel_size=patch_size, stride=patch_size, bias=True)
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """Return patch features with the spatial grid flattened into dim 1."""
-        projected = self.proj(hidden_states)
-        flattened = projected.flatten(2)
-        return flattened.transpose(1, 2)
-class NiTTimestepEmbedder(nn.Module):
-    """Embed scalar timesteps: sinusoidal features followed by a Linear-SiLU-Linear MLP."""
-    def __init__(self, hidden_size: int, frequency_embedding_size: int = 256):
-        super().__init__()
-        self.frequency_embedding_size = frequency_embedding_size
-        input_layer = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
-        activation = nn.SiLU()
-        output_layer = nn.Linear(hidden_size, hidden_size, bias=True)
-        self.mlp = nn.Sequential(input_layer, activation, output_layer)
-    @staticmethod
-    def get_timestep_embedding(timesteps: torch.Tensor, embedding_dim: int, max_period: int = 10000):
-        """
-        Return `[cos | sin]` features of width `embedding_dim` for a 1-D `timesteps` tensor.
-        An odd `embedding_dim` gets one trailing zero column.
-        """
-        half = embedding_dim // 2
-        # Keep sinusoid construction in fp32 to mirror the native NiT implementation.
-        exponent = -math.log(max_period) * torch.arange(half, dtype=torch.float32, device=timesteps.device) / half
-        frequencies = torch.exp(exponent)
-        angles = timesteps.float()[:, None] * frequencies[None]
-        features = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
-        if embedding_dim % 2 == 0:
-            return features
-        zero_column = torch.zeros_like(features[:, :1])
-        return torch.cat([features, zero_column], dim=-1)
-    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
-        """Embed `timesteps`; the sinusoidal features are cast back to `timesteps.dtype` first."""
-        features = self.get_timestep_embedding(timesteps, self.frequency_embedding_size)
-        return self.mlp(features.to(timesteps.dtype))
-class NiTLabelEmbedder(nn.Module):
-    """Look up class-label embeddings; one extra row is allocated when `dropout_prob > 0`."""
-    def __init__(self, num_classes: int, hidden_size: int, dropout_prob: float):
-        super().__init__()
-        # The extra row is the slot addressed by label index `num_classes`.
-        table_rows = num_classes + 1 if dropout_prob > 0 else num_classes
-        self.embedding_table = nn.Embedding(table_rows, hidden_size)
-        self.num_classes = num_classes
-        self.dropout_prob = dropout_prob
-    def forward(self, class_labels: torch.LongTensor) -> torch.Tensor:
-        """Return the embedding row of each label."""
-        embeddings = self.embedding_table(class_labels)
-        return embeddings
-class NiTRotaryEmbedding(nn.Module):
-    """
-    2-D rotary position embedding with precomputed angle tables.
-    The tables hold `max_cached_len` positions per axis; `forward` looks up the angles for
-    every token of every image and returns their cosines and sines.
-    """
-    def __init__(
-        self,
-        head_dim: int,
-        custom_freqs: str = "normal",
-        theta: int = 10000,
-        max_cached_len: int = 1024,
-        max_pe_len_h: Optional[int] = None,
-        max_pe_len_w: Optional[int] = None,
-        decouple: bool = False,
-        ori_max_pe_len: Optional[int] = None,
-    ):
-        super().__init__()
-        # These arguments are accepted for config compatibility but not used here.
-        del max_pe_len_h, max_pe_len_w, decouple, ori_max_pe_len
-        # NOTE(review): all three accepted `custom_freqs` values build the same table below.
-        if custom_freqs not in {"normal", "scale1", "scale2"}:
-            raise ValueError(
-                "This Diffusers implementation supports the trained RoPE frequencies directly. "
-                "Checkpoint conversion preserves weights; extrapolation variants should be handled "
-                "by changing the model config before loading."
-            )
-        # Half of the head dimension is rotated by the row position, half by the column position.
-        dim = head_dim // 2
-        if dim % 2 != 0:
-            raise ValueError("NiT rotary embedding requires head_dim // 2 to be even.")
-        default_dtype = _get_float_dtype_or_default()
-        freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=default_dtype) / dim))
-        # Non-persistent buffers: they follow `.to(...)` but are not saved in the state dict.
-        self.register_buffer("freqs_h", freqs, persistent=False)
-        self.register_buffer("freqs_w", freqs.clone(), persistent=False)
-        positions = torch.arange(max_cached_len, dtype=default_dtype)
-        # Each frequency is repeated twice so it lines up with the pairs rotated by `_rotate_half`.
-        freqs_h_cached = torch.einsum("n,f->nf", positions, self.freqs_h).repeat_interleave(2, dim=-1)
-        freqs_w_cached = torch.einsum("n,f->nf", positions, self.freqs_w).repeat_interleave(2, dim=-1)
-        self.register_buffer("freqs_h_cached", freqs_h_cached, persistent=False)
-        self.register_buffer("freqs_w_cached", freqs_w_cached, persistent=False)
-    def forward(self, image_sizes: torch.LongTensor, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Return `(cos, sin)` angle tensors for all tokens of all images, concatenated in order.
-        `image_sizes` rows are `(height, width)` in tokens. Both outputs carry a size-1 axis at
-        dim 1. Positions at or beyond `max_cached_len` are outside the cached tables.
-        """
-        grids = []
-        for height, width in image_sizes.tolist():
-            # Use the same meshgrid ordering as native NiT to preserve RoPE-token alignment.
-            grid_h = torch.arange(height, device=device)
-            grid_w = torch.arange(width, device=device)
-            grid = torch.meshgrid(grid_h, grid_w, indexing="xy")
-            grids.append(torch.stack(grid, dim=0).reshape(2, -1))
-        grid = torch.cat(grids, dim=1)
-        # Row 0 of `grid` indexes the height table, row 1 the width table.
-        freqs_h = self.freqs_h_cached.to(device)[grid[0]]
-        freqs_w = self.freqs_w_cached.to(device)[grid[1]]
-        freqs = torch.cat([freqs_h, freqs_w], dim=-1)
-        return freqs.cos().unsqueeze(1), freqs.sin().unsqueeze(1)
-class NiTAttention(nn.Module):
-    """
-    Multi-head self-attention over packed token sequences with rotary embeddings.
-    Tokens of all images are concatenated along dim 0; `cu_seqlens` marks the boundaries so
-    attention never crosses from one image into another.
-    """
-    def __init__(self, hidden_size: int, num_heads: int, qk_norm: bool = False):
-        super().__init__()
-        if hidden_size % num_heads != 0:
-            raise ValueError("hidden_size must be divisible by num_heads")
-        self.num_heads = num_heads
-        self.head_dim = hidden_size // num_heads
-        # Single projection producing query, key and value together.
-        self.qkv = nn.Linear(hidden_size, hidden_size * 3, bias=True)
-        self.q_norm = nn.LayerNorm(self.head_dim) if qk_norm else nn.Identity()
-        self.k_norm = nn.LayerNorm(self.head_dim) if qk_norm else nn.Identity()
-        self.proj = nn.Linear(hidden_size, hidden_size)
-        self.proj_drop = nn.Dropout(0.0)
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        cu_seqlens: torch.IntTensor,
-        freqs_cos: torch.Tensor,
-        freqs_sin: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Args:
-            hidden_states: Packed tokens with the token axis first.
-            cu_seqlens: Cumulative sequence boundaries; consecutive entries delimit one image.
-            freqs_cos, freqs_sin: Rotary angle tables as returned by `NiTRotaryEmbedding.forward`.
-        Returns:
-            Tensor with the same number of tokens, after the output projection.
-        """
-        qkv = self.qkv(hidden_states).reshape(hidden_states.shape[0], 3, self.num_heads, self.head_dim)
-        query, key, value = qkv.unbind(dim=1)
-        original_dtype = qkv.dtype
-        query = self.q_norm(query)
-        key = self.k_norm(key)
-        # Apply the rotary embedding to queries and keys.
-        query = query * freqs_cos + _rotate_half(query) * freqs_sin
-        key = key * freqs_cos + _rotate_half(key) * freqs_sin
-        # The multiplication may have promoted the dtype; cast back to the projection dtype.
-        query = query.to(dtype=original_dtype)
-        key = key.to(dtype=original_dtype)
-        if flash_attn_varlen_func is not None and query.is_cuda:
-            # Variable-length flash attention handles all sequences in one call.
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-            hidden_states = flash_attn_varlen_func(
-                query, key, value, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen
-            ).reshape(hidden_states.shape[0], -1)
-        else:
-            # Fallback: run scaled_dot_product_attention once per sequence.
-            segments = []
-            for start, end in zip(cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist()):
-                q = query[start:end].transpose(0, 1).unsqueeze(0)
-                k = key[start:end].transpose(0, 1).unsqueeze(0)
-                v = value[start:end].transpose(0, 1).unsqueeze(0)
-                segments.append(F.scaled_dot_product_attention(q, k, v).squeeze(0).transpose(0, 1))
-            hidden_states = torch.cat(segments, dim=0).reshape(hidden_states.shape[0], -1)
-        hidden_states = self.proj(hidden_states)
-        return self.proj_drop(hidden_states)
-class NiTMLP(nn.Module):
-    """Two-layer feed-forward block with tanh-approximated GELU."""
-    def __init__(self, hidden_size: int, mlp_hidden_dim: int):
-        super().__init__()
-        # Submodule names and order are kept so checkpoint keys stay the same.
-        self.fc1 = nn.Linear(hidden_size, mlp_hidden_dim)
-        self.act = nn.GELU(approximate="tanh")
-        self.drop1 = nn.Dropout(0.0)
-        self.norm = nn.Identity()
-        self.fc2 = nn.Linear(mlp_hidden_dim, hidden_size)
-        self.drop2 = nn.Dropout(0.0)
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """Apply fc1, activation, dropout, norm, fc2 and dropout, in that order."""
-        expanded = self.drop1(self.act(self.fc1(hidden_states)))
-        contracted = self.fc2(self.norm(expanded))
-        return self.drop2(contracted)
-class NiTBlock(nn.Module):
-    """Transformer block whose attention and MLP branches are modulated and gated by a conditioning vector."""
-    def __init__(
-        self,
-        hidden_size: int,
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        qk_norm: bool = False,
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 512,
-    ):
-        super().__init__()
-        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.attn = NiTAttention(hidden_size, num_heads=num_heads, qk_norm=qk_norm)
-        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.mlp = NiTMLP(hidden_size, int(hidden_size * mlp_ratio))
-        # The modulation head emits six chunks: shift/scale/gate for attention and for the MLP.
-        modulation_layers = [nn.SiLU()]
-        if use_adaln_lora:
-            # Low-rank variant: go through an `adaln_lora_dim` bottleneck first.
-            modulation_layers.append(nn.Linear(hidden_size, adaln_lora_dim, bias=True))
-            modulation_layers.append(nn.Linear(adaln_lora_dim, 6 * hidden_size, bias=True))
-        else:
-            modulation_layers.append(nn.Linear(hidden_size, 6 * hidden_size, bias=True))
-        self.adaLN_modulation = nn.Sequential(*modulation_layers)
-    def forward(self, hidden_states, conditioning, cu_seqlens, freqs_cos, freqs_sin):
-        """Run the gated attention residual followed by the gated MLP residual."""
-        modulation = self.adaLN_modulation(conditioning)
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = modulation.chunk(6, dim=-1)
-        attn_input = _modulate(self.norm1(hidden_states), shift_msa, scale_msa)
-        attn_output = self.attn(attn_input, cu_seqlens, freqs_cos, freqs_sin)
-        hidden_states = hidden_states + gate_msa * attn_output
-        mlp_input = _modulate(self.norm2(hidden_states), shift_mlp, scale_mlp)
-        hidden_states = hidden_states + gate_mlp * self.mlp(mlp_input)
-        return hidden_states
-class NiTFinalLayer(nn.Module):
-    """Output head: parameter-free layer norm, shift/scale modulation, then a linear projection.
-    The projection maps `hidden_size` features to `patch_size * patch_size * out_channels` values.
-    """
-    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
-        super().__init__()
-        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        patch_dim = patch_size * patch_size * out_channels
-        self.linear = nn.Linear(hidden_size, patch_dim, bias=True)
-        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
-    def forward(self, hidden_states: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
-        """Modulate the normalised states with a shift and scale derived from `conditioning`, then project."""
-        shift, scale = torch.chunk(self.adaLN_modulation(conditioning), 2, dim=-1)
-        normalised = self.norm_final(hidden_states)
-        return self.linear(_modulate(normalised, shift, scale))
-def _build_mlp(hidden_size: int, projector_dim: int, z_dim: int) -> nn.Sequential:
-    """Return a three-layer MLP `hidden_size -> projector_dim -> projector_dim -> z_dim` with SiLU in between."""
-    widths = (hidden_size, projector_dim, projector_dim, z_dim)
-    layers = []
-    for in_features, out_features in zip(widths[:-1], widths[1:]):
-        if layers:
-            # An activation separates consecutive linear layers; none follows the last one.
-            layers.append(nn.SiLU())
-        layers.append(nn.Linear(in_features, out_features))
-    return nn.Sequential(*layers)
-class NiTTransformer2DModel(ModelMixin, ConfigMixin):
-    config_name = "config.json"
-    @register_to_config
-    def __init__(
-        self,
-        input_size: int = 32,
-        patch_size: int = 1,
-        in_channels: int = 32,
-        hidden_size: int = 1152,
-        depth: int = 28,
-        num_heads: int = 16,
-        mlp_ratio: float = 4.0,
-        class_dropout_prob: float = 0.1,
-        num_classes: int = 1000,
-        encoder_depth: int = 8,
-        projector_dim: int = 2048,
-        z_dim: int = 1280,
-        use_checkpoint: bool = False,
-        custom_freqs: str = "normal",
-        theta: int = 10000,
-        max_pe_len_h: Optional[int] = None,
-        max_pe_len_w: Optional[int] = None,
-        decouple: bool = False,
-        ori_max_pe_len: Optional[int] = None,
-        qk_norm: bool = True,
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 512,
-    ):
-        """Assemble the embedders, rotary embedding, projector, transformer blocks and output layer.
-        Args:
-            input_size: Accepted but deleted immediately; no module built here depends on it.
-            patch_size: Side length of a square patch; passed to the patch embedder and the final layer.
-            in_channels: Channel count of the input latents; the output uses the same count.
-            hidden_size: Width shared by the embedders, the blocks, the projector input and the final layer.
-            depth: Number of `NiTBlock` layers.
-            num_heads: Heads per block; the rotary embedding is built for `hidden_size // num_heads`.
-            mlp_ratio: Forwarded to every `NiTBlock`.
-            class_dropout_prob: Forwarded to `NiTLabelEmbedder`.
-            num_classes: Forwarded to `NiTLabelEmbedder`.
-            encoder_depth: 1-based index of the block after which `forward` can emit projection states.
-            projector_dim: Hidden width of the projector MLP.
-            z_dim: Output width of the projector MLP.
-            use_checkpoint: Lets `forward` checkpoint each block while the module is in training mode.
-            custom_freqs, theta, max_pe_len_h, max_pe_len_w, decouple, ori_max_pe_len: Forwarded unchanged
-                to `NiTRotaryEmbedding`.
-            qk_norm, use_adaln_lora, adaln_lora_dim: Forwarded unchanged to every `NiTBlock`.
-        """
-        super().__init__()
-        # NOTE(review): presumably kept in the signature for config compatibility - confirm.
-        del input_size
-        self.in_channels = self.out_channels = in_channels
-        self.patch_size = patch_size
-        self.num_heads = num_heads
-        self.num_classes = num_classes
-        self.encoder_depth = encoder_depth
-        self.use_checkpoint = use_checkpoint
-        head_dim = hidden_size // num_heads
-        rope_options = dict(
-            custom_freqs=custom_freqs,
-            theta=theta,
-            max_pe_len_h=max_pe_len_h,
-            max_pe_len_w=max_pe_len_w,
-            decouple=decouple,
-            ori_max_pe_len=ori_max_pe_len,
-        )
-        block_options = dict(
-            mlp_ratio=mlp_ratio,
-            qk_norm=qk_norm,
-            use_adaln_lora=use_adaln_lora,
-            adaln_lora_dim=adaln_lora_dim,
-        )
-        # Submodules are created in a fixed order so parameter registration and initialisation order stay stable.
-        self.x_embedder = NiTPatchEmbed(patch_size, in_channels, hidden_size)
-        self.t_embedder = NiTTimestepEmbedder(hidden_size)
-        self.y_embedder = NiTLabelEmbedder(num_classes, hidden_size, class_dropout_prob)
-        self.rope = NiTRotaryEmbedding(head_dim, **rope_options)
-        self.projector = _build_mlp(hidden_size, projector_dim, z_dim)
-        self.blocks = nn.ModuleList([NiTBlock(hidden_size, num_heads, **block_options) for _ in range(depth)])
-        self.final_layer = NiTFinalLayer(hidden_size, patch_size, self.out_channels)
-    def _pack_latents(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.LongTensor, Tuple[int, int]]:
-        """Cut a `(batch, channels, height, width)` tensor into square patches stacked along dim 0.
-        Returns:
-            A tuple of the patches with shape `(batch * rows * cols, channels, patch_size, patch_size)`, ordered
-            by image, then patch row, then patch column; a long tensor holding `[rows, cols]` once per image;
-            and the original `(height, width)`.
-        Raises:
-            ValueError: If the channel count differs from `self.in_channels`, or if height or width is not a
-                multiple of `self.patch_size`.
-        """
-        batch_size, channels, height, width = hidden_states.shape
-        if channels != self.in_channels:
-            raise ValueError(f"Expected {self.in_channels} latent channels, got {channels}.")
-        patch = self.patch_size
-        if height % patch or width % patch:
-            raise ValueError("Latent height and width must be divisible by patch_size.")
-        rows, cols = height // patch, width // patch
-        # Split both spatial axes into (grid, within-patch) pairs, then bring the grid axes ahead of the channels.
-        patches = hidden_states.reshape(batch_size, channels, rows, patch, cols, patch).permute(0, 2, 4, 1, 3, 5)
-        patches = patches.reshape(batch_size * rows * cols, channels, patch, patch)
-        grid_sizes = [[rows, cols] for _ in range(batch_size)]
-        image_sizes = torch.tensor(grid_sizes, device=patches.device, dtype=torch.long)
-        return patches, image_sizes, (height, width)
-    def _unpack_latents(self, hidden_states: torch.Tensor, image_sizes: torch.LongTensor) -> torch.Tensor:
-        """Reassemble packed patches into image-shaped tensors.
-        Args:
-            hidden_states: Packed patches; each image owns `rows * cols` consecutive entries along dim 0.
-            image_sizes: One `[rows, cols]` patch-grid size per image.
-        Returns:
-            With one image, a `(1, out_channels, rows * patch_size, cols * patch_size)` tensor. With several
-            images of identical shape, the images stacked along a new leading dimension. Otherwise the
-            `hidden_states` argument itself, still packed.
-        """
-        channels, patch = self.out_channels, self.patch_size
-        if image_sizes.shape[0] == 1:
-            rows, cols = image_sizes[0].tolist()
-            grid = hidden_states.reshape(rows, cols, channels, patch, patch)
-            # (rows, cols, C, p, p) -> (C, rows, p, cols, p), so each patch lands at its grid position.
-            return grid.permute(2, 0, 3, 1, 4).reshape(1, channels, rows * patch, cols * patch)
-        images = []
-        offset = 0
-        for rows, cols in image_sizes.tolist():
-            end = offset + rows * cols
-            grid = hidden_states[offset:end].reshape(rows, cols, channels, patch, patch)
-            offset = end
-            images.append(grid.permute(2, 0, 3, 1, 4).reshape(channels, rows * patch, cols * patch))
-        distinct_shapes = {tuple(image.shape) for image in images}
-        if len(distinct_shapes) == 1:
-            return torch.stack(images, dim=0)
-        # Differently sized images cannot be stacked; hand the packed tensor back.
-        return hidden_states
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        timestep: Union[torch.Tensor, float],
-        class_labels: torch.LongTensor,
-        image_sizes: Optional[Union[torch.LongTensor, List[Tuple[int, int]]]] = None,
-        return_dict: bool = True,
-        output_projection_states: bool = False,
-    ) -> Union[NiTTransformer2DModelOutput, Tuple[torch.Tensor, ...]]:
-        """Run the model on image-shaped or already-packed latents.
-        Args:
-            hidden_states: A 4D `(batch, channels, height, width)` tensor when `image_sizes` is None, in which
-                case it is packed into patches here and unpacked again before returning. Otherwise a tensor
-                of already-packed patches described by `image_sizes`.
-            timestep: A scalar or tensor. It is flattened, and a single value is repeated once per image.
-            class_labels: Class indices; cast to long and flattened before being embedded.
-            image_sizes: One `(rows, cols)` patch-grid size per image, as a tensor or a list of pairs.
-                Required whenever `hidden_states` is not an unpacked 4D tensor.
-            return_dict: Return a `NiTTransformer2DModelOutput` when True, a plain tuple when False.
-            output_projection_states: Also return `self.projector` applied to the hidden states right after
-                block number `self.encoder_depth` (1-based).
-        Returns:
-            A `NiTTransformer2DModelOutput` with `sample` and `projection_states` (None unless requested), or
-            the tuple `(sample,)` / `(sample, projection_states)` when `return_dict` is False. For packed input
-            `sample` has shape `(tokens, out_channels, patch_size, patch_size)`.
-        Raises:
-            ValueError: If `hidden_states` is not 4D and `image_sizes` is None.
-        """
-        input_was_image = hidden_states.dim() == 4 and image_sizes is None
-        if input_was_image:
-            hidden_states, image_sizes, _ = self._pack_latents(hidden_states)
-        elif image_sizes is None:
-            raise ValueError("image_sizes must be provided when hidden_states are already packed.")
-        elif not torch.is_tensor(image_sizes):
-            image_sizes = torch.tensor(image_sizes, device=hidden_states.device, dtype=torch.long)
-        else:
-            image_sizes = image_sizes.to(device=hidden_states.device, dtype=torch.long)
-        num_images = image_sizes.shape[0]
-        if not torch.is_tensor(timestep):
-            timestep = torch.tensor([timestep], device=hidden_states.device, dtype=hidden_states.dtype)
-        timestep = timestep.to(device=hidden_states.device, dtype=hidden_states.dtype).flatten()
-        if timestep.numel() == 1:
-            # A single timestep is shared by every image in the batch.
-            timestep = timestep.repeat(num_images)
-        class_labels = class_labels.to(device=hidden_states.device, dtype=torch.long).flatten()
-        hidden_states = self.x_embedder(hidden_states).squeeze(1)
-        freqs_cos, freqs_sin = self.rope(image_sizes, hidden_states.device)
-        # Tokens per image, and their cumulative int32 offsets starting at 0, passed on to every block.
-        seqlens = image_sizes[:, 0] * image_sizes[:, 1]
-        cu_seqlens = torch.cat(
-            [torch.zeros(1, device=hidden_states.device, dtype=torch.int32), torch.cumsum(seqlens, dim=0).int()]
-        )
-        conditioning = self.t_embedder(timestep) + self.y_embedder(class_labels)
-        # Give every token its image's conditioning row in one call, instead of a Python loop that converts
-        # each sequence length to an int (a device-to-host sync per image) and cannot handle an empty batch.
-        # Rows beyond `num_images` are ignored.
-        # NOTE(review): assumes the embedders return one row per image, i.e. (num_images, hidden) - confirm.
-        conditioning = torch.repeat_interleave(conditioning[:num_images], seqlens, dim=0)
-        projection_states = []
-        for index, block in enumerate(self.blocks):
-            if self.use_checkpoint and self.training:
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    block, hidden_states, conditioning, cu_seqlens, freqs_cos, freqs_sin, use_reentrant=False
-                )
-            else:
-                hidden_states = block(hidden_states, conditioning, cu_seqlens, freqs_cos, freqs_sin)
-            if output_projection_states and (index + 1) == self.encoder_depth:
-                projection_states.append(self.projector(hidden_states))
-        hidden_states = self.final_layer(hidden_states, conditioning)
-        hidden_states = hidden_states.reshape(hidden_states.shape[0], self.out_channels, self.patch_size, self.patch_size)
-        if input_was_image:
-            # Only inputs that were packed here are restored to image layout.
-            hidden_states = self._unpack_latents(hidden_states, image_sizes)
-        if not return_dict:
-            output = (hidden_states,)
-            if output_projection_states:
-                output = output + (tuple(projection_states),)
-            return output
-        return NiTTransformer2DModelOutput(
-            sample=hidden_states,
-            projection_states=tuple(projection_states) if output_projection_states else None,
-        )