BiliSakura committed 3 days ago

Commit

d717924

verified ·

1 Parent(s): d8b2f54

Upload folder using huggingface_hub

Browse files

Files changed (20) hide show

.gitattributes +1 -0
ADM-G-512/classifier/__pycache__/classifier_adm.cpython-312.pyc +0 -0
ADM-G-512/classifier/classifier_adm.py +132 -0
ADM-G-512/classifier/config.json +13 -0
ADM-G-512/classifier/diffusion_pytorch_model.safetensors +3 -0
ADM-G-512/classifier/modeling_adm.py +772 -0
ADM-G-512/scheduler/__pycache__/scheduling_adm.cpython-312.pyc +0 -0
ADM-G-512/scheduler/scheduler_config.json +11 -0
ADM-G-512/scheduler/scheduling_adm.py +590 -0
ADM-G-512/unet/__pycache__/modeling_adm.cpython-312.pyc +0 -0
ADM-G-512/unet/__pycache__/unet_adm.cpython-312.pyc +0 -0
ADM-G-512/unet/config.json +22 -0
ADM-G-512/unet/diffusion_pytorch_model.safetensors +3 -0
ADM-G-512/unet/modeling_adm.py +772 -0
ADM-G-512/unet/unet_adm.py +124 -0
README.md +87 -0
__pycache__/pipeline.cpython-312.pyc +0 -0
demo.png +3 -0
model_index.json +16 -0
pipeline.py +388 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+demo.png filter=lfs diff=lfs merge=lfs -text

ADM-G-512/classifier/__pycache__/classifier_adm.cpython-312.pyc ADDED Viewed

Binary file (6.37 kB). View file

ADM-G-512/classifier/classifier_adm.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+from modeling_adm import create_adm_classifier_model
+@dataclass
+class ADMClassifierOutput(BaseOutput):
+    """
+    Output of the ADM noisy image classifier, returned by `ADMClassifierModel.forward` when
+    `return_dict=True`.
+    Args:
+        logits (`torch.Tensor` of shape `(batch_size, num_classes)`):
+            Class logits for the noisy input.
+    """
+    # Raw classifier scores exactly as produced by the wrapped network (no softmax applied here).
+    logits: torch.FloatTensor
+class ADMClassifierModel(ModelMixin, ConfigMixin):
+    """
+    Noisy ImageNet classifier for ADM-G classifier guidance.
+    This model predicts class labels from noisy images `x_t` and is used to compute gradients that steer
+    an unconditional ADM diffusion model toward a target class.
+    """
+    @register_to_config
+    def __init__(
+        self,
+        image_size: int = 128,
+        classifier_width: int = 128,
+        classifier_depth: int = 2,
+        classifier_attention_resolutions: str = "32,16,8",
+        classifier_use_scale_shift_norm: bool = True,
+        classifier_resblock_updown: bool = True,
+        classifier_pool: str = "attention",
+        use_fp16: bool = False,
+        num_classes: int = 1000,
+    ):
+        super().__init__()
+        self.model = create_adm_classifier_model(
+            image_size=image_size,
+            classifier_width=classifier_width,
+            classifier_depth=classifier_depth,
+            classifier_attention_resolutions=classifier_attention_resolutions,
+            classifier_use_scale_shift_norm=classifier_use_scale_shift_norm,
+            classifier_resblock_updown=classifier_resblock_updown,
+            classifier_pool=classifier_pool,
+            use_fp16=use_fp16,
+            num_classes=num_classes,
+        )
+    @property
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        return_dict: bool = True,
+    ) -> Union[ADMClassifierOutput, Tuple[torch.Tensor, ...]]:
+        """
+        Args:
+            sample (`torch.Tensor`):
+                Noisy image `(batch_size, 3, height, width)` in `[-1, 1]`.
+            timestep (`torch.Tensor` or `float` or `int`):
+                Diffusion timestep indices (respaced indices during ADM-G sampling).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return an [`ADMClassifierOutput`].
+        Returns:
+            [`ADMClassifierOutput`] or `tuple`:
+                Classifier logits.
+        """
+        if not torch.is_tensor(timestep):
+            timestep = torch.tensor([timestep], device=sample.device, dtype=torch.long)
+        elif timestep.ndim == 0:
+            timestep = timestep.reshape(1).to(device=sample.device)
+        if timestep.shape[0] == 1 and sample.shape[0] > 1:
+            timestep = timestep.expand(sample.shape[0])
+        logits = self.model(sample, timestep)
+        if not return_dict:
+            return (logits,)
+        return ADMClassifierOutput(logits=logits)
+    def guidance_gradient(
+        self,
+        sample: torch.Tensor,
+        timestep: torch.Tensor,
+        class_labels: torch.Tensor,
+        classifier_scale: float = 1.0,
+    ) -> torch.Tensor:
+        """
+        Compute `classifier_scale * grad_x log p(y | x_t)` for classifier guidance (ADM-G).
+        Args:
+            sample (`torch.Tensor`):
+                Current noisy sample `x_t`.
+            timestep (`torch.Tensor`):
+                Respaced diffusion timestep indices.
+            class_labels (`torch.Tensor`):
+                Target ImageNet class indices of shape `(batch_size,)`.
+            classifier_scale (`float`, *optional*, defaults to 1.0):
+                Guidance strength (OpenAI `classifier_scale`).
+        Returns:
+            `torch.Tensor`:
+                Gradient with respect to `sample`, same shape as `sample`.
+        """
+        with torch.enable_grad():
+            x_in = sample.detach().requires_grad_(True)
+            logits = self.model(x_in, timestep)
+            log_probs = F.log_softmax(logits, dim=-1)
+            selected = log_probs[torch.arange(logits.shape[0], device=logits.device), class_labels.view(-1)]
+            grad = torch.autograd.grad(selected.sum(), x_in)[0]
+        return grad * classifier_scale

ADM-G-512/classifier/config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_class_name": "ADMClassifierModel",
+  "_diffusers_version": "0.36.0",
+  "classifier_attention_resolutions": "32,16,8",
+  "classifier_depth": 2,
+  "classifier_pool": "attention",
+  "classifier_resblock_updown": true,
+  "classifier_use_scale_shift_norm": true,
+  "classifier_width": 128,
+  "image_size": 512,
+  "num_classes": 1000,
+  "use_fp16": true
+}

ADM-G-512/classifier/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7fcf8bb2545d8f93e0de2915c3144c78532c0707709ccf81366b9fbf22cb384
+size 217824392

ADM-G-512/classifier/modeling_adm.py ADDED Viewed

	@@ -0,0 +1,772 @@

+import math
+from abc import abstractmethod
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint as torch_checkpoint
+# Number of class labels given to UNetModel when create_adm_unet_model is called with class_cond=True.
+NUM_CLASSES = 1000
+def conv_nd(dims: int, *args, **kwargs):
+    if dims == 1:
+        return nn.Conv1d(*args, **kwargs)
+    if dims == 2:
+        return nn.Conv2d(*args, **kwargs)
+    if dims == 3:
+        return nn.Conv3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+def linear(*args, **kwargs):
+    return nn.Linear(*args, **kwargs)
+def avg_pool_nd(dims: int, *args, **kwargs):
+    if dims == 1:
+        return nn.AvgPool1d(*args, **kwargs)
+    if dims == 2:
+        return nn.AvgPool2d(*args, **kwargs)
+    if dims == 3:
+        return nn.AvgPool3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+class GroupNorm32(nn.GroupNorm):
+    def forward(self, x):
+        return super().forward(x.float()).type(x.dtype)
+def normalization(channels: int):
+    return GroupNorm32(32, channels)
+def zero_module(module: nn.Module):
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+def timestep_embedding(timesteps: torch.Tensor, dim: int, max_period: int = 10000):
+    half = dim // 2
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
+        device=timesteps.device
+    )
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+def convert_module_to_f16(module: nn.Module):
+    if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        module.weight.data = module.weight.data.half()
+        if module.bias is not None:
+            module.bias.data = module.bias.data.half()
+def convert_module_to_f32(module: nn.Module):
+    if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        module.weight.data = module.weight.data.float()
+        if module.bias is not None:
+            module.bias.data = module.bias.data.float()
+class TimestepBlock(nn.Module):
+    """Base class for modules whose `forward` takes an embedding `emb` as a second argument."""
+    # NOTE(review): nn.Module does not appear to use ABCMeta, so @abstractmethod is probably not
+    # enforced at instantiation time; the NotImplementedError below is the effective guard — confirm.
+    @abstractmethod
+    def forward(self, x, emb):
+        """Apply the module to `x` given the embedding `emb`; subclasses must override this."""
+        raise NotImplementedError
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    def forward(self, x, emb):
+        for layer in self:
+            if isinstance(layer, TimestepBlock):
+                x = layer(x, emb)
+            else:
+                x = layer(x)
+        return x
+class Upsample(nn.Module):
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        if use_conv:
+            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        if self.dims == 3:
+            x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest")
+        else:
+            x = F.interpolate(x, scale_factor=2, mode="nearest")
+        if self.use_conv:
+            x = self.conv(x)
+        return x
+class Downsample(nn.Module):
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        stride = 2 if dims != 3 else (1, 2, 2)
+        if use_conv:
+            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=1)
+        else:
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        return self.op(x)
+class ResBlock(TimestepBlock):
+    def __init__(
+        self,
+        channels,
+        emb_channels,
+        dropout,
+        out_channels=None,
+        use_conv=False,
+        use_scale_shift_norm=False,
+        dims=2,
+        use_checkpoint=False,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_checkpoint = use_checkpoint
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            conv_nd(dims, channels, self.out_channels, 3, padding=1),
+        )
+        self.updown = up or down
+        if up:
+            self.h_upd = Upsample(channels, False, dims)
+            self.x_upd = Upsample(channels, False, dims)
+        elif down:
+            self.h_upd = Downsample(channels, False, dims)
+            self.x_upd = Downsample(channels, False, dims)
+        else:
+            self.h_upd = self.x_upd = nn.Identity()
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            linear(emb_channels, 2 * self.out_channels if use_scale_shift_norm else self.out_channels),
+        )
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)),
+        )
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        elif use_conv:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
+        else:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+    def forward(self, x, emb):
+        if self.use_checkpoint and x.requires_grad:
+            return torch_checkpoint(self._forward, x, emb, use_reentrant=False)
+        return self._forward(x, emb)
+    def _forward(self, x, emb):
+        if self.updown:
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+        else:
+            h = self.in_layers(x)
+        emb_out = self.emb_layers(emb).type(h.dtype)
+        while len(emb_out.shape) < len(h.shape):
+            emb_out = emb_out[..., None]
+        if self.use_scale_shift_norm:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
+            h = out_norm(h) * (1 + scale) + shift
+            h = out_rest(h)
+        else:
+            h = h + emb_out
+            h = self.out_layers(h)
+        return self.skip_connection(x) + h
+class QKVAttentionLegacy(nn.Module):
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = torch.einsum("bct,bcs->bts", q * scale, k * scale)
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = torch.einsum("bts,bcs->bct", weight, v)
+        return a.reshape(bs, -1, length)
+class QKVAttention(nn.Module):
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        q, k, v = qkv.chunk(3, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = torch.einsum(
+            "bct,bcs->bts",
+            (q * scale).view(bs * self.n_heads, ch, length),
+            (k * scale).view(bs * self.n_heads, ch, length),
+        )
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = torch.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
+        return a.reshape(bs, -1, length)
+class AttentionBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        num_heads=1,
+        num_head_channels=-1,
+        use_checkpoint=False,
+        use_new_attention_order=False,
+    ):
+        super().__init__()
+        if num_head_channels == -1:
+            self.num_heads = num_heads
+        else:
+            assert channels % num_head_channels == 0
+            self.num_heads = channels // num_head_channels
+        self.use_checkpoint = use_checkpoint
+        self.norm = normalization(channels)
+        self.qkv = conv_nd(1, channels, channels * 3, 1)
+        self.attention = QKVAttention(self.num_heads) if use_new_attention_order else QKVAttentionLegacy(self.num_heads)
+        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
+    def forward(self, x):
+        if self.use_checkpoint and x.requires_grad:
+            return torch_checkpoint(self._forward, x, use_reentrant=False)
+        return self._forward(x)
+    def _forward(self, x):
+        b, c, *spatial = x.shape
+        x = x.reshape(b, c, -1)
+        qkv = self.qkv(self.norm(x))
+        h = self.attention(qkv)
+        h = self.proj_out(h)
+        return (x + h).reshape(b, c, *spatial)
+class AttentionPool2d(nn.Module):
+    """CLIP-style attention pooling used by ADM noisy classifiers."""
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads_channels: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5)
+        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+        self.num_heads = embed_dim // num_heads_channels
+        self.attention = QKVAttention(self.num_heads)
+    def forward(self, x):
+        b, c, *_spatial = x.shape
+        x = x.reshape(b, c, -1)
+        x = torch.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)
+        x = x + self.positional_embedding[None, :, :].to(x.dtype)
+        x = self.qkv_proj(x)
+        x = self.attention(x)
+        x = self.c_proj(x)
+        return x[:, :, 0]
+class EncoderUNetModel(nn.Module):
+    """Noisy image classifier backbone for ADM-G (classifier guidance).
+    This is the downsampling half of the UNet (input blocks plus middle block) followed by a
+    pooling head selected by `pool`: "adaptive", "attention", "spatial" or "spatial_v2".
+    `attention_resolutions` is compared against the running downsample factor `ds`, so it must
+    contain downsample factors (1, 2, 4, ...) rather than pixel resolutions.
+    """
+    def __init__(
+        self,
+        image_size,
+        in_channels,
+        model_channels,
+        out_channels,
+        num_res_blocks,
+        attention_resolutions,
+        dropout=0,
+        channel_mult=(1, 2, 4, 8),
+        conv_resample=True,
+        dims=2,
+        use_checkpoint=False,
+        use_fp16=False,
+        num_heads=1,
+        num_head_channels=-1,
+        use_scale_shift_norm=False,
+        resblock_updown=False,
+        use_new_attention_order=False,
+        pool="adaptive",
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        self.num_res_blocks = num_res_blocks
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.use_checkpoint = use_checkpoint
+        # dtype the input is cast to in forward(); the constructor itself does not convert any weights.
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        # MLP applied to the sinusoidal timestep embedding.
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+        # int() is needed because channel multipliers may be fractional (e.g. 0.5).
+        ch = int(channel_mult[0] * model_channels)
+        self.input_blocks = nn.ModuleList([TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))])
+        # Running sum of the channel counts of every block output; the "spatial" heads use it as
+        # the input width of their first Linear layer.
+        self._feature_size = ch
+        # NOTE(review): input_block_chans is filled but never read in this class.
+        input_block_chans = [ch]
+        # Cumulative downsample factor at the current level.
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=int(mult * model_channels),
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(mult * model_channels)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            # Every level except the last ends with a 2x downsampling block.
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+                self._feature_size += ch
+        # Bottleneck: ResBlock -> attention -> ResBlock at the lowest resolution.
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            AttentionBlock(
+                ch,
+                use_checkpoint=use_checkpoint,
+                num_heads=num_heads,
+                num_head_channels=num_head_channels,
+                use_new_attention_order=use_new_attention_order,
+            ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self._feature_size += ch
+        self.pool = pool
+        if pool == "adaptive":
+            # Global average pool followed by a zero-initialised 1x1 convolution.
+            self.out = nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                nn.AdaptiveAvgPool2d((1, 1)),
+                zero_module(conv_nd(dims, ch, out_channels, 1)),
+                nn.Flatten(),
+            )
+        elif pool == "attention":
+            # Attention pooling needs an explicit per-head channel count.
+            assert num_head_channels != -1
+            # image_size // ds is the side length of the feature map after all downsampling steps,
+            # which fixes the size of the positional embedding inside AttentionPool2d.
+            self.out = nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels),
+            )
+        elif pool == "spatial":
+            self.out = nn.Sequential(
+                nn.Linear(self._feature_size, 2048),
+                nn.ReLU(),
+                nn.Linear(2048, out_channels),
+            )
+        elif pool == "spatial_v2":
+            self.out = nn.Sequential(
+                nn.Linear(self._feature_size, 2048),
+                normalization(2048),
+                nn.SiLU(),
+                nn.Linear(2048, out_channels),
+            )
+        else:
+            raise NotImplementedError(f"Unexpected {pool} pooling")
+    def convert_to_fp16(self):
+        """Cast the convolutions inside `input_blocks` and `middle_block` to float16."""
+        # time_embed and the output head are not touched here.
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self):
+        """Cast the convolutions inside `input_blocks` and `middle_block` back to float32."""
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x, timesteps):
+        """Run the encoder on `x` at `timesteps` and return the output of the pooling head.
+        `timesteps` must be a 1-D tensor (it is indexed as `timesteps[:, None]` when embedded).
+        """
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+        results = []
+        h = x.type(self.dtype)
+        for module in self.input_blocks:
+            h = module(h, emb)
+            if self.pool.startswith("spatial"):
+                # Spatial heads collect the mean over axes 2 and 3 of every intermediate feature map.
+                results.append(h.type(x.dtype).mean(dim=(2, 3)))
+        h = self.middle_block(h, emb)
+        if self.pool.startswith("spatial"):
+            results.append(h.type(x.dtype).mean(dim=(2, 3)))
+            # The concatenated width equals self._feature_size accumulated in __init__.
+            h = torch.cat(results, dim=-1)
+            return self.out(h)
+        # Other heads run on the middle-block output, cast back to the input dtype.
+        h = h.type(x.dtype)
+        return self.out(h)
+class UNetModel(nn.Module):
+    """Full UNet with timestep conditioning and optional class conditioning.
+    It consists of downsampling input blocks, a middle block, and upsampling output blocks that
+    consume the input-block activations as skip connections. When `num_classes` is given, a
+    learned label embedding is added to the timestep embedding.
+    `attention_resolutions` is compared against the running downsample factor `ds`, so it must
+    contain downsample factors (1, 2, 4, ...) rather than pixel resolutions.
+    """
+    def __init__(
+        self,
+        image_size,
+        in_channels,
+        model_channels,
+        out_channels,
+        num_res_blocks,
+        attention_resolutions,
+        dropout=0,
+        channel_mult=(1, 2, 4, 8),
+        conv_resample=True,
+        dims=2,
+        num_classes=None,
+        use_checkpoint=False,
+        use_fp16=False,
+        num_heads=1,
+        num_head_channels=-1,
+        num_heads_upsample=-1,
+        use_scale_shift_norm=False,
+        resblock_updown=False,
+        use_new_attention_order=False,
+    ):
+        super().__init__()
+        # -1 means "use the same head count as the downsampling path".
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+        self.model_channels = model_channels
+        self.num_classes = num_classes
+        # dtype the input is cast to in forward(); the constructor itself does not convert any weights.
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        # MLP applied to the sinusoidal timestep embedding.
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+        if self.num_classes is not None:
+            self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+        # int() is needed because channel multipliers may be fractional (e.g. 0.5).
+        ch = input_ch = int(channel_mult[0] * model_channels)
+        self.input_blocks = nn.ModuleList([TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))])
+        # Stack of the output channel count of every input block; popped in reverse when the
+        # output blocks are built so each one knows the width of its skip connection.
+        input_block_chans = [ch]
+        # Cumulative downsample factor at the current level.
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=int(mult * model_channels),
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(mult * model_channels)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                input_block_chans.append(ch)
+            # Every level except the last ends with a 2x downsampling block.
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+        # Bottleneck: ResBlock -> attention -> ResBlock at the lowest resolution.
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            AttentionBlock(
+                ch,
+                use_checkpoint=use_checkpoint,
+                num_heads=num_heads,
+                num_head_channels=num_head_channels,
+                use_new_attention_order=use_new_attention_order,
+            ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self.output_blocks = nn.ModuleList([])
+        # Walk the levels in reverse; each level has one more block than the input side so that
+        # every entry pushed onto input_block_chans is consumed exactly once.
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(num_res_blocks + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    ResBlock(
+                        # Input width = current features + concatenated skip connection.
+                        ch + ich,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=int(model_channels * mult),
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(model_channels * mult)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads_upsample,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                # The last block of every level except level 0 also upsamples by 2x.
+                if level and i == num_res_blocks:
+                    out_ch = ch
+                    layers.append(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True,
+                        )
+                        if resblock_updown
+                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                    ds //= 2
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+        # After the loop ch == int(model_channels * channel_mult[0]) == input_ch, so the norm and
+        # the zero-initialised output convolution agree on the channel count.
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
+        )
+    def convert_to_fp16(self):
+        """Cast the convolutions inside the input, middle and output blocks to float16."""
+        # time_embed, label_emb and the final `out` head are not touched here.
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+        self.output_blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self):
+        """Cast the convolutions inside the input, middle and output blocks back to float32."""
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+        self.output_blocks.apply(convert_module_to_f32)
+    def forward(self, x, timesteps, y: Optional[torch.Tensor] = None):
+        """Run the UNet on `x` at `timesteps`, optionally conditioned on class labels `y`.
+        `timesteps` must be a 1-D tensor (it is indexed as `timesteps[:, None]` when embedded).
+        `y` must be given if and only if the model was built with `num_classes`, and must then
+        have shape `(batch_size,)`; both conditions are enforced with `assert`.
+        Returns the output of the final head, computed in the dtype of `x`.
+        """
+        assert (y is not None) == (self.num_classes is not None)
+        # Stack of input-block activations, consumed in reverse as skip connections.
+        hs = []
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+        if self.num_classes is not None:
+            assert y.shape == (x.shape[0],)
+            emb = emb + self.label_emb(y)
+        h = x.type(self.dtype)
+        for module in self.input_blocks:
+            h = module(h, emb)
+            hs.append(h)
+        h = self.middle_block(h, emb)
+        for module in self.output_blocks:
+            # Concatenate the matching skip activation along the channel axis.
+            h = torch.cat([h, hs.pop()], dim=1)
+            h = module(h, emb)
+        h = h.type(x.dtype)
+        return self.out(h)
+def _default_channel_mult(image_size: int):
+    if image_size == 512:
+        return (0.5, 1, 1, 2, 2, 4, 4)
+    if image_size == 256:
+        return (1, 1, 2, 2, 4, 4)
+    if image_size == 128:
+        return (1, 1, 2, 3, 4)
+    if image_size == 64:
+        return (1, 2, 3, 4)
+    raise ValueError(f"unsupported image size: {image_size}")
+def create_adm_unet_model(
+    image_size,
+    num_channels,
+    num_res_blocks,
+    channel_mult="",
+    learn_sigma=False,
+    class_cond=False,
+    use_checkpoint=False,
+    attention_resolutions="16",
+    num_heads=1,
+    num_head_channels=-1,
+    num_heads_upsample=-1,
+    use_scale_shift_norm=False,
+    dropout=0.0,
+    resblock_updown=False,
+    use_fp16=False,
+    use_new_attention_order=False,
+):
+    channel_mult = _default_channel_mult(image_size) if channel_mult == "" else tuple(int(v) for v in channel_mult.split(","))
+    attention_ds = tuple(image_size // int(res) for res in attention_resolutions.split(","))
+    return UNetModel(
+        image_size=image_size,
+        in_channels=3,
+        model_channels=num_channels,
+        out_channels=(3 if not learn_sigma else 6),
+        num_res_blocks=num_res_blocks,
+        attention_resolutions=attention_ds,
+        dropout=dropout,
+        channel_mult=channel_mult,
+        num_classes=(NUM_CLASSES if class_cond else None),
+        use_checkpoint=use_checkpoint,
+        use_fp16=use_fp16,
+        num_heads=num_heads,
+        num_head_channels=num_head_channels,
+        num_heads_upsample=num_heads_upsample,
+        use_scale_shift_norm=use_scale_shift_norm,
+        resblock_updown=resblock_updown,
+        use_new_attention_order=use_new_attention_order,
+    )
+def create_adm_classifier_model(
+    image_size: int,
+    classifier_width: int = 128,
+    classifier_depth: int = 2,
+    classifier_attention_resolutions: str = "32,16,8",
+    classifier_use_scale_shift_norm: bool = True,
+    classifier_resblock_updown: bool = True,
+    classifier_pool: str = "attention",
+    use_fp16: bool = False,
+    num_classes: int = NUM_CLASSES,
+):
+    channel_mult = _default_channel_mult(image_size)
+    attention_ds = tuple(image_size // int(res) for res in classifier_attention_resolutions.split(","))
+    return EncoderUNetModel(
+        image_size=image_size,
+        in_channels=3,
+        model_channels=classifier_width,
+        out_channels=num_classes,
+        num_res_blocks=classifier_depth,
+        attention_resolutions=attention_ds,
+        channel_mult=channel_mult,
+        use_fp16=use_fp16,
+        num_head_channels=64,
+        use_scale_shift_norm=classifier_use_scale_shift_norm,
+        resblock_updown=classifier_resblock_updown,
+        pool=classifier_pool,
+    )

ADM-G-512/scheduler/__pycache__/scheduling_adm.cpython-312.pyc ADDED Viewed

Binary file (33.7 kB). View file

ADM-G-512/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "_class_name": "ADMScheduler",
+  "_diffusers_version": "0.36.0",
+  "learn_sigma": true,
+  "noise_schedule": "linear",
+  "predict_xstart": false,
+  "rescale_timesteps": false,
+  "sigma_small": false,
+  "steps": 1000,
+  "timestep_respacing": ""
+}

ADM-G-512/scheduler/scheduling_adm.py ADDED Viewed

	@@ -0,0 +1,590 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+import enum
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.utils import BaseOutput
+try:
+    from diffusers.utils.torch_utils import randn_tensor
+except ImportError:  # pragma: no cover
+    # Fallback used when `diffusers.utils.torch_utils.randn_tensor` cannot be imported.
+    def randn_tensor(shape, generator=None, device=None, dtype=None):
+        """Draw standard-normal noise with `torch.randn`, accepting the keyword arguments this module passes."""
+        return torch.randn(shape, generator=generator, device=device, dtype=dtype)
+# ---------------------------------------------------------------------------
+# Internal diffusion math (OpenAI ADM / improved-diffusion)
+# ---------------------------------------------------------------------------
+def _randn_like(tensor: torch.Tensor, generator: Optional[torch.Generator] = None) -> torch.Tensor:
+    return randn_tensor(tensor.shape, generator=generator, device=tensor.device, dtype=tensor.dtype)
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    res = torch.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res.expand(broadcast_shape)
+def _get_named_beta_schedule(schedule_name: str, num_diffusion_timesteps: int):
+    if schedule_name == "linear":
+        scale = 1000 / num_diffusion_timesteps
+        return np.linspace(scale * 0.0001, scale * 0.02, num_diffusion_timesteps, dtype=np.float64)
+    if schedule_name == "cosine":
+        return _betas_for_alpha_bar(
+            num_diffusion_timesteps,
+            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
+        )
+    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
+def _betas_for_alpha_bar(num_diffusion_timesteps: int, alpha_bar, max_beta: float = 0.999):
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas)
+def _space_timesteps(num_timesteps: int, section_counts):
+    if isinstance(section_counts, str):
+        if section_counts.startswith("ddim"):
+            desired_count = int(section_counts[len("ddim") :])
+            for i in range(1, num_timesteps):
+                if len(range(0, num_timesteps, i)) == desired_count:
+                    return set(range(0, num_timesteps, i))
+            raise ValueError(f"cannot create exactly {num_timesteps} steps with an integer stride")
+        section_counts = [int(x) for x in section_counts.split(",")]
+    size_per = num_timesteps // len(section_counts)
+    extra = num_timesteps % len(section_counts)
+    start_idx = 0
+    all_steps = []
+    for i, section_count in enumerate(section_counts):
+        size = size_per + (1 if i < extra else 0)
+        if size < section_count:
+            raise ValueError(f"cannot divide section of {size} steps into {section_count}")
+        frac_stride = 1 if section_count <= 1 else (size - 1) / (section_count - 1)
+        cur_idx = 0.0
+        for _ in range(section_count):
+            all_steps.append(start_idx + round(cur_idx))
+            cur_idx += frac_stride
+        start_idx += size
+    return set(all_steps)
+class _ModelMeanType(enum.Enum):
+    PREVIOUS_X = enum.auto()
+    START_X = enum.auto()
+    EPSILON = enum.auto()
+class _ModelVarType(enum.Enum):
+    LEARNED = enum.auto()
+    FIXED_SMALL = enum.auto()
+    FIXED_LARGE = enum.auto()
+    LEARNED_RANGE = enum.auto()
+class _GaussianDiffusion:
+    """
+    Sampling-side Gaussian diffusion: precomputed schedule coefficients plus DDPM and DDIM update rules.
+    All schedule tables are float64 numpy arrays with one entry per timestep; `_extract_into_tensor` turns them
+    into tensors on demand.
+    NOTE(review): `_ModelVarType.LEARNED` has no branch in `p_mean_variance_from_output`; selecting it would fail
+    in the dict lookup there with `KeyError`. Confirm it is never used.
+    """
+    def __init__(self, *, betas, model_mean_type, model_var_type, rescale_timesteps: bool = False):
+        """
+        Precompute the schedule tables derived from `betas`.
+        Args:
+            betas: Sequence of per-step betas; converted to a float64 numpy array.
+            model_mean_type (`_ModelMeanType`): How the network output maps to the predicted `x_0`.
+            model_var_type (`_ModelVarType`): How the reverse-process variance is obtained.
+            rescale_timesteps (`bool`): If `True`, timesteps passed to the model are scaled by
+                `1000 / num_timesteps`.
+        """
+        self.model_mean_type = model_mean_type
+        self.model_var_type = model_var_type
+        self.rescale_timesteps = rescale_timesteps
+        betas = np.array(betas, dtype=np.float64)
+        self.betas = betas
+        self.num_timesteps = int(betas.shape[0])
+        alphas = 1.0 - betas
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+        # Cumulative product shifted by one step, with 1.0 standing in before the first step.
+        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
+        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
+        self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        # posterior_variance[0] is 0 (alphas_cumprod_prev[0] == 1), so entry 1 is reused to keep the log finite.
+        self.posterior_log_variance_clipped = np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:]))
+        self.posterior_mean_coef1 = betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
+    def _predict_xstart_from_eps(self, x_t, t, eps):
+        """Recover the predicted `x_0` from `x_t` and a noise estimate `eps` at timesteps `t`."""
+        return _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - _extract_into_tensor(
+            self.sqrt_recipm1_alphas_cumprod, t, x_t.shape
+        ) * eps
+    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+        """Inverse of `_predict_xstart_from_eps`: the noise implied by `x_t` and a predicted `x_0`."""
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+    def _predict_xstart_from_xprev(self, x_t, t, xprev):
+        """Solve the posterior-mean equation for `x_0` given `x_t` and a predicted previous sample `xprev`."""
+        return _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev - _extract_into_tensor(
+            self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
+        ) * x_t
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        """
+        Compute the posterior over the previous sample given `x_start` and `x_t`.
+        Returns:
+            `tuple`: `(posterior_mean, posterior_variance, posterior_log_variance_clipped)`, each broadcast to
+            the shape of `x_t`.
+        """
+        posterior_mean = _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + _extract_into_tensor(
+            self.posterior_mean_coef2, t, x_t.shape
+        ) * x_t
+        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
+        posterior_log_variance_clipped = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
+        return posterior_mean, posterior_variance, posterior_log_variance_clipped
+    def p_mean_variance_from_output(
+        self,
+        model_output: torch.Tensor,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        clip_denoised: bool = True,
+    ):
+        """
+        Turn a raw network output into the reverse-process mean and variance at timesteps `t`.
+        Args:
+            model_output (`torch.Tensor`): Network output. With `LEARNED_RANGE` it is split along dim 1 into
+                chunks of `x.shape[1]` channels: the mean prediction and the variance interpolation values.
+            x (`torch.Tensor`): Current sample.
+            t (`torch.Tensor`): Timestep indices into the schedule tables.
+            clip_denoised (`bool`): Whether to clamp the predicted `x_0` to `[-1, 1]`.
+        Returns:
+            `dict`: Keys `"mean"`, `"variance"`, `"log_variance"` and `"pred_xstart"`.
+        """
+        _, c = x.shape[:2]
+        if self.model_var_type == _ModelVarType.LEARNED_RANGE:
+            model_output, model_var_values = torch.split(model_output, c, dim=1)
+            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
+            max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
+            # Map the variance values from [-1, 1] to an interpolation weight in [0, 1].
+            frac = (model_var_values + 1) / 2
+            model_log_variance = frac * max_log + (1 - frac) * min_log
+            model_variance = torch.exp(model_log_variance)
+        else:
+            # Only the two fixed variance types are present in this table.
+            model_variance, model_log_variance = {
+                _ModelVarType.FIXED_LARGE: (
+                    np.append(self.posterior_variance[1], self.betas[1:]),
+                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
+                ),
+                _ModelVarType.FIXED_SMALL: (self.posterior_variance, self.posterior_log_variance_clipped),
+            }[self.model_var_type]
+            model_variance = _extract_into_tensor(model_variance, t, x.shape)
+            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
+        if self.model_mean_type == _ModelMeanType.START_X:
+            pred_xstart = model_output
+        elif self.model_mean_type == _ModelMeanType.EPSILON:
+            pred_xstart = self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
+        else:
+            pred_xstart = self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
+        if clip_denoised:
+            pred_xstart = pred_xstart.clamp(-1, 1)
+        # The mean is always recomputed from the (possibly clamped) x_0 prediction.
+        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
+        return {"mean": model_mean, "variance": model_variance, "log_variance": model_log_variance, "pred_xstart": pred_xstart}
+    def p_mean_variance(self, model, x, t, clip_denoised: bool = True, model_kwargs=None):
+        """Call `model(x, ts, **model_kwargs)` and convert its output with `p_mean_variance_from_output`."""
+        model_kwargs = {} if model_kwargs is None else model_kwargs
+        if self.rescale_timesteps:
+            ts = t.float() * (1000.0 / self.num_timesteps)
+        else:
+            ts = t
+        model_output = model(x, ts, **model_kwargs)
+        return self.p_mean_variance_from_output(model_output, x, t, clip_denoised=clip_denoised)
+    def condition_mean(self, cond_grad: torch.Tensor, p_mean_var: dict, x: torch.Tensor) -> torch.Tensor:
+        """Apply classifier guidance to the reverse-process mean (Sohl-Dickstein et al., 2015)."""
+        del x
+        return p_mean_var["mean"].float() + p_mean_var["variance"] * cond_grad.float()
+    def p_sample_from_output(
+        self,
+        model_output: torch.Tensor,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        clip_denoised: bool = True,
+        generator: Optional[torch.Generator] = None,
+        cond_grad: Optional[torch.Tensor] = None,
+    ):
+        """
+        Take one ancestral (DDPM) sampling step from an already computed network output.
+        Args:
+            model_output (`torch.Tensor`): Network output for `x` at `t`.
+            x (`torch.Tensor`): Current sample.
+            t (`torch.Tensor`): Timestep indices.
+            clip_denoised (`bool`): Whether to clamp the predicted `x_0` to `[-1, 1]`.
+            generator (`torch.Generator`, *optional*): Random generator for the added noise.
+            cond_grad (`torch.Tensor`, *optional*): Guidance gradient; when given the mean is shifted with
+                `condition_mean`.
+        Returns:
+            `dict`: Keys `"sample"` and `"pred_xstart"`.
+        """
+        out = self.p_mean_variance_from_output(model_output, x, t, clip_denoised=clip_denoised)
+        if cond_grad is not None:
+            out["mean"] = self.condition_mean(cond_grad, out, x)
+        noise = _randn_like(x, generator=generator)
+        # No noise is added for entries where t == 0.
+        nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        sample = out["mean"] + nonzero_mask * torch.exp(0.5 * out["log_variance"]) * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def p_sample(self, model, x, t, clip_denoised=True, model_kwargs=None, generator: Optional[torch.Generator] = None):
+        """Take one ancestral (DDPM) sampling step, running `model` to obtain the mean and variance."""
+        out = self.p_mean_variance(model, x, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs)
+        noise = _randn_like(x, generator=generator)
+        # No noise is added for entries where t == 0.
+        nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        sample = out["mean"] + nonzero_mask * torch.exp(0.5 * out["log_variance"]) * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def p_sample_loop(self, model, shape, noise=None, clip_denoised=True, model_kwargs=None, device=None, progress=False):
+        """Run the full ancestral sampling chain and return the final `"sample"` tensor."""
+        final = None
+        for sample in self.p_sample_loop_progressive(
+            model, shape, noise=noise, clip_denoised=clip_denoised, model_kwargs=model_kwargs, device=device, progress=progress
+        ):
+            final = sample
+        return final["sample"]
+    def p_sample_loop_progressive(self, model, shape, noise=None, clip_denoised=True, model_kwargs=None, device=None, progress=False):
+        """
+        Generator over the ancestral sampling chain, yielding the `p_sample` result of every step.
+        Starts from `noise` (or fresh `torch.randn` noise of `shape`) and walks the timesteps from
+        `num_timesteps - 1` down to 0. If `device` is `None`, the device of the model's first parameter is used.
+        No `generator` is forwarded to `p_sample`, so the per-step noise uses the global RNG.
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        img = noise if noise is not None else torch.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = torch.tensor([i] * shape[0], device=device)
+            with torch.no_grad():
+                out = self.p_sample(model, img, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs)
+                yield out
+                img = out["sample"]
+    def condition_score(self, cond_grad: torch.Tensor, p_mean_var: dict, x: torch.Tensor, t: torch.Tensor) -> dict:
+        """Apply classifier guidance to the score (Song et al., 2020) for DDIM."""
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
+        eps = eps - (1 - alpha_bar).sqrt() * cond_grad
+        # Work on a shallow copy so the caller's dict is not mutated.
+        out = dict(p_mean_var)
+        out["pred_xstart"] = self._predict_xstart_from_eps(x_t=x, t=t, eps=eps)
+        out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
+        return out
+    def ddim_sample_from_output(
+        self,
+        model_output: torch.Tensor,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        clip_denoised: bool = True,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        cond_grad: Optional[torch.Tensor] = None,
+    ):
+        """
+        Take one DDIM sampling step from an already computed network output.
+        Args:
+            model_output (`torch.Tensor`): Network output for `x` at `t`.
+            x (`torch.Tensor`): Current sample.
+            t (`torch.Tensor`): Timestep indices.
+            clip_denoised (`bool`): Whether to clamp the predicted `x_0` to `[-1, 1]`.
+            eta (`float`): Scales the stochastic term `sigma`; with `0.0` no noise is added.
+            generator (`torch.Generator`, *optional*): Random generator for the added noise.
+            cond_grad (`torch.Tensor`, *optional*): Guidance gradient; when given it is applied with
+                `condition_score`.
+        Returns:
+            `dict`: Keys `"sample"` and `"pred_xstart"`.
+        """
+        out = self.p_mean_variance_from_output(model_output, x, t, clip_denoised=clip_denoised)
+        if cond_grad is not None:
+            out = self.condition_score(cond_grad, out, x, t)
+        pred_xstart = out["pred_xstart"]
+        # Re-derive eps from the (possibly clamped or guided) x_0 prediction.
+        eps = self._predict_eps_from_xstart(x, t, pred_xstart)
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
+        sigma = eta * torch.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * torch.sqrt(1 - alpha_bar / alpha_bar_prev)
+        noise = _randn_like(x, generator=generator)
+        mean_pred = pred_xstart * torch.sqrt(alpha_bar_prev) + torch.sqrt(1 - alpha_bar_prev - sigma**2) * eps
+        # No noise is added for entries where t == 0.
+        nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        sample = mean_pred + nonzero_mask * sigma * noise
+        return {"sample": sample, "pred_xstart": pred_xstart}
+    def ddim_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        model_kwargs=None,
+        eta=0.0,
+        generator: Optional[torch.Generator] = None,
+    ):
+        """Run `model` on `x` and take one DDIM step via `ddim_sample_from_output` (without guidance)."""
+        model_kwargs = {} if model_kwargs is None else model_kwargs
+        if self.rescale_timesteps:
+            ts = t.float() * (1000.0 / self.num_timesteps)
+        else:
+            ts = t
+        model_output = model(x, ts, **model_kwargs)
+        return self.ddim_sample_from_output(
+            model_output, x, t, clip_denoised=clip_denoised, eta=eta, generator=generator
+        )
+class _WrappedModel:
+    def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
+        self.model = model
+        self.timestep_map = timestep_map
+        self.rescale_timesteps = rescale_timesteps
+        self.original_num_steps = original_num_steps
+    def __call__(self, x, ts, **kwargs):
+        map_tensor = torch.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
+        new_ts = map_tensor[ts]
+        if self.rescale_timesteps:
+            new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
+        return self.model(x, new_ts, **kwargs)
+class _SpacedDiffusion(_GaussianDiffusion):
+    def __init__(self, use_timesteps, **kwargs):
+        self.use_timesteps = set(use_timesteps)
+        self.timestep_map = []
+        self.original_num_steps = len(kwargs["betas"])
+        base_diffusion = _GaussianDiffusion(**kwargs)
+        last_alpha_cumprod = 1.0
+        new_betas = []
+        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
+            if i in self.use_timesteps:
+                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
+                last_alpha_cumprod = alpha_cumprod
+                self.timestep_map.append(i)
+        kwargs["betas"] = np.array(new_betas)
+        super().__init__(**kwargs)
+    def p_mean_variance(self, model, *args, **kwargs):
+        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
+    def _wrap_model(self, model):
+        if isinstance(model, _WrappedModel):
+            return model
+        return _WrappedModel(model, self.timestep_map, self.rescale_timesteps, self.original_num_steps)
+def _create_spaced_diffusion(
+    *,
+    steps: int = 1000,
+    learn_sigma: bool = False,
+    sigma_small: bool = False,
+    noise_schedule: str = "linear",
+    predict_xstart: bool = False,
+    rescale_timesteps: bool = False,
+    timestep_respacing: str = "",
+) -> _SpacedDiffusion:
+    betas = _get_named_beta_schedule(noise_schedule, steps)
+    if not timestep_respacing:
+        timestep_respacing = [steps]
+    return _SpacedDiffusion(
+        use_timesteps=_space_timesteps(steps, timestep_respacing),
+        betas=betas,
+        model_mean_type=_ModelMeanType.EPSILON if not predict_xstart else _ModelMeanType.START_X,
+        model_var_type=(_ModelVarType.FIXED_LARGE if not sigma_small else _ModelVarType.FIXED_SMALL)
+        if not learn_sigma
+        else _ModelVarType.LEARNED_RANGE,
+        rescale_timesteps=rescale_timesteps,
+    )
+# ---------------------------------------------------------------------------
+# Public Diffusers scheduler API
+# ---------------------------------------------------------------------------
+@dataclass
+class ADMSchedulerOutput(BaseOutput):
+    """
+    Output class for the ADM scheduler's `step` function.
+    Args:
+        prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            Computed sample `(x_{t-1})` of the previous timestep. `prev_sample` should be used as the next model input.
+        pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
+            The predicted denoised sample `(x_{0})` based on the model output.
+    """
+    # Filled from the "sample" entry of the diffusion step result in `ADMScheduler.step`.
+    prev_sample: torch.FloatTensor
+    # Filled from the "pred_xstart" entry of the diffusion step result; `None` when absent.
+    pred_original_sample: Optional[torch.FloatTensor] = None
+class ADMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    DDPM / DDIM scheduler for ADM (Ablated Diffusion Model) with OpenAI-style Gaussian diffusion.
+    This scheduler implements spaced diffusion used by ADM checkpoints. Call `set_timesteps` before inference, then
+    alternate UNet forward passes with `step`.
+    """
+    config_name = "scheduler_config.json"
+    order = 1
+    @register_to_config
+    def __init__(
+        self,
+        steps: int = 1000,
+        learn_sigma: bool = False,
+        sigma_small: bool = False,
+        noise_schedule: str = "linear",
+        predict_xstart: bool = False,
+        rescale_timesteps: bool = False,
+        timestep_respacing: str = "",
+    ):
+        """
+        Args:
+            steps (`int`, defaults to 1000):
+                Number of steps of the original (un-spaced) diffusion schedule.
+            learn_sigma (`bool`, defaults to `False`):
+                Whether the model output carries variance values (`LEARNED_RANGE`).
+            sigma_small (`bool`, defaults to `False`):
+                Use `FIXED_SMALL` instead of `FIXED_LARGE` variance when `learn_sigma` is `False`.
+            noise_schedule (`str`, defaults to `"linear"`):
+                Name of the beta schedule (`"linear"` or `"cosine"`).
+            predict_xstart (`bool`, defaults to `False`):
+                Whether the model predicts `x_0` instead of the noise.
+            rescale_timesteps (`bool`, defaults to `False`):
+                Whether model timesteps are rescaled by `1000 / steps`.
+            timestep_respacing (`str`, defaults to `""`):
+                Respacing spec; read by `create_runtime` only when no step count is given.
+        The arguments are not used in this body; they are read back through `self.config` by `set_timesteps` and
+        `create_runtime` (presumably recorded there by `@register_to_config`).
+        """
+        self.timesteps = None
+        self.num_inference_steps = None
+        # Built lazily by `set_timesteps`; `step` refuses to run while it is `None`.
+        self._diffusion: Optional[_SpacedDiffusion] = None
+        self._use_ddim = False
+        # Default DDIM eta used by `step` when its `eta` argument is `None`.
+        self._eta = 0.0
+    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
+        """
+        Ensures interchangeability with schedulers that scale the denoising model input depending on the timestep.
+        Args:
+            sample (`torch.Tensor`):
+                The input sample.
+            timestep (`int`, *optional*):
+                The current timestep in the diffusion chain.
+        Returns:
+            `torch.Tensor`:
+                The (unchanged) input sample.
+        """
+        del timestep
+        return sample
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        device: Optional[Union[str, torch.device]] = None,
+        use_ddim: bool = False,
+        timestep_respacing: Optional[str] = None,
+    ) -> torch.Tensor:
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+            use_ddim (`bool`, *optional*, defaults to `False`):
+                Whether to use DDIM sampling instead of DDPM.
+            timestep_respacing (`str`, *optional*):
+                Override for the respacing string. If `None`, respacing is derived from `num_inference_steps`.
+        Returns:
+            `torch.Tensor`:
+                Timestep indices used during denoising, in descending order.
+        """
+        # `self.config.timestep_respacing` is not consulted here; only the argument or the step count is used.
+        if timestep_respacing is None:
+            timestep_respacing = f"ddim{num_inference_steps}" if use_ddim else str(num_inference_steps)
+        self._diffusion = _create_spaced_diffusion(
+            steps=self.config.steps,
+            learn_sigma=self.config.learn_sigma,
+            sigma_small=self.config.sigma_small,
+            noise_schedule=self.config.noise_schedule,
+            predict_xstart=self.config.predict_xstart,
+            rescale_timesteps=self.config.rescale_timesteps,
+            timestep_respacing=timestep_respacing,
+        )
+        self._use_ddim = use_ddim
+        self.num_inference_steps = num_inference_steps
+        # Indices into the respaced chain, from the last step down to 0.
+        indices = list(range(self._diffusion.num_timesteps))[::-1]
+        timesteps = torch.tensor(indices, dtype=torch.long)
+        if device is not None:
+            timesteps = timesteps.to(device)
+        self.timesteps = timesteps
+        return self.timesteps
+    def scale_timesteps_for_model(self, timestep: torch.Tensor) -> torch.Tensor:
+        """
+        Map respaced scheduler indices to the timestep embeddings expected by the ADM UNet.
+        Args:
+            timestep (`torch.Tensor`):
+                Current scheduler timestep indices of shape `(batch_size,)`.
+        Returns:
+            `torch.Tensor`:
+                Timesteps to pass to the UNet forward pass.
+        """
+        if self._diffusion is None:
+            raise ValueError("Call `set_timesteps` before running the scheduler.")
+        map_tensor = torch.tensor(self._diffusion.timestep_map, device=timestep.device, dtype=timestep.dtype)
+        model_timesteps = map_tensor[timestep]
+        if self._diffusion.rescale_timesteps:
+            model_timesteps = model_timesteps.float() * (1000.0 / self._diffusion.original_num_steps)
+        return model_timesteps
+    def step(
+        self,
+        model_output: torch.Tensor,
+        timestep: Union[int, torch.Tensor],
+        sample: torch.Tensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+        clip_denoised: bool = True,
+        eta: Optional[float] = None,
+        cond_grad: Optional[torch.Tensor] = None,
+    ) -> Union[ADMSchedulerOutput, Tuple[torch.Tensor, ...]]:
+        """
+        Predict the sample at the previous timestep from the model output.
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the ADM UNet.
+            timestep (`int` or `torch.Tensor`):
+                The current discrete timestep index in the respaced diffusion chain.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            generator (`torch.Generator`, *optional*):
+                A random number generator for the sampling noise.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return an [`ADMSchedulerOutput`] instead of a plain tuple.
+            clip_denoised (`bool`, *optional*, defaults to `True`):
+                Whether to clamp the predicted `x_0` to `[-1, 1]`.
+            eta (`float`, *optional*):
+                DDIM stochasticity parameter. Only used when `use_ddim=True` was passed to `set_timesteps`.
+            cond_grad (`torch.Tensor`, *optional*):
+                Classifier guidance gradient for ADM-G (`classifier_scale * grad log p(y|x_t)`).
+        Returns:
+            [`ADMSchedulerOutput`] or `tuple`:
+                If `return_dict` is `True`, an [`ADMSchedulerOutput`] is returned, otherwise a tuple is returned where
+                the first element is the previous sample.
+        """
+        if self._diffusion is None:
+            raise ValueError("Call `set_timesteps` before `step`.")
+        # Normalise the timestep to a 1-D long tensor on the sample's device.
+        if not torch.is_tensor(timestep):
+            timestep = torch.tensor([timestep], device=sample.device, dtype=torch.long)
+        elif timestep.ndim == 0:
+            timestep = timestep.reshape(1).to(device=sample.device, dtype=torch.long)
+        else:
+            timestep = timestep.to(device=sample.device, dtype=torch.long)
+        ddim_eta = self._eta if eta is None else eta
+        if self._use_ddim:
+            out = self._diffusion.ddim_sample_from_output(
+                model_output,
+                sample,
+                timestep,
+                clip_denoised=clip_denoised,
+                eta=ddim_eta,
+                generator=generator,
+                cond_grad=cond_grad,
+            )
+        else:
+            out = self._diffusion.p_sample_from_output(
+                model_output,
+                sample,
+                timestep,
+                clip_denoised=clip_denoised,
+                generator=generator,
+                cond_grad=cond_grad,
+            )
+        prev_sample = out["sample"]
+        pred_original_sample = out.get("pred_xstart")
+        if not return_dict:
+            return (prev_sample, pred_original_sample)
+        return ADMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+    def create_runtime(self, num_inference_steps: Optional[int] = None, use_ddim: bool = False) -> _SpacedDiffusion:
+        """
+        Build a spaced diffusion object for legacy loop-based sampling (`p_sample_loop`).
+        Prefer `set_timesteps` + `step` for Diffusers-style inference.
+        """
+        # Fall back to the configured respacing only when no explicit step count is given.
+        timestep_respacing = self.config.timestep_respacing
+        if num_inference_steps is not None:
+            timestep_respacing = f"ddim{num_inference_steps}" if use_ddim else str(num_inference_steps)
+        return _create_spaced_diffusion(
+            steps=self.config.steps,
+            learn_sigma=self.config.learn_sigma,
+            sigma_small=self.config.sigma_small,
+            noise_schedule=self.config.noise_schedule,
+            predict_xstart=self.config.predict_xstart,
+            rescale_timesteps=self.config.rescale_timesteps,
+            timestep_respacing=timestep_respacing,
+        )

ADM-G-512/unet/__pycache__/modeling_adm.cpython-312.pyc ADDED Viewed

Binary file (34.5 kB). View file

ADM-G-512/unet/__pycache__/unet_adm.cpython-312.pyc ADDED Viewed

Binary file (5.35 kB). View file

ADM-G-512/unet/config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "_class_name": "ADMUNet2DModel",
+  "_diffusers_version": "0.36.0",
+  "attention_resolutions": "32,16,8",
+  "channel_mult": "",
+  "class_cond": true,
+  "dropout": 0.0,
+  "image_size": 512,
+  "in_channels": 3,
+  "learn_sigma": true,
+  "num_channels": 256,
+  "num_head_channels": 64,
+  "num_heads": 4,
+  "num_heads_upsample": -1,
+  "num_res_blocks": 2,
+  "out_channels": null,
+  "resblock_updown": true,
+  "use_checkpoint": false,
+  "use_fp16": false,
+  "use_new_attention_order": false,
+  "use_scale_shift_norm": true
+}

ADM-G-512/unet/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0eca4ffe4398f1bb23765eff3c5abbbb7a8f5059ac43e5378fa912afa34c42e9
+size 2236064184

ADM-G-512/unet/modeling_adm.py ADDED Viewed

	@@ -0,0 +1,772 @@

+import math
+from abc import abstractmethod
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint as torch_checkpoint
+NUM_CLASSES = 1000
+def conv_nd(dims: int, *args, **kwargs):
+    if dims == 1:
+        return nn.Conv1d(*args, **kwargs)
+    if dims == 2:
+        return nn.Conv2d(*args, **kwargs)
+    if dims == 3:
+        return nn.Conv3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+def linear(*args, **kwargs):
+    return nn.Linear(*args, **kwargs)
+def avg_pool_nd(dims: int, *args, **kwargs):
+    if dims == 1:
+        return nn.AvgPool1d(*args, **kwargs)
+    if dims == 2:
+        return nn.AvgPool2d(*args, **kwargs)
+    if dims == 3:
+        return nn.AvgPool3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+class GroupNorm32(nn.GroupNorm):
+    def forward(self, x):
+        return super().forward(x.float()).type(x.dtype)
+def normalization(channels: int):
+    return GroupNorm32(32, channels)
+def zero_module(module: nn.Module):
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+def timestep_embedding(timesteps: torch.Tensor, dim: int, max_period: int = 10000):
+    half = dim // 2
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
+        device=timesteps.device
+    )
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+def convert_module_to_f16(module: nn.Module):
+    if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        module.weight.data = module.weight.data.half()
+        if module.bias is not None:
+            module.bias.data = module.bias.data.half()
+def convert_module_to_f32(module: nn.Module):
+    if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        module.weight.data = module.weight.data.float()
+        if module.bias is not None:
+            module.bias.data = module.bias.data.float()
+class TimestepBlock(nn.Module):
+    """Base class for modules whose `forward` takes a timestep embedding as its second argument."""
+    @abstractmethod
+    def forward(self, x, emb):
+        # Subclasses override this to apply the module to `x` given the timestep embedding `emb`.
+        raise NotImplementedError
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    def forward(self, x, emb):
+        for layer in self:
+            if isinstance(layer, TimestepBlock):
+                x = layer(x, emb)
+            else:
+                x = layer(x)
+        return x
+class Upsample(nn.Module):
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        if use_conv:
+            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        if self.dims == 3:
+            x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest")
+        else:
+            x = F.interpolate(x, scale_factor=2, mode="nearest")
+        if self.use_conv:
+            x = self.conv(x)
+        return x
+class Downsample(nn.Module):
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        stride = 2 if dims != 3 else (1, 2, 2)
+        if use_conv:
+            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=1)
+        else:
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        return self.op(x)
+class ResBlock(TimestepBlock):
+    """
+    Residual block conditioned on a timestep embedding, optionally resampling its input by a factor of 2.
+    Args:
+        channels: Number of input channels.
+        emb_channels: Width of the embedding passed to `forward`.
+        dropout: Dropout probability applied before the final convolution.
+        out_channels: Number of output channels; falls back to `channels` when falsy.
+        use_conv: When the channel count changes, use a 3x3 convolution on the skip path instead of a 1x1 one.
+        use_scale_shift_norm: Apply the embedding as a scale and shift after normalization instead of adding it.
+        dims: Dimensionality of the signal (forwarded to `conv_nd`).
+        use_checkpoint: Use activation checkpointing when the input requires grad.
+        up: Upsample both the features and the skip path inside the block.
+        down: Downsample both the features and the skip path inside the block (only consulted when `up` is false).
+    """
+    def __init__(
+        self,
+        channels,
+        emb_channels,
+        dropout,
+        out_channels=None,
+        use_conv=False,
+        use_scale_shift_norm=False,
+        dims=2,
+        use_checkpoint=False,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_checkpoint = use_checkpoint
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            conv_nd(dims, channels, self.out_channels, 3, padding=1),
+        )
+        self.updown = up or down
+        # The resampling layers are convolution-free (use_conv=False); separate instances handle h and x.
+        if up:
+            self.h_upd = Upsample(channels, False, dims)
+            self.x_upd = Upsample(channels, False, dims)
+        elif down:
+            self.h_upd = Downsample(channels, False, dims)
+            self.x_upd = Downsample(channels, False, dims)
+        else:
+            self.h_upd = self.x_upd = nn.Identity()
+        # Scale-shift conditioning needs two values per output channel, additive conditioning only one.
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            linear(emb_channels, 2 * self.out_channels if use_scale_shift_norm else self.out_channels),
+        )
+        # The last convolution starts at zero, so `out_layers` initially contributes nothing to the sum.
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)),
+        )
+        # The skip path only needs a projection when the channel count changes.
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        elif use_conv:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
+        else:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+    def forward(self, x, emb):
+        # Checkpointing is skipped when the input does not require grad.
+        if self.use_checkpoint and x.requires_grad:
+            return torch_checkpoint(self._forward, x, emb, use_reentrant=False)
+        return self._forward(x, emb)
+    def _forward(self, x, emb):
+        if self.updown:
+            # Resample between the norm/activation and the convolution, and resample the skip input to match.
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+        else:
+            h = self.in_layers(x)
+        emb_out = self.emb_layers(emb).type(h.dtype)
+        # Append singleton axes until the embedding has as many dimensions as the feature map.
+        while len(emb_out.shape) < len(h.shape):
+            emb_out = emb_out[..., None]
+        if self.use_scale_shift_norm:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
+            h = out_norm(h) * (1 + scale) + shift
+            h = out_rest(h)
+        else:
+            h = h + emb_out
+            h = self.out_layers(h)
+        return self.skip_connection(x) + h
+class QKVAttentionLegacy(nn.Module):
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = torch.einsum("bct,bcs->bts", q * scale, k * scale)
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = torch.einsum("bts,bcs->bct", weight, v)
+        return a.reshape(bs, -1, length)
+class QKVAttention(nn.Module):
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        q, k, v = qkv.chunk(3, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = torch.einsum(
+            "bct,bcs->bts",
+            (q * scale).view(bs * self.n_heads, ch, length),
+            (k * scale).view(bs * self.n_heads, ch, length),
+        )
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = torch.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
+        return a.reshape(bs, -1, length)
+class AttentionBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        num_heads=1,
+        num_head_channels=-1,
+        use_checkpoint=False,
+        use_new_attention_order=False,
+    ):
+        super().__init__()
+        if num_head_channels == -1:
+            self.num_heads = num_heads
+        else:
+            assert channels % num_head_channels == 0
+            self.num_heads = channels // num_head_channels
+        self.use_checkpoint = use_checkpoint
+        self.norm = normalization(channels)
+        self.qkv = conv_nd(1, channels, channels * 3, 1)
+        self.attention = QKVAttention(self.num_heads) if use_new_attention_order else QKVAttentionLegacy(self.num_heads)
+        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
+    def forward(self, x):
+        if self.use_checkpoint and x.requires_grad:
+            return torch_checkpoint(self._forward, x, use_reentrant=False)
+        return self._forward(x)
+    def _forward(self, x):
+        b, c, *spatial = x.shape
+        x = x.reshape(b, c, -1)
+        qkv = self.qkv(self.norm(x))
+        h = self.attention(qkv)
+        h = self.proj_out(h)
+        return (x + h).reshape(b, c, *spatial)
+class AttentionPool2d(nn.Module):
+    """CLIP-style attention pooling used by ADM noisy classifiers."""
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads_channels: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5)
+        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+        self.num_heads = embed_dim // num_heads_channels
+        self.attention = QKVAttention(self.num_heads)
+    def forward(self, x):
+        b, c, *_spatial = x.shape
+        x = x.reshape(b, c, -1)
+        x = torch.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)
+        x = x + self.positional_embedding[None, :, :].to(x.dtype)
+        x = self.qkv_proj(x)
+        x = self.attention(x)
+        x = self.c_proj(x)
+        return x[:, :, 0]
+class EncoderUNetModel(nn.Module):
+    """
+    Noisy image classifier backbone for ADM-G (classifier guidance): the downsampling half of the UNet followed
+    by a pooling head.
+    Args:
+        image_size: Input resolution; only used to size the positional embedding of the "attention" pool.
+        in_channels: Channels of the input tensor.
+        model_channels: Base channel count; also the width of the sinusoidal timestep embedding.
+        out_channels: Width of the output produced by the head.
+        num_res_blocks: Residual blocks per resolution level.
+        attention_resolutions: Collection of downsample factors at which attention blocks are inserted.
+        dropout: Dropout probability of the residual blocks.
+        channel_mult: Per-level multiplier applied to `model_channels`.
+        conv_resample: Whether plain `Downsample` layers use a convolution (only used when `resblock_updown` is
+            false).
+        dims: Dimensionality of the signal.
+        use_checkpoint: Enable activation checkpointing in residual and attention blocks.
+        use_fp16: Selects the dtype the input is cast to in `forward`; weights are only converted by
+            `convert_to_fp16`.
+        num_heads: Attention heads, used when `num_head_channels` is -1.
+        num_head_channels: Channels per attention head, or -1 to use `num_heads`.
+        use_scale_shift_norm: Forwarded to every `ResBlock`.
+        resblock_updown: Downsample with a `ResBlock(down=True)` instead of a `Downsample` layer.
+        use_new_attention_order: Forwarded to every `AttentionBlock`.
+        pool: Head type, one of "adaptive", "attention", "spatial" or "spatial_v2".
+    Raises:
+        NotImplementedError: If `pool` is not one of the supported head types.
+    """
+    def __init__(
+        self,
+        image_size,
+        in_channels,
+        model_channels,
+        out_channels,
+        num_res_blocks,
+        attention_resolutions,
+        dropout=0,
+        channel_mult=(1, 2, 4, 8),
+        conv_resample=True,
+        dims=2,
+        use_checkpoint=False,
+        use_fp16=False,
+        num_heads=1,
+        num_head_channels=-1,
+        use_scale_shift_norm=False,
+        resblock_updown=False,
+        use_new_attention_order=False,
+        pool="adaptive",
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        self.num_res_blocks = num_res_blocks
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.use_checkpoint = use_checkpoint
+        # Fixed at construction time; it is not updated when the module is later cast with .half()/.float().
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.num_heads = num_heads
+        self.num_head_channels = num_head_channels
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+        ch = int(channel_mult[0] * model_channels)
+        self.input_blocks = nn.ModuleList([TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))])
+        # Running sum of the channel counts of every block output; sizes the first layer of the "spatial" heads.
+        self._feature_size = ch
+        input_block_chans = [ch]
+        # Cumulative downsampling factor of the current level.
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=int(mult * model_channels),
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(mult * model_channels)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+                input_block_chans.append(ch)
+            # Every level except the last ends with a 2x downsampling block.
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+                self._feature_size += ch
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            AttentionBlock(
+                ch,
+                use_checkpoint=use_checkpoint,
+                num_heads=num_heads,
+                num_head_channels=num_head_channels,
+                use_new_attention_order=use_new_attention_order,
+            ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self._feature_size += ch
+        self.pool = pool
+        if pool == "adaptive":
+            # NOTE(review): the pool is always 2-D while the convolution follows `dims` — confirm that only
+            # dims == 2 is intended for this head.
+            self.out = nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                nn.AdaptiveAvgPool2d((1, 1)),
+                zero_module(conv_nd(dims, ch, out_channels, 1)),
+                nn.Flatten(),
+            )
+        elif pool == "attention":
+            assert num_head_channels != -1
+            # `image_size // ds` is the side length of the feature map after every downsampling step.
+            self.out = nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels),
+            )
+        elif pool == "spatial":
+            self.out = nn.Sequential(
+                nn.Linear(self._feature_size, 2048),
+                nn.ReLU(),
+                nn.Linear(2048, out_channels),
+            )
+        elif pool == "spatial_v2":
+            self.out = nn.Sequential(
+                nn.Linear(self._feature_size, 2048),
+                normalization(2048),
+                nn.SiLU(),
+                nn.Linear(2048, out_channels),
+            )
+        else:
+            raise NotImplementedError(f"Unexpected {pool} pooling")
+    def convert_to_fp16(self):
+        # Only convolutions inside the input and middle blocks are converted; `time_embed` and `out` are not.
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self):
+        # Inverse of `convert_to_fp16` for the same set of modules.
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x, timesteps):
+        # Returns the output of the pooling head for inputs `x` and their `timesteps`.
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+        results = []
+        h = x.type(self.dtype)
+        for module in self.input_blocks:
+            h = module(h, emb)
+            # The "spatial" heads consume the spatial mean (over axes 2 and 3) of every block output.
+            if self.pool.startswith("spatial"):
+                results.append(h.type(x.dtype).mean(dim=(2, 3)))
+        h = self.middle_block(h, emb)
+        if self.pool.startswith("spatial"):
+            results.append(h.type(x.dtype).mean(dim=(2, 3)))
+            h = torch.cat(results, dim=-1)
+            return self.out(h)
+        # The other heads only see the final feature map, cast back to the input dtype.
+        h = h.type(x.dtype)
+        return self.out(h)
+class UNetModel(nn.Module):
+    """
+    ADM UNet with timestep conditioning, optional class conditioning, attention and skip connections.
+    Args:
+        image_size: Accepted for interface compatibility; not referenced by this class.
+        in_channels: Channels of the input tensor.
+        model_channels: Base channel count; also the width of the sinusoidal timestep embedding.
+        out_channels: Channels of the output tensor.
+        num_res_blocks: Residual blocks per resolution level (the decoder uses one more per level).
+        attention_resolutions: Collection of downsample factors at which attention blocks are inserted.
+        dropout: Dropout probability of the residual blocks.
+        channel_mult: Per-level multiplier applied to `model_channels`.
+        conv_resample: Whether plain `Downsample`/`Upsample` layers use a convolution (only used when
+            `resblock_updown` is false).
+        dims: Dimensionality of the signal.
+        num_classes: Number of classes for the label embedding, or `None` for an unconditional model.
+        use_checkpoint: Enable activation checkpointing in residual and attention blocks.
+        use_fp16: Selects the dtype the input is cast to in `forward`; weights are only converted by
+            `convert_to_fp16`.
+        num_heads: Attention heads of the encoder and middle block, used when `num_head_channels` is -1.
+        num_head_channels: Channels per attention head, or -1 to use the head counts.
+        num_heads_upsample: Attention heads of the decoder; -1 means "same as `num_heads`".
+        use_scale_shift_norm: Forwarded to every `ResBlock`.
+        resblock_updown: Resample with `ResBlock(up/down=True)` instead of `Upsample`/`Downsample` layers.
+        use_new_attention_order: Forwarded to every `AttentionBlock`.
+    """
+    def __init__(
+        self,
+        image_size,
+        in_channels,
+        model_channels,
+        out_channels,
+        num_res_blocks,
+        attention_resolutions,
+        dropout=0,
+        channel_mult=(1, 2, 4, 8),
+        conv_resample=True,
+        dims=2,
+        num_classes=None,
+        use_checkpoint=False,
+        use_fp16=False,
+        num_heads=1,
+        num_head_channels=-1,
+        num_heads_upsample=-1,
+        use_scale_shift_norm=False,
+        resblock_updown=False,
+        use_new_attention_order=False,
+    ):
+        super().__init__()
+        if num_heads_upsample == -1:
+            num_heads_upsample = num_heads
+        self.model_channels = model_channels
+        self.num_classes = num_classes
+        # Fixed at construction time; it is not updated when the module is later cast with .half()/.float().
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+        if self.num_classes is not None:
+            self.label_emb = nn.Embedding(num_classes, time_embed_dim)
+        ch = input_ch = int(channel_mult[0] * model_channels)
+        self.input_blocks = nn.ModuleList([TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))])
+        # Channel count of every encoder block output; popped in reverse to size the decoder's skip inputs.
+        input_block_chans = [ch]
+        # Cumulative downsampling factor of the current level.
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=int(mult * model_channels),
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(mult * model_channels)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                input_block_chans.append(ch)
+            # Every level except the last ends with a 2x downsampling block.
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            AttentionBlock(
+                ch,
+                use_checkpoint=use_checkpoint,
+                num_heads=num_heads,
+                num_head_channels=num_head_channels,
+                use_new_attention_order=use_new_attention_order,
+            ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self.output_blocks = nn.ModuleList([])
+        # The decoder walks the levels in reverse; each block consumes one skip connection from the encoder.
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(num_res_blocks + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    ResBlock(
+                        ch + ich,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=int(model_channels * mult),
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(model_channels * mult)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads_upsample,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                # The last block of every level except level 0 also upsamples by 2x.
+                if level and i == num_res_blocks:
+                    out_ch = ch
+                    layers.append(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True,
+                        )
+                        if resblock_updown
+                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                    ds //= 2
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+        # After the decoder loop `ch` is int(model_channels * channel_mult[0]), the same value as `input_ch`.
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
+        )
+    def convert_to_fp16(self):
+        # Only convolutions inside the three block groups are converted; `time_embed`, `label_emb` and `out`
+        # are not touched.
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+        self.output_blocks.apply(convert_module_to_f16)
+    def convert_to_fp32(self):
+        # Inverse of `convert_to_fp16` for the same set of modules.
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+        self.output_blocks.apply(convert_module_to_f32)
+    def forward(self, x, timesteps, y: Optional[torch.Tensor] = None):
+        # `y` must be given exactly when the model was built with `num_classes`.
+        assert (y is not None) == (self.num_classes is not None)
+        hs = []
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+        if self.num_classes is not None:
+            # One label per batch element; its embedding is added to the timestep embedding.
+            assert y.shape == (x.shape[0],)
+            emb = emb + self.label_emb(y)
+        h = x.type(self.dtype)
+        for module in self.input_blocks:
+            h = module(h, emb)
+            hs.append(h)
+        h = self.middle_block(h, emb)
+        for module in self.output_blocks:
+            # Concatenate the most recent unused encoder activation along the channel axis.
+            h = torch.cat([h, hs.pop()], dim=1)
+            h = module(h, emb)
+        h = h.type(x.dtype)
+        return self.out(h)
+def _default_channel_mult(image_size: int):
+    if image_size == 512:
+        return (0.5, 1, 1, 2, 2, 4, 4)
+    if image_size == 256:
+        return (1, 1, 2, 2, 4, 4)
+    if image_size == 128:
+        return (1, 1, 2, 3, 4)
+    if image_size == 64:
+        return (1, 2, 3, 4)
+    raise ValueError(f"unsupported image size: {image_size}")
+def create_adm_unet_model(
+    image_size,
+    num_channels,
+    num_res_blocks,
+    channel_mult="",
+    learn_sigma=False,
+    class_cond=False,
+    use_checkpoint=False,
+    attention_resolutions="16",
+    num_heads=1,
+    num_head_channels=-1,
+    num_heads_upsample=-1,
+    use_scale_shift_norm=False,
+    dropout=0.0,
+    resblock_updown=False,
+    use_fp16=False,
+    use_new_attention_order=False,
+):
+    channel_mult = _default_channel_mult(image_size) if channel_mult == "" else tuple(int(v) for v in channel_mult.split(","))
+    attention_ds = tuple(image_size // int(res) for res in attention_resolutions.split(","))
+    return UNetModel(
+        image_size=image_size,
+        in_channels=3,
+        model_channels=num_channels,
+        out_channels=(3 if not learn_sigma else 6),
+        num_res_blocks=num_res_blocks,
+        attention_resolutions=attention_ds,
+        dropout=dropout,
+        channel_mult=channel_mult,
+        num_classes=(NUM_CLASSES if class_cond else None),
+        use_checkpoint=use_checkpoint,
+        use_fp16=use_fp16,
+        num_heads=num_heads,
+        num_head_channels=num_head_channels,
+        num_heads_upsample=num_heads_upsample,
+        use_scale_shift_norm=use_scale_shift_norm,
+        resblock_updown=resblock_updown,
+        use_new_attention_order=use_new_attention_order,
+    )
+def create_adm_classifier_model(
+    image_size: int,
+    classifier_width: int = 128,
+    classifier_depth: int = 2,
+    classifier_attention_resolutions: str = "32,16,8",
+    classifier_use_scale_shift_norm: bool = True,
+    classifier_resblock_updown: bool = True,
+    classifier_pool: str = "attention",
+    use_fp16: bool = False,
+    num_classes: int = NUM_CLASSES,
+):
+    channel_mult = _default_channel_mult(image_size)
+    attention_ds = tuple(image_size // int(res) for res in classifier_attention_resolutions.split(","))
+    return EncoderUNetModel(
+        image_size=image_size,
+        in_channels=3,
+        model_channels=classifier_width,
+        out_channels=num_classes,
+        num_res_blocks=classifier_depth,
+        attention_resolutions=attention_ds,
+        channel_mult=channel_mult,
+        use_fp16=use_fp16,
+        num_head_channels=64,
+        use_scale_shift_norm=classifier_use_scale_shift_norm,
+        resblock_updown=classifier_resblock_updown,
+        pool=classifier_pool,
+    )

ADM-G-512/unet/unet_adm.py ADDED Viewed

	@@ -0,0 +1,124 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+import torch
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+from modeling_adm import create_adm_unet_model
+@dataclass
+class ADMUNetOutput(BaseOutput):
+    """
+    Output of the ADM UNet model.
+    Args:
+        sample (`torch.Tensor` of shape `(batch_size, out_channels, height, width)`):
+            The raw tensor returned by the wrapped UNet; `out_channels` is 6 when the model was built with
+            `learn_sigma=True` and 3 otherwise.
+    """
+    # Sole field of the output; also what `forward(..., return_dict=False)` returns as a one-element tuple.
+    sample: torch.FloatTensor
+class ADMUNet2DModel(ModelMixin, ConfigMixin):
+    """
+    ADM UNet model for class-conditional image diffusion in pixel space.
+    This wraps the OpenAI ADM `UNetModel` architecture with Diffusers `ModelMixin` / `ConfigMixin` for Hub
+    serialization.
+    """
+    @register_to_config
+    def __init__(
+        self,
+        image_size: int = 64,
+        num_channels: int = 128,
+        num_res_blocks: int = 2,
+        channel_mult: str = "",
+        learn_sigma: bool = False,
+        class_cond: bool = False,
+        use_checkpoint: bool = False,
+        attention_resolutions: str = "16,8",
+        num_heads: int = 4,
+        num_head_channels: int = -1,
+        num_heads_upsample: int = -1,
+        use_scale_shift_norm: bool = True,
+        dropout: float = 0.0,
+        resblock_updown: bool = False,
+        use_fp16: bool = False,
+        use_new_attention_order: bool = False,
+        in_channels: int = 3,
+        out_channels: Optional[int] = None,
+    ):
+        super().__init__()
+        if out_channels is None:
+            out_channels = 6 if learn_sigma else 3
+        self.model = create_adm_unet_model(
+            image_size=image_size,
+            num_channels=num_channels,
+            num_res_blocks=num_res_blocks,
+            channel_mult=channel_mult,
+            learn_sigma=learn_sigma,
+            class_cond=class_cond,
+            use_checkpoint=use_checkpoint,
+            attention_resolutions=attention_resolutions,
+            num_heads=num_heads,
+            num_head_channels=num_head_channels,
+            num_heads_upsample=num_heads_upsample,
+            use_scale_shift_norm=use_scale_shift_norm,
+            dropout=dropout,
+            resblock_updown=resblock_updown,
+            use_fp16=use_fp16,
+            use_new_attention_order=use_new_attention_order,
+        )
+    @property
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+    def forward(
+        self,
+        sample: torch.Tensor,
+        timestep: Union[torch.Tensor, float, int],
+        class_labels: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[ADMUNetOutput, Tuple[torch.Tensor, ...]]:
+        """
+        Forward pass of the ADM UNet.
+        Args:
+            sample (`torch.Tensor`):
+                Noisy input tensor of shape `(batch_size, in_channels, height, width)`.
+            timestep (`torch.Tensor` or `float` or `int`):
+                Timestep indices or embeddings broadcastable to batch size.
+            class_labels (`torch.Tensor`, *optional*):
+                Class indices of shape `(batch_size,)` for class-conditional models.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return an [`ADMUNetOutput`] instead of a tuple.
+        Returns:
+            [`ADMUNetOutput`] or `tuple`:
+                If `return_dict` is `True`, an [`ADMUNetOutput`] is returned, otherwise a tuple `(sample,)`.
+        """
+        if not torch.is_tensor(timestep):
+            timestep = torch.tensor([timestep], device=sample.device, dtype=torch.long)
+        elif timestep.ndim == 0:
+            timestep = timestep.reshape(1).to(device=sample.device)
+        if timestep.shape[0] == 1 and sample.shape[0] > 1:
+            timestep = timestep.expand(sample.shape[0])
+        output = self.model(sample, timestep, y=class_labels)
+        if not return_dict:
+            return (output,)
+        return ADMUNetOutput(sample=output)

README.md ADDED Viewed

	@@ -0,0 +1,87 @@

+---
+license: mit
+library_name: diffusers
+pipeline_tag: text-to-image
+tags:
+- diffusers
+- adm
+- adm-g
+- image-generation
+- class-conditional
+widget:
+- output:
+    url: demo.png
+language:
+- en
+---
+# ADM-G-512 (Diffusers)
+OpenAI ADM-G at 512×512, converted for the custom pipeline in `src/diffusers/ADM`.
+## Demo
+![ADM-G-512 demo](demo.png)
+## Layout
+```text
+ADM-diffusers/
+├── README.md
+├── pipeline.py
+├── model_index.json
+├── demo.png
+└── ADM-G-512/
+    ├── classifier/   # 512x512_classifier.pt
+    ├── scheduler/
+    └── unet/         # 512x512_diffusion.pt
+```
+ADM-G-512 uses **both** OpenAI checkpoints together, matching [classifier_sample.py](https://github.com/openai/guided-diffusion/blob/main/scripts/classifier_sample.py):
+- `unet/` — class-conditional diffusion model (`512x512_diffusion.pt`, `class_cond=True`)
+- `classifier/` — noisy ImageNet classifier (`512x512_classifier.pt`)
+- `scheduler/` — DDPM/DDIM scheduler
+## Load
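+Save the snippet below as a script in this directory and run it from here (it relies on `__file__`, which is not defined in an interactive session or notebook):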
+```python
+import sys
+from pathlib import Path
+import torch
+repo = Path(__file__).resolve().parent
+sys.path.insert(0, str(repo))
+from pipeline import ADMPipeline
+pipe = ADMPipeline.from_pretrained("ADM-G-512")
+pipe.to("cuda")
+pipe.unet.float()
+pipe.classifier.float()
+pipe.classifier.model.dtype = torch.float32
+generator = torch.Generator(device="cuda").manual_seed(42)
+images = pipe(
+    class_labels=207,
+    num_inference_steps=250,
+    use_ddim=False,
+    classifier_guidance_scale=4.0,
+    generator=generator,
+).images
+images[0].save("demo.png")
+```
+Both the UNet and classifier receive the target class. The UNet uses embedded class conditioning; the classifier adds gradient guidance on top.
+Set `classifier_guidance_scale=0.0` to disable classifier guidance and sample from the base class-conditional diffusion model only.
+## Demo settings
+| Setting | Value |
+| --- | --- |
+| Class | 207 (golden retriever) |
+| Steps | 250 (DDPM) |
+| Classifier scale | 4.0 |
+| Seed | 42 |

__pycache__/pipeline.cpython-312.pyc ADDED Viewed

Binary file (16.7 kB). View file

demo.png ADDED Viewed

Git LFS Details

SHA256: e6e3afc16ac17292f33ae8f13f28145d33a882ae781a7a42c137687c8f98dea8
Pointer size: 131 Bytes
Size of remote file: 326 kB

model_index.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "_class_name": "ADMPipeline",
+  "_diffusers_version": "0.36.0",
+  "scheduler": [
+    "scheduling_adm",
+    "ADMScheduler"
+  ],
+  "unet": [
+    "unet_adm",
+    "ADMUNet2DModel"
+  ],
+  "classifier": [
+    "classifier_adm",
+    "ADMClassifierModel"
+  ]
+}

pipeline.py ADDED Viewed

	@@ -0,0 +1,388 @@

+# Copyright 2026 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+"""Hub custom pipeline: ADMPipeline.
+Load with native Hugging Face diffusers and `trust_remote_code=True`.
+"""
+from __future__ import annotations
+import importlib
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import torch
+from tqdm.auto import tqdm
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.utils import BaseOutput, replace_example_docstring
+from diffusers.utils.torch_utils import randn_tensor
+# Usage example for `ADMPipeline.__call__`; handed to the `replace_example_docstring` decorator on that method.
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+        >>> from pipeline import ADMPipeline
+        >>> pipe = ADMPipeline.from_pretrained("./ADM-G-512", torch_dtype=torch.float16)
+        >>> pipe.to("cuda")
+        >>> # ADM-G (classifier guidance)
+        >>> images = pipe(class_labels=207, classifier_guidance_scale=1.0, num_inference_steps=250).images
+        ```
+"""
+@dataclass
+class ADMPipelineOutput(BaseOutput):
+    """
+    Output class for ADM pipelines.
+    Args:
+        images (`torch.Tensor` or `list[PIL.Image.Image]` or `np.ndarray`):
+            Generated images of shape `(batch_size, num_channels, height, width)` when `output_type="pt"`,
+            or a list of PIL images / NumPy array when post-processed.
+    """
+    # Set by `ADMPipeline.__call__`; the concrete container type follows the requested `output_type`.
+    images: Union[torch.Tensor, List, np.ndarray]
+class ADMPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for image generation with ADM (Ablated Diffusion Model).
+    Supports class-conditional ADM (labels embedded in the UNet) and **ADM-G** (noisy-classifier guidance
+    applied on top of an unconditional or a class-conditional UNet). For ADM-G, pass
+    `classifier_guidance_scale > 0` and provide `class_labels`; the optional `classifier` predicts
+    `p(y | x_t)` and steers sampling.
+    Args:
+        unet ([`ADMUNet2DModel`]):
+            A UNet model to denoise image samples. It receives `class_labels` only when
+            `unet.config.class_cond` is true.
+        scheduler ([`ADMScheduler`]):
+            A scheduler used with the UNet to denoise image samples.
+        classifier ([`ADMClassifierModel`], *optional*):
+            Noisy ImageNet classifier for ADM-G guidance.
+    """
+    # Offload order string consumed by the `DiffusionPipeline` base class.
+    # NOTE(review): `__call__` runs the UNet before the classifier in each step — confirm this order is intended.
+    model_cpu_offload_seq = "classifier->unet"
+    # The classifier may be absent (`None`); guidance is then unavailable.
+    _optional_components = ["classifier"]
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """Load a variant folder (e.g. `./ADM-G-512`) with `unet/`, `scheduler/`, `classifier/` subfolders.
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                Path to the variant folder. A relative path is resolved against the directory that contains
+                this file, not against the current working directory.
+            **kwargs:
+                Forwarded unchanged to each component's own `from_pretrained`.
+        Returns:
+            `ADMPipeline`: Pipeline built from the components that could be loaded; a component that could
+            not be loaded is passed to the constructor as `None`.
+        Raises:
+            ValueError: If neither the UNet nor the classifier could be loaded.
+        """
+        repo_root = Path(__file__).resolve().parent
+        variant = Path(pretrained_model_name_or_path)
+        # Anchor relative paths at this file's directory rather than the CWD.
+        if not variant.is_absolute():
+            variant = (repo_root / variant).resolve()
+        model_kwargs = dict(kwargs)
+        # `sys.path` entries added while loading; they are removed again in the `finally` clause.
+        inserted: List[str] = []
+        def _load_component(folder: str, module_name: str, class_name: str):
+            # Import `<variant>/<folder>/<module_name>.py` and load `class_name` from that folder.
+            # Returns None when the module file or a config file is missing.
+            comp_dir = variant / folder
+            module_path = comp_dir / f"{module_name}.py"
+            # NOTE: despite its name, this flag only checks for a config file, not for weight files.
+            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
+            if not module_path.exists() or not has_weights:
+                return None
+            comp_path = str(comp_dir)
+            if comp_path not in sys.path:
+                sys.path.insert(0, comp_path)
+                inserted.append(comp_path)
+            # NOTE(review): the module is imported by its bare name, so Python's `sys.modules` cache returns
+            # an already-imported module of the same name (e.g. one loaded earlier from another variant folder).
+            module = importlib.import_module(module_name)
+            component_cls = getattr(module, class_name)
+            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
+        try:
+            unet = _load_component("unet", "unet_adm", "ADMUNet2DModel")
+            scheduler = _load_component("scheduler", "scheduling_adm", "ADMScheduler")
+            classifier = _load_component("classifier", "classifier_adm", "ADMClassifierModel")
+            # No scheduler config on disk: fall back to `ADMScheduler()` with its constructor defaults,
+            # provided the scheduler module file itself exists.
+            if scheduler is None:
+                sched_dir = variant / "scheduler"
+                if (sched_dir / "scheduling_adm.py").exists():
+                    sched_path = str(sched_dir)
+                    if sched_path not in sys.path:
+                        sys.path.insert(0, sched_path)
+                        inserted.append(sched_path)
+                    scheduler = importlib.import_module("scheduling_adm").ADMScheduler()
+            # NOTE(review): a classifier-only result (`unet is None`) passes this check, yet `check_inputs`
+            # and `__call__` dereference `self.unet` — confirm whether that case is intended.
+            if unet is None and classifier is None:
+                raise ValueError(f"No loadable components found under {variant}")
+            return cls(unet=unet, scheduler=scheduler, classifier=classifier)
+        finally:
+            # Undo only the `sys.path` entries this call added.
+            for comp_path in inserted:
+                if comp_path in sys.path:
+                    sys.path.remove(comp_path)
+    def __init__(
+        self,
+        unet,
+        scheduler,
+        classifier=None,
+    ):
+        """Register the pipeline components and create the image post-processor.
+        Args:
+            unet: Denoising model, registered as `self.unet`.
+            scheduler: Sampling scheduler, registered as `self.scheduler`.
+            classifier: Optional guidance classifier, registered as `self.classifier` (may be `None`).
+        """
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler, classifier=classifier)
+        # Scale factor 1 and no normalisation — presumably because sampling happens directly in pixel
+        # space and `__call__` rescales to [0, 1] itself before post-processing; TODO confirm.
+        self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
+    @property
+    def do_classifier_guidance(self) -> bool:
+        return self.classifier is not None and getattr(self, "_classifier_guidance_scale", 0.0) > 0
+    def check_inputs(
+        self,
+        class_labels: Optional[Union[int, List[int], torch.Tensor]],
+        height: Optional[int],
+        width: Optional[int],
+    ):
+        if class_labels is None and self.unet.config.class_cond:
+            raise ValueError("`class_labels` are required for class-conditional ADM checkpoints.")
+        if class_labels is not None and self.classifier is None and not self.unet.config.class_cond:
+            raise ValueError(
+                "This checkpoint is unconditional and has no classifier. Load an ADM-G repo with a "
+                "`classifier/` subfolder, or use a class-conditional UNet."
+            )
+        if height is not None and height % 8 != 0:
+            raise ValueError(f"`height` must be divisible by 8 but is {height}.")
+        if width is not None and width % 8 != 0:
+            raise ValueError(f"`width` must be divisible by 8 but is {width}.")
+    def _prepare_class_labels(
+        self,
+        class_labels: Optional[Union[int, List[int], torch.Tensor]],
+        batch_size: int,
+        device: torch.device,
+    ) -> Optional[torch.Tensor]:
+        if class_labels is None:
+            return None
+        if isinstance(class_labels, int):
+            class_labels = [class_labels]
+        if not torch.is_tensor(class_labels):
+            class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
+        else:
+            class_labels = class_labels.to(device=device, dtype=torch.long)
+        if class_labels.shape[0] != batch_size:
+            raise ValueError(
+                f"`class_labels` batch ({class_labels.shape[0]}) must match requested batch size ({batch_size})."
+            )
+        return class_labels
+    def _get_classifier_grad(
+        self,
+        sample: torch.Tensor,
+        timestep: torch.Tensor,
+        class_labels: torch.Tensor,
+        classifier_scale: float,
+    ) -> torch.Tensor:
+        return self.classifier.guidance_gradient(
+            sample,
+            timestep,
+            class_labels,
+            classifier_scale=classifier_scale,
+        )
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels: int,
+        height: int,
+        width: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Prepare initial Gaussian noise for pixel-space sampling.
+        Args:
+            batch_size (`int`):
+                Number of images to generate.
+            num_channels (`int`):
+                Number of image channels (typically 3).
+            height (`int`):
+                Image height in pixels.
+            width (`int`):
+                Image width in pixels.
+            dtype (`torch.dtype`):
+                Data type for the latent tensor.
+            device (`torch.device`):
+                Target device.
+            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
+                RNG for deterministic sampling.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noise tensor.
+        Returns:
+            `torch.Tensor`:
+                Initial noise of shape `(batch_size, num_channels, height, width)`.
+        """
+        shape = (batch_size, num_channels, height, width)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+        return latents
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        class_labels: Optional[Union[int, List[int], torch.Tensor]] = None,
+        batch_size: int = 1,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 250,
+        use_ddim: bool = False,
+        eta: float = 0.0,
+        clip_denoised: bool = True,
+        classifier_guidance_scale: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        output_type: str = "pil",
+        return_dict: bool = True,
+    ) -> Union[ADMPipelineOutput, Tuple]:
+        r"""
+        Generate images with ADM.
+        Args:
+            class_labels (`int` or `list[int]` or `torch.Tensor`, *optional*):
+                ImageNet class indices. Required for class-conditional UNets and for ADM-G classifier guidance.
+            batch_size (`int`, *optional*, defaults to 1):
+                Number of images to generate when `class_labels` is not provided.
+            height (`int`, *optional*):
+                Height in pixels. Defaults to `unet.config.image_size`.
+            width (`int`, *optional*):
+                Width in pixels. Defaults to `unet.config.image_size`.
+            num_inference_steps (`int`, *optional*, defaults to 250):
+                Number of denoising steps.
+            use_ddim (`bool`, *optional*, defaults to `False`):
+                Use DDIM sampling instead of DDPM.
+            eta (`float`, *optional*, defaults to 0.0):
+                DDIM stochasticity parameter. Only used when `use_ddim=True`.
+            clip_denoised (`bool`, *optional*, defaults to `True`):
+                Clamp predicted `x_0` to `[-1, 1]` inside the scheduler.
+            classifier_guidance_scale (`float`, *optional*, defaults to 0.0):
+                ADM-G guidance strength. Values `> 0` require a loaded `classifier` (OpenAI `classifier_scale`).
+            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
+                RNG for reproducible generation.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated initial noise.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                Output format: `"pil"`, `"np"`, or `"pt"`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Return an [`ADMPipelineOutput`] instead of a tuple.
+        Examples:
+        Returns:
+            [`ADMPipelineOutput`] or `tuple`:
+                Generated images.
+        """
+        if height is None:
+            height = int(self.unet.config.image_size)
+        if width is None:
+            width = int(self.unet.config.image_size)
+        self.check_inputs(class_labels, height, width)
+        if classifier_guidance_scale > 0 and self.classifier is None:
+            raise ValueError("`classifier_guidance_scale > 0` requires a loaded `classifier` (ADM-G checkpoint).")
+        if classifier_guidance_scale > 0 and class_labels is None:
+            raise ValueError("`class_labels` are required when using classifier guidance.")
+        self._classifier_guidance_scale = classifier_guidance_scale
+        device = self._execution_device
+        model_dtype = self.unet.dtype
+        if class_labels is not None:
+            if isinstance(class_labels, int):
+                batch_size = 1
+            elif isinstance(class_labels, list):
+                batch_size = len(class_labels)
+            elif torch.is_tensor(class_labels):
+                batch_size = class_labels.shape[0]
+        class_labels = self._prepare_class_labels(class_labels, batch_size, device)
+        latents = self.prepare_latents(
+            batch_size,
+            3,
+            height,
+            width,
+            model_dtype,
+            device,
+            generator,
+            latents,
+        )
+        self.scheduler.set_timesteps(num_inference_steps, device=device, use_ddim=use_ddim)
+        self.scheduler._eta = eta
+        self._num_timesteps = len(self.scheduler.timesteps)
+        unet_class_labels = class_labels if self.unet.config.class_cond else None
+        for t in tqdm(self.scheduler.timesteps, desc="Denoising"):
+            timestep = torch.full((batch_size,), t, device=device, dtype=torch.long)
+            model_timesteps = self.scheduler.scale_timesteps_for_model(timestep)
+            model_output = self.unet(
+                latents,
+                model_timesteps,
+                class_labels=unet_class_labels,
+                return_dict=True,
+            ).sample
+            cond_grad = None
+            if self.do_classifier_guidance:
+                cond_grad = self._get_classifier_grad(
+                    latents,
+                    timestep,
+                    class_labels,
+                    classifier_guidance_scale,
+                )
+            latents = self.scheduler.step(
+                model_output,
+                t,
+                latents,
+                generator=generator,
+                clip_denoised=clip_denoised,
+                eta=eta,
+                cond_grad=cond_grad,
+            ).prev_sample
+        image = latents
+        has_nsfw_concept = None
+        if output_type == "latent":
+            image = latents
+        elif output_type == "pt":
+            image = (image / 2 + 0.5).clamp(0, 1)
+        elif output_type in ("pil", "np"):
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = self.image_processor.postprocess(image, output_type=output_type)
+        self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image, has_nsfw_concept)
+        return ADMPipelineOutput(images=image)