BiliSakura committed 1 day ago

Commit

7fc7e34

verified ·

1 Parent(s): 9b59af7

Upload folder using huggingface_hub

Browse files

Files changed (22) hide show

ADM-G-256/README.md +21 -17
ADM-G-256/__pycache__/pipeline.cpython-312.pyc +0 -0
ADM-G-256/classifier/__pycache__/classifier_adm.cpython-312.pyc +0 -0
ADM-G-256/classifier/__pycache__/modeling_adm.cpython-312.pyc +0 -0
ADM-G-256/classifier/classifier_adm.py +507 -1
ADM-G-256/model_index.json +2 -2
ADM-G-256/pipeline.py +167 -424
ADM-G-256/scheduler/scheduler_config.json +9 -8
ADM-G-256/unet/__pycache__/unet_adm.cpython-312.pyc +0 -0
ADM-G-256/unet/modeling_adm.py +14 -9
ADM-G-256/unet/unet_adm.py +6 -1
ADM-G-512/README.md +24 -18
ADM-G-512/__pycache__/pipeline.cpython-312.pyc +0 -0
ADM-G-512/classifier/__pycache__/classifier_adm.cpython-312.pyc +0 -0
ADM-G-512/classifier/classifier_adm.py +507 -1
ADM-G-512/demo.png +2 -2
ADM-G-512/model_index.json +2 -2
ADM-G-512/pipeline.py +167 -424
ADM-G-512/scheduler/scheduler_config.json +9 -8
ADM-G-512/unet/modeling_adm.py +14 -9
ADM-G-512/unet/unet_adm.py +6 -1
README.md +23 -42

ADM-G-256/README.md CHANGED Viewed

@@ -20,23 +20,27 @@ ADM-G-256/
 ## Load
 ```python
-import sys
 from pathlib import Path
-from huggingface_hub import snapshot_download
-repo_dir = Path(snapshot_download("BiliSakura/ADM-diffusers"))
-sys.path.insert(0, str(repo_dir / "ADM-G-256"))
-from pipeline import ADMPipeline
-pipe = ADMPipeline.from_pretrained(".")
-pipe.to("cuda")
-pipe.unet.float()
-pipe.classifier.float()
-pipe.classifier.model.dtype = torch.float32
-images = pipe(
-    class_labels=207,
     num_inference_steps=250,
-    classifier_guidance_scale=1.0,
-).images
 ```

 ## Load
 ```python
 from pathlib import Path
+import torch
+from diffusers import DDPMScheduler, DiffusionPipeline
+model_dir = Path("./BiliSakura/ADM-diffusers/ADM-G-256")
+pipe = DiffusionPipeline.from_pretrained(
+    str(model_dir),
+    local_files_only=True,
+    custom_pipeline=str(model_dir / "pipeline.py"),
+    torch_dtype=torch.bfloat16,
+)
+pipe = pipe.to("cuda")
+pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
+class_id = pipe.get_label_ids("golden retriever")[0]
+generator = torch.Generator(device="cuda").manual_seed(42)
+out = pipe(
+    class_labels=class_id,
+    guidance_scale=1.0,
     num_inference_steps=250,
+    generator=generator,
+).images[0]
+out
 ```

ADM-G-256/__pycache__/pipeline.cpython-312.pyc CHANGED Viewed

Binary files a/ADM-G-256/__pycache__/pipeline.cpython-312.pyc and b/ADM-G-256/__pycache__/pipeline.cpython-312.pyc differ

ADM-G-256/classifier/__pycache__/classifier_adm.cpython-312.pyc CHANGED Viewed

Binary files a/ADM-G-256/classifier/__pycache__/classifier_adm.cpython-312.pyc and b/ADM-G-256/classifier/__pycache__/classifier_adm.cpython-312.pyc differ

ADM-G-256/classifier/__pycache__/modeling_adm.cpython-312.pyc ADDED Viewed

Binary file (41.2 kB). View file

ADM-G-256/classifier/classifier_adm.py CHANGED Viewed

@@ -3,18 +3,524 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
-from modeling_adm import create_adm_classifier_model
 @dataclass

 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
+import math
+from abc import abstractmethod
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint as torch_checkpoint
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import get_timestep_embedding
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
+NUM_CLASSES = 1000
+def conv_nd(dims: int, *args, **kwargs):
+    if dims == 1:
+        return nn.Conv1d(*args, **kwargs)
+    if dims == 2:
+        return nn.Conv2d(*args, **kwargs)
+    if dims == 3:
+        return nn.Conv3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+def linear(*args, **kwargs):
+    return nn.Linear(*args, **kwargs)
+def avg_pool_nd(dims: int, *args, **kwargs):
+    if dims == 1:
+        return nn.AvgPool1d(*args, **kwargs)
+    if dims == 2:
+        return nn.AvgPool2d(*args, **kwargs)
+    if dims == 3:
+        return nn.AvgPool3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+class GroupNorm32(nn.GroupNorm):
+    def forward(self, x):
+        weight = self.weight.float() if self.weight is not None else None
+        bias = self.bias.float() if self.bias is not None else None
+        y = F.group_norm(x.float(), self.num_groups, weight, bias, self.eps)
+        return y.to(dtype=x.dtype)
+def normalization(channels: int):
+    return GroupNorm32(32, channels)
+def zero_module(module: nn.Module):
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+def convert_module_to_f16(module: nn.Module):
+    if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        module.weight.data = module.weight.data.half()
+        if module.bias is not None:
+            module.bias.data = module.bias.data.half()
+def convert_module_to_f32(module: nn.Module):
+    if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        module.weight.data = module.weight.data.float()
+        if module.bias is not None:
+            module.bias.data = module.bias.data.float()
+class TimestepBlock(nn.Module):
+    @abstractmethod
+    def forward(self, x, emb):
+        raise NotImplementedError
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    def forward(self, x, emb):
+        for layer in self:
+            if isinstance(layer, TimestepBlock):
+                x = layer(x, emb)
+            else:
+                x = layer(x)
+        return x
+class Upsample(nn.Module):
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        if use_conv:
+            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        if self.dims == 3:
+            x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest")
+        else:
+            x = F.interpolate(x, scale_factor=2, mode="nearest")
+        if self.use_conv:
+            x = self.conv(x)
+        return x
+class Downsample(nn.Module):
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        stride = 2 if dims != 3 else (1, 2, 2)
+        if use_conv:
+            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=1)
+        else:
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        return self.op(x)
+class ResBlock(TimestepBlock):
+    def __init__(
+        self,
+        channels,
+        emb_channels,
+        dropout,
+        out_channels=None,
+        use_conv=False,
+        use_scale_shift_norm=False,
+        dims=2,
+        use_checkpoint=False,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_checkpoint = use_checkpoint
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            conv_nd(dims, channels, self.out_channels, 3, padding=1),
+        )
+        self.updown = up or down
+        if up:
+            self.h_upd = Upsample(channels, False, dims)
+            self.x_upd = Upsample(channels, False, dims)
+        elif down:
+            self.h_upd = Downsample(channels, False, dims)
+            self.x_upd = Downsample(channels, False, dims)
+        else:
+            self.h_upd = self.x_upd = nn.Identity()
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            linear(emb_channels, 2 * self.out_channels if use_scale_shift_norm else self.out_channels),
+        )
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)),
+        )
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        elif use_conv:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
+        else:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+    def forward(self, x, emb):
+        if self.use_checkpoint and x.requires_grad:
+            return torch_checkpoint(self._forward, x, emb, use_reentrant=False)
+        return self._forward(x, emb)
+    def _forward(self, x, emb):
+        if self.updown:
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+        else:
+            h = self.in_layers(x)
+        emb_out = self.emb_layers(emb).type(h.dtype)
+        while len(emb_out.shape) < len(h.shape):
+            emb_out = emb_out[..., None]
+        if self.use_scale_shift_norm:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
+            h = out_norm(h) * (1 + scale) + shift
+            h = out_rest(h)
+        else:
+            h = h + emb_out
+            h = self.out_layers(h)
+        return self.skip_connection(x) + h
+class QKVAttentionLegacy(nn.Module):
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = torch.einsum("bct,bcs->bts", q * scale, k * scale)
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = torch.einsum("bts,bcs->bct", weight, v)
+        return a.reshape(bs, -1, length)
+class QKVAttention(nn.Module):
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        q, k, v = qkv.chunk(3, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = torch.einsum(
+            "bct,bcs->bts",
+            (q * scale).view(bs * self.n_heads, ch, length),
+            (k * scale).view(bs * self.n_heads, ch, length),
+        )
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = torch.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
+        return a.reshape(bs, -1, length)
+class AttentionBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        num_heads=1,
+        num_head_channels=-1,
+        use_checkpoint=False,
+        use_new_attention_order=False,
+    ):
+        super().__init__()
+        if num_head_channels == -1:
+            self.num_heads = num_heads
+        else:
+            assert channels % num_head_channels == 0
+            self.num_heads = channels // num_head_channels
+        self.use_checkpoint = use_checkpoint
+        self.norm = normalization(channels)
+        self.qkv = conv_nd(1, channels, channels * 3, 1)
+        self.attention = QKVAttention(self.num_heads) if use_new_attention_order else QKVAttentionLegacy(self.num_heads)
+        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
+    def forward(self, x):
+        if self.use_checkpoint and x.requires_grad:
+            return torch_checkpoint(self._forward, x, use_reentrant=False)
+        return self._forward(x)
+    def _forward(self, x):
+        b, c, *spatial = x.shape
+        x = x.reshape(b, c, -1)
+        qkv = self.qkv(self.norm(x))
+        h = self.attention(qkv)
+        h = self.proj_out(h)
+        return (x + h).reshape(b, c, *spatial)
+class AttentionPool2d(nn.Module):
+    """CLIP-style attention pooling used by ADM noisy classifiers."""
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads_channels: int, output_dim: int = None):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(torch.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5)
+        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+        self.num_heads = embed_dim // num_heads_channels
+        self.attention = QKVAttention(self.num_heads)
+    def forward(self, x):
+        b, c, *_spatial = x.shape
+        x = x.reshape(b, c, -1)
+        x = torch.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)
+        x = x + self.positional_embedding[None, :, :].to(x.dtype)
+        x = self.qkv_proj(x)
+        x = self.attention(x)
+        x = self.c_proj(x)
+        return x[:, :, 0]
+class EncoderUNetModel(nn.Module):
+    """Noisy image classifier backbone for ADM-G (classifier guidance)."""
+    def __init__(
+        self,
+        image_size,
+        in_channels,
+        model_channels,
+        out_channels,
+        num_res_blocks,
+        attention_resolutions,
+        dropout=0,
+        channel_mult=(1, 2, 4, 8),
+        conv_resample=True,
+        dims=2,
+        use_checkpoint=False,
+        use_fp16=False,
+        num_heads=1,
+        num_head_channels=-1,
+        use_scale_shift_norm=False,
+        resblock_updown=False,
+        use_new_attention_order=False,
+        pool="adaptive",
+    ):
+        super().__init__()
+        self.model_channels = model_channels
+        self.use_checkpoint = use_checkpoint
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+        ch = int(channel_mult[0] * model_channels)
+        self.input_blocks = nn.ModuleList([TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))])
+        self._feature_size = ch
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=int(mult * model_channels),
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(mult * model_channels)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                )
+                ch = out_ch
+                ds *= 2
+                self._feature_size += ch
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            AttentionBlock(
+                ch,
+                use_checkpoint=use_checkpoint,
+                num_heads=num_heads,
+                num_head_channels=num_head_channels,
+                use_new_attention_order=use_new_attention_order,
+            ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self._feature_size += ch
+        self.pool = pool
+        if pool == "adaptive":
+            self.out = nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                nn.AdaptiveAvgPool2d((1, 1)),
+                zero_module(conv_nd(dims, ch, out_channels, 1)),
+                nn.Flatten(),
+            )
+        elif pool == "attention":
+            assert num_head_channels != -1
+            self.out = nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels),
+            )
+        elif pool == "spatial":
+            self.out = nn.Sequential(
+                nn.Linear(self._feature_size, 2048),
+                nn.ReLU(),
+                nn.Linear(2048, out_channels),
+            )
+        elif pool == "spatial_v2":
+            self.out = nn.Sequential(
+                nn.Linear(self._feature_size, 2048),
+                normalization(2048),
+                nn.SiLU(),
+                nn.Linear(2048, out_channels),
+            )
+        else:
+            raise NotImplementedError(f"Unexpected {pool} pooling")
+    def convert_to_fp16(self):
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self):
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x, timesteps):
+        emb = get_timestep_embedding(timesteps, self.model_channels).to(dtype=self.time_embed[0].weight.dtype)
+        emb = self.time_embed(emb)
+        results = []
+        h = x.to(dtype=self.time_embed[0].weight.dtype)
+        for module in self.input_blocks:
+            h = module(h, emb)
+            if self.pool.startswith("spatial"):
+                results.append(h.to(dtype=self.time_embed[0].weight.dtype).mean(dim=(2, 3)))
+        h = self.middle_block(h, emb)
+        if self.pool.startswith("spatial"):
+            results.append(h.to(dtype=self.time_embed[0].weight.dtype).mean(dim=(2, 3)))
+            h = torch.cat(results, dim=-1)
+            return self.out(h)
+        h = h.to(dtype=self.time_embed[0].weight.dtype)
+        return self.out(h)
+def _default_channel_mult(image_size: int):
+    if image_size == 512:
+        return (0.5, 1, 1, 2, 2, 4, 4)
+    if image_size == 256:
+        return (1, 1, 2, 2, 4, 4)
+    if image_size == 128:
+        return (1, 1, 2, 3, 4)
+    if image_size == 64:
+        return (1, 2, 3, 4)
+    raise ValueError(f"unsupported image size: {image_size}")
+def create_adm_classifier_model(
+    image_size: int,
+    classifier_width: int = 128,
+    classifier_depth: int = 2,
+    classifier_attention_resolutions: str = "32,16,8",
+    classifier_use_scale_shift_norm: bool = True,
+    classifier_resblock_updown: bool = True,
+    classifier_pool: str = "attention",
+    use_fp16: bool = False,
+    num_classes: int = NUM_CLASSES,
+):
+    channel_mult = _default_channel_mult(image_size)
+    attention_ds = tuple(image_size // int(res) for res in classifier_attention_resolutions.split(","))
+    return EncoderUNetModel(
+        image_size=image_size,
+        in_channels=3,
+        model_channels=classifier_width,
+        out_channels=num_classes,
+        num_res_blocks=classifier_depth,
+        attention_resolutions=attention_ds,
+        channel_mult=channel_mult,
+        use_fp16=use_fp16,
+        num_head_channels=64,
+        use_scale_shift_norm=classifier_use_scale_shift_norm,
+        resblock_updown=classifier_resblock_updown,
+        pool=classifier_pool,
+    )
 @dataclass

ADM-G-256/model_index.json CHANGED Viewed

@@ -2,8 +2,8 @@
   "_class_name": "ADMPipeline",
   "_diffusers_version": "0.36.0",
   "scheduler": [
-    "scheduling_adm",
-    "ADMScheduler"
   ],
   "unet": [
     "unet_adm",

   "_class_name": "ADMPipeline",
   "_diffusers_version": "0.36.0",
   "scheduler": [
+    "diffusers",
+    "DDPMScheduler"
   ],
   "unet": [
     "unet_adm",

ADM-G-256/pipeline.py CHANGED Viewed

@@ -2,208 +2,72 @@
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
-"""Hub custom pipeline: ADMPipeline.
-Load with native Hugging Face diffusers and `trust_remote_code=True`.
-"""
-from __future__ import annotations
-import importlib
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
-import numpy as np
 import torch
-from tqdm.auto import tqdm
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.utils import BaseOutput, replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
         >>> import torch
         >>> from diffusers import DiffusionPipeline
-        >>> from pipeline import ADMPipeline
-        >>> pipe = ADMPipeline.from_pretrained(".")
-        >>> pipe.to("cuda")
-        >>> # ADM-G (classifier guidance) with numeric class id
-        >>> images = pipe(class_labels=207, classifier_guidance_scale=1.0, num_inference_steps=250).images
-        >>> # Or use human-readable ImageNet labels (English)
-        >>> pipe.id2label[207]
-        >>> class_ids = pipe.get_label_ids("golden retriever")
-        >>> images = pipe(class_labels="golden retriever", classifier_guidance_scale=1.0).images
         ```
 """
-@dataclass
-class ADMPipelineOutput(BaseOutput):
-    """
-    Output class for ADM pipelines.
-    Args:
-        images (`torch.Tensor` or `list[PIL.Image.Image]` or `np.ndarray`):
-            Generated images of shape `(batch_size, num_channels, height, width)` when `output_type="pt"`,
-            or a list of PIL images / NumPy array when post-processed.
-    """
-    images: Union[torch.Tensor, List, np.ndarray]
 class ADMPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for image generation with ADM (Ablated Diffusion Model).
-    Supports class-conditional ADM (labels embedded in the UNet) and **ADM-G** (unconditional UNet + noisy
-    classifier guidance). For ADM-G, pass `classifier_guidance_scale > 0` and provide `class_labels`; the
-    optional `classifier` predicts `p(y | x_t)` and steers sampling.
-    Args:
-        unet ([`ADMUNet2DModel`]):
-            A UNet model to denoise image samples (typically unconditional for ADM-G).
-        scheduler ([`ADMScheduler`]):
-            A scheduler used with the UNet to denoise image samples.
-        classifier ([`ADMClassifierModel`], *optional*):
-            Noisy ImageNet classifier for ADM-G guidance.
-        id2label (`dict[int, str]`, *optional*):
-            ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
-    """
     model_cpu_offload_seq = "classifier->unet"
     _optional_components = ["classifier"]
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
-        """Load a self-contained variant folder locally or from the Hub.
-        Examples:
-            ADMPipeline.from_pretrained(".")
-            ADMPipeline.from_pretrained("./ADM-G-256")
-            ADMPipeline.from_pretrained("BiliSakura/ADM-diffusers", subfolder="ADM-G-512")
-        """
-        repo_root = Path(__file__).resolve().parent
-        if pretrained_model_name_or_path in (None, "", "."):
-            variant = repo_root
-        elif (
-            isinstance(pretrained_model_name_or_path, str)
-            and "/" in pretrained_model_name_or_path
-            and not Path(pretrained_model_name_or_path).exists()
-        ):
-            from huggingface_hub import snapshot_download
-            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
-            if subfolder:
-                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
-            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
-            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
-        else:
-            variant = Path(pretrained_model_name_or_path)
-            if not variant.is_absolute():
-                candidate = (Path.cwd() / variant).resolve()
-                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
-            if subfolder:
-                variant = variant / subfolder
-        id2label_override = kwargs.pop("id2label", None)
-        model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        try:
-            unet = _load_component("unet", "unet_adm", "ADMUNet2DModel")
-            scheduler = _load_component("scheduler", "scheduling_adm", "ADMScheduler")
-            classifier = _load_component("classifier", "classifier_adm", "ADMClassifierModel")
-            if scheduler is None:
-                sched_dir = variant / "scheduler"
-                if (sched_dir / "scheduling_adm.py").exists():
-                    sched_path = str(sched_dir)
-                    if sched_path not in sys.path:
-                        sys.path.insert(0, sched_path)
-                        inserted.append(sched_path)
-                    scheduler = importlib.import_module("scheduling_adm").ADMScheduler()
-            if unet is None and classifier is None:
-                raise ValueError(f"No loadable components found under {variant}")
-            id2label = id2label_override
-            if id2label is None:
-                model_index_path = variant / "model_index.json"
-                if model_index_path.exists():
-                    id2label = cls._read_id2label_from_model_index(model_index_path)
-            return cls(
-                unet=unet,
-                scheduler=scheduler,
-                classifier=classifier,
-                id2label=id2label,
-            )
-        finally:
-            for comp_path in inserted:
-                if comp_path in sys.path:
-                    sys.path.remove(comp_path)
     def __init__(
         self,
         unet,
-        scheduler,
-        classifier=None,
-        id2label: Optional[Dict[Union[int, str], str]] = None,
-    ):
         super().__init__()
         self.register_modules(unet=unet, scheduler=scheduler, classifier=classifier)
         self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
-        self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
-    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
-        if not id2label:
-            return {}
-        return {int(key): value for key, value in id2label.items()}
-    @staticmethod
-    def _read_id2label_from_model_index(model_index_path: Path) -> Optional[Dict[int, str]]:
-        import json
-        raw = json.loads(model_index_path.read_text(encoding="utf-8"))
-        id2label = raw.get("id2label")
-        if not isinstance(id2label, dict):
-            return None
-        return {int(key): value for key, value in id2label.items()}
-    @staticmethod
-    def _build_label2id(id2label: dict[int, str]) -> dict[str, int]:
-        label2id: dict[str, int] = {}
         for class_id, value in id2label.items():
             for synonym in value.split(","):
                 synonym = synonym.strip()
@@ -212,153 +76,44 @@ class ADMPipeline(DiffusionPipeline):
         return dict(sorted(label2id.items()))
     @property
-    def id2label(self) -> dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-        Map ImageNet label strings to class ids.
-        Args:
-            label (`str` or `list[str]`):
-                One or more English ImageNet label strings matching a synonym in `id2label`.
-        Returns:
-            `list[int]`: Class ids for [`~ADMPipeline.__call__`].
-        """
-        label2id = self.labels
-        if not label2id:
-            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
-        if isinstance(label, str):
-            label = [label]
-        missing = [item for item in label if item not in label2id]
         if missing:
-            preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown English label(s): {missing}. Example valid labels: {preview}, ..."
-            )
-        return [label2id[item] for item in label]
-    @property
-    def do_classifier_guidance(self) -> bool:
-        return self.classifier is not None and getattr(self, "_classifier_guidance_scale", 0.0) > 0
-    def _normalize_class_labels(
-        self,
-        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]],
-    ) -> Optional[Union[int, List[int], torch.Tensor]]:
-        if class_labels is None:
-            return None
-        if isinstance(class_labels, str):
-            return self.get_label_ids(class_labels)[0]
-        if isinstance(class_labels, list) and class_labels and isinstance(class_labels[0], str):
-            return self.get_label_ids(class_labels)
-        return class_labels
-    def check_inputs(
-        self,
-        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]],
-        height: Optional[int],
-        width: Optional[int],
-    ):
-        if class_labels is None and self.unet.config.class_cond:
-            raise ValueError("`class_labels` are required for class-conditional ADM checkpoints.")
-        if class_labels is not None and self.classifier is None and not self.unet.config.class_cond:
-            raise ValueError(
-                "This checkpoint is unconditional and has no classifier. Load an ADM-G repo with a "
-                "`classifier/` subfolder, or use a class-conditional UNet."
-            )
-        if height is not None and height % 8 != 0:
-            raise ValueError(f"`height` must be divisible by 8 but is {height}.")
-        if width is not None and width % 8 != 0:
-            raise ValueError(f"`width` must be divisible by 8 but is {width}.")
-    def _prepare_class_labels(
-        self,
-        class_labels: Optional[Union[int, List[int], torch.Tensor]],
-        batch_size: int,
-        device: torch.device,
-    ) -> Optional[torch.Tensor]:
-        if class_labels is None:
-            return None
-        if isinstance(class_labels, int):
-            class_labels = [class_labels]
-        if not torch.is_tensor(class_labels):
-            class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
-        else:
-            class_labels = class_labels.to(device=device, dtype=torch.long)
-        if class_labels.shape[0] != batch_size:
-            raise ValueError(
-                f"`class_labels` batch ({class_labels.shape[0]}) must match requested batch size ({batch_size})."
-            )
-        return class_labels
-    def _get_classifier_grad(
-        self,
-        sample: torch.Tensor,
-        timestep: torch.Tensor,
-        class_labels: torch.Tensor,
-        classifier_scale: float,
-    ) -> torch.Tensor:
-        return self.classifier.guidance_gradient(
-            sample,
-            timestep,
-            class_labels,
-            classifier_scale=classifier_scale,
-        )
-    def prepare_latents(
-        self,
-        batch_size: int,
-        num_channels: int,
-        height: int,
-        width: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        Prepare initial Gaussian noise for pixel-space sampling.
-        Args:
-            batch_size (`int`):
-                Number of images to generate.
-            num_channels (`int`):
-                Number of image channels (typically 3).
-            height (`int`):
-                Image height in pixels.
-            width (`int`):
-                Image width in pixels.
-            dtype (`torch.dtype`):
-                Data type for the latent tensor.
-            device (`torch.device`):
-                Target device.
-            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
-                RNG for deterministic sampling.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated noise tensor.
-        Returns:
-            `torch.Tensor`:
-                Initial noise of shape `(batch_size, num_channels, height, width)`.
-        """
-        shape = (batch_size, num_channels, height, width)
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device=device, dtype=dtype)
-        return latents
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
@@ -369,142 +124,130 @@ class ADMPipeline(DiffusionPipeline):
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 250,
-        use_ddim: bool = False,
         eta: float = 0.0,
         clip_denoised: bool = True,
-        classifier_guidance_scale: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
-    ) -> Union[ADMPipelineOutput, Tuple]:
         r"""
-        Generate images with ADM.
-        Args:
-            class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.Tensor`, *optional*):
-                ImageNet class indices or English label strings. Required for class-conditional UNets and for ADM-G
-                classifier guidance. Strings are resolved via [`~ADMPipeline.get_label_ids`].
-            batch_size (`int`, *optional*, defaults to 1):
-                Number of images to generate when `class_labels` is not provided.
-            height (`int`, *optional*):
-                Height in pixels. Defaults to `unet.config.image_size`.
-            width (`int`, *optional*):
-                Width in pixels. Defaults to `unet.config.image_size`.
-            num_inference_steps (`int`, *optional*, defaults to 250):
-                Number of denoising steps.
-            use_ddim (`bool`, *optional*, defaults to `False`):
-                Use DDIM sampling instead of DDPM.
-            eta (`float`, *optional*, defaults to 0.0):
-                DDIM stochasticity parameter. Only used when `use_ddim=True`.
-            clip_denoised (`bool`, *optional*, defaults to `True`):
-                Clamp predicted `x_0` to `[-1, 1]` inside the scheduler.
-            classifier_guidance_scale (`float`, *optional*, defaults to 0.0):
-                ADM-G guidance strength. Values `> 0` require a loaded `classifier` (OpenAI `classifier_scale`).
-            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
-                RNG for reproducible generation.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated initial noise.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                Output format: `"pil"`, `"np"`, or `"pt"`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Return an [`ADMPipelineOutput`] instead of a tuple.
         Examples:
-        Returns:
-            [`ADMPipelineOutput`] or `tuple`:
-                Generated images.
         """
-        if height is None:
-            height = int(self.unet.config.image_size)
-        if width is None:
-            width = int(self.unet.config.image_size)
-        class_labels = self._normalize_class_labels(class_labels)
-        self.check_inputs(class_labels, height, width)
-        if classifier_guidance_scale > 0 and self.classifier is None:
-            raise ValueError("`classifier_guidance_scale > 0` requires a loaded `classifier` (ADM-G checkpoint).")
-        if classifier_guidance_scale > 0 and class_labels is None:
-            raise ValueError("`class_labels` are required when using classifier guidance.")
-        self._classifier_guidance_scale = classifier_guidance_scale
         device = self._execution_device
-        model_dtype = self.unet.dtype
         if class_labels is not None:
-            if isinstance(class_labels, int):
-                batch_size = 1
-            elif isinstance(class_labels, list):
-                batch_size = len(class_labels)
-            elif torch.is_tensor(class_labels):
-                batch_size = class_labels.shape[0]
-        class_labels = self._prepare_class_labels(class_labels, batch_size, device)
-        latents = self.prepare_latents(
-            batch_size,
-            3,
-            height,
-            width,
-            model_dtype,
-            device,
-            generator,
-            latents,
-        )
-        self.scheduler.set_timesteps(num_inference_steps, device=device, use_ddim=use_ddim)
-        self.scheduler._eta = eta
-        self._num_timesteps = len(self.scheduler.timesteps)
-        unet_class_labels = class_labels if self.unet.config.class_cond else None
-        for t in tqdm(self.scheduler.timesteps, desc="Denoising"):
-            timestep = torch.full((batch_size,), t, device=device, dtype=torch.long)
-            model_timesteps = self.scheduler.scale_timesteps_for_model(timestep)
-            model_output = self.unet(
-                latents,
-                model_timesteps,
-                class_labels=unet_class_labels,
-                return_dict=True,
-            ).sample
             cond_grad = None
-            if self.do_classifier_guidance:
-                cond_grad = self._get_classifier_grad(
-                    latents,
-                    timestep,
-                    class_labels,
-                    classifier_guidance_scale,
                 )
-            latents = self.scheduler.step(
-                model_output,
-                t,
-                latents,
-                generator=generator,
-                clip_denoised=clip_denoised,
-                eta=eta,
-                cond_grad=cond_grad,
-            ).prev_sample
-        image = latents
-        has_nsfw_concept = None
-        if output_type == "latent":
-            image = latents
-        elif output_type == "pt":
-            image = (image / 2 + 0.5).clamp(0, 1)
-        elif output_type in ("pil", "np"):
-            image = (image / 2 + 0.5).clamp(0, 1)
             image = self.image_processor.postprocess(image, output_type=output_type)
         self.maybe_free_model_hooks()
         if not return_dict:
-            return (image, has_nsfw_concept)
-        return ADMPipelineOutput(images=image)

 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 import torch
 from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
+        >>> from pathlib import Path
         >>> import torch
         >>> from diffusers import DiffusionPipeline
+        >>> model_dir = Path("path/to/BiliSakura/ADM-diffusers/ADM-G-256")
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     torch_dtype=torch.bfloat16,
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> class_id = pipe.get_label_ids("golden retriever")[0]
+        >>> image = pipe(class_labels=class_id, guidance_scale=1.0).images[0]
         ```
 """
 class ADMPipeline(DiffusionPipeline):
+    r"""ADM/ADM-G pipeline compatible with Diffusers custom pipeline loading."""
     model_cpu_offload_seq = "classifier->unet"
     _optional_components = ["classifier"]
     def __init__(
         self,
         unet,
+        scheduler: KarrasDiffusionSchedulers,
+        classifier: Optional[Any] = None,
+        id2label: Optional[Dict[str, str]] = None,
+        null_class_id: int = 1000,
+    ) -> None:
+        """Assemble the pipeline from its components.
+
+        Args:
+            unet: Model registered as the ``unet`` module.
+            scheduler: Scheduler registered as the ``scheduler`` module.
+            classifier: Module registered as ``classifier``; may be ``None``.
+            id2label: Mapping from class id to label string. Keys are coerced
+                with ``int()``; ``None`` or an empty mapping yields an empty
+                table.
+            null_class_id: Stored in the pipeline config after ``int()``
+                coercion.
+
+        Besides registering the modules, this sets ``self.image_processor``,
+        ``self._id2label`` (the int-keyed mapping) and ``self.labels`` (the
+        result of ``_build_label2id`` on that mapping).
+        """
         super().__init__()
         self.register_modules(unet=unet, scheduler=scheduler, classifier=classifier)
+        self.register_to_config(null_class_id=int(null_class_id))
         self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
+        self._id2label = {int(k): v for k, v in (id2label or {}).items()}
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
+    def _build_label2id(id2label: Dict[int, str]) -> Dict[str, int]:
+        label2id: Dict[str, int] = {}
         for class_id, value in id2label.items():
             for synonym in value.split(","):
                 synonym = synonym.strip()
         return dict(sorted(label2id.items()))
     @property
+    def id2label(self) -> Dict[int, str]:
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
+        """Translate one label string, or a list of them, into class ids.
+
+        Args:
+            label: A single label or a list of labels; each must be a key of
+                ``self.labels``.
+
+        Returns:
+            The class ids, in the same order as the requested labels.
+
+        Raises:
+            ValueError: If ``self.labels`` is empty, or if any requested label
+                is not a key of ``self.labels``.
+        """
+        label2id = self.labels
+        if not label2id:
+            raise ValueError("No id2label mapping is available in this checkpoint.")
+        requested = [label] if isinstance(label, str) else label
+        unknown = [name for name in requested if name not in label2id]
+        if unknown:
+            # Show a handful of valid keys so the caller can spot the expected spelling.
+            examples = ", ".join(list(label2id)[:8])
+            raise ValueError(f"Unknown labels: {unknown}. Example valid labels: {examples}, ...")
+        return [label2id[name] for name in requested]
+    @staticmethod
+    def prepare_extra_step_kwargs(
+        scheduler: KarrasDiffusionSchedulers,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+        eta: float,
+    ) -> Dict[str, Any]:
+        """Collect the optional keyword arguments that ``scheduler.step`` accepts.
+
+        ``eta`` and ``generator`` are forwarded only when the scheduler's
+        ``step`` signature declares a parameter of that name.
+
+        Args:
+            scheduler: Object whose ``step`` signature is inspected.
+            generator: Value passed through as ``generator`` when accepted.
+            eta: Value passed through as ``eta`` when accepted.
+
+        Returns:
+            A dict holding the subset of ``{"eta", "generator"}`` that ``step``
+            accepts.
+        """
+        accepted = inspect.signature(scheduler.step).parameters
+        candidates: Dict[str, Any] = {"eta": eta, "generator": generator}
+        return {name: value for name, value in candidates.items() if name in accepted}
+    @staticmethod
+    def _is_ddim_like(step_params: Set[str]) -> bool:
+        """Return ``True`` when ``"eta"`` is among the given ``step`` parameter names."""
+        return not {"eta"}.isdisjoint(step_params)
+    @staticmethod
+    def _expand_timestep(timestep, batch: int, device: torch.device) -> torch.Tensor:
+        """Broadcast a scheduler timestep to a 1-D tensor of length ``batch``.
+
+        Args:
+            timestep: A Python number or a tensor. Non-tensor values become a
+                ``torch.long`` tensor; tensor inputs keep their dtype.
+            batch: Length of the returned tensor.
+            device: Device the returned tensor lives on.
+
+        Returns:
+            A tensor of shape ``(batch,)`` on ``device``.
+
+        Raises:
+            RuntimeError: From ``Tensor.expand`` when a tensor input holds a
+                number of elements that is neither 1 nor ``batch``.
+        """
+        if not torch.is_tensor(timestep):
+            timestep = torch.tensor([timestep], dtype=torch.long, device=device)
+        else:
+            # Move every tensor input to the target device (previously only
+            # 0-dim tensors were moved) and flatten so that 0-dim and
+            # singleton-shaped inputs can both be expanded.
+            timestep = timestep.to(device=device).reshape(-1)
+        return timestep.expand(batch)
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 250,
+        guidance_scale: float = 1.0,
+        classifier_guidance_scale: float = 0.0,
         eta: float = 0.0,
         clip_denoised: bool = True,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
+        Generate samples from the ADM/ADM-G checkpoint.
         Examples:
+            <!-- this section is replaced by replace_example_docstring -->
         """
+        # Stage 1: check inputs
+        if isinstance(class_labels, str):
+            class_labels = self.get_label_ids(class_labels)[0]
+        if isinstance(class_labels, list) and class_labels and isinstance(class_labels[0], str):
+            class_labels = self.get_label_ids(class_labels)
+        native_size = int(getattr(self.unet.config, "image_size", 256))
+        height = native_size if height is None else int(height)
+        width = native_size if width is None else int(width)
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"height and width must be divisible by 8, got ({height}, {width}).")
+        if output_type not in {"pil", "np", "pt", "latent"}:
+            raise ValueError(f"Unsupported output_type: {output_type}")
+        # This checkpoint does not use classifier-free guidance (CFG).
+        # Keep classifier_guidance_scale for compatibility, but treat guidance_scale
+        # as the primary classifier-guidance strength.
+        effective_classifier_guidance_scale = (
+            float(classifier_guidance_scale) if classifier_guidance_scale > 0 else float(guidance_scale)
+        )
+        if class_labels is None and (self.unet.config.class_cond or effective_classifier_guidance_scale > 0):
+            raise ValueError("class_labels are required for class-conditional sampling and ADM-G guidance.")
+        if isinstance(class_labels, int):
+            batch_size = 1
+            class_labels = [class_labels]
+        elif isinstance(class_labels, list):
+            batch_size = len(class_labels)
+        elif torch.is_tensor(class_labels):
+            batch_size = int(class_labels.shape[0])
+        # Stage 2: define call parameters
         device = self._execution_device
+        channels = int(getattr(self.unet.config, "in_channels", 3))
+        dtype = self.unet.dtype
+        # Stage 3: prepare class conditioning
+        class_tensor = None
+        class_input = None
         if class_labels is not None:
+            class_tensor = class_labels if torch.is_tensor(class_labels) else torch.tensor(class_labels, dtype=torch.long)
+            class_tensor = class_tensor.to(device=device, dtype=torch.long).reshape(-1)
+            if class_tensor.shape[0] != batch_size:
+                raise ValueError("class_labels batch must match requested batch_size")
+            if self.unet.config.class_cond:
+                class_input = class_tensor
+        # Stage 4: prepare timesteps
+        scheduler = self.scheduler
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())
+        scheduler.set_timesteps(num_inference_steps, device=device)
+        # Stage 5: prepare latent variables
+        shape = (batch_size, channels, height, width)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if tuple(latents.shape) != shape:
+                raise ValueError(f"Unexpected latents shape {tuple(latents.shape)}; expected {shape}.")
+            latents = latents.to(device=device, dtype=dtype)
+        latents = latents * scheduler.init_noise_sigma
+        # Stage 6: prepare extra step kwargs
+        extra_step_kwargs = self.prepare_extra_step_kwargs(scheduler, generator, eta)
+        # Stage 7: denoising loop
+        for timestep in self.progress_bar(scheduler.timesteps):
+            model_input = latents
+            model_input = scheduler.scale_model_input(model_input, timestep)
+            timestep_input = self._expand_timestep(timestep, model_input.shape[0], model_input.device)
+            model_output = self.unet(model_input, timestep_input, class_labels=class_input, return_dict=True).sample
             cond_grad = None
+            if effective_classifier_guidance_scale > 0:
+                if self.classifier is None or class_tensor is None:
+                    raise ValueError("guidance_scale requires both classifier and class_labels.")
+                grad_t = self._expand_timestep(timestep, batch_size, latents.device)
+                cond_grad = self.classifier.guidance_gradient(
+                    latents, grad_t, class_tensor, classifier_scale=effective_classifier_guidance_scale
                 )
+            step_model_output = model_output
+            if cond_grad is not None:
+                if self._is_ddim_like(step_params):
+                    eps = model_output[:, :channels] if model_output.shape[1] == 2 * channels else model_output
+                    alpha_bar_t = scheduler.alphas_cumprod[timestep].to(device=latents.device, dtype=latents.dtype)
+                    step_model_output = eps - (1 - alpha_bar_t).sqrt() * cond_grad
+                elif hasattr(scheduler, "_get_variance"):
+                    pred_var = None
+                    if model_output.shape[1] == 2 * channels:
+                        _, pred_var = torch.split(model_output, channels, dim=1)
+                    variance = scheduler._get_variance(int(timestep), predicted_variance=pred_var)
+                    if scheduler.config.variance_type == "learned_range":
+                        variance = torch.exp(variance)
+                    latents = latents + variance * cond_grad
+                else:
+                    raise ValueError(
+                        "guidance_scale is not supported for the current scheduler. "
+                        "Use a DDPM/DDIM-compatible scheduler or disable classifier guidance."
+                    )
+            latents = scheduler.step(step_model_output, timestep, latents, return_dict=True, **extra_step_kwargs).prev_sample
+        image = latents if output_type == "latent" else (latents / 2 + 0.5).clamp(0, 1)
+        if output_type in {"pil", "np"}:
             image = self.image_processor.postprocess(image, output_type=output_type)
         self.maybe_free_model_hooks()
         if not return_dict:
+            return (image,)
+        return ImagePipelineOutput(images=image)

ADM-G-256/scheduler/scheduler_config.json CHANGED Viewed

@@ -1,11 +1,12 @@
 {
-  "_class_name": "ADMScheduler",
   "_diffusers_version": "0.36.0",
-  "learn_sigma": true,
-  "noise_schedule": "linear",
-  "predict_xstart": false,
-  "rescale_timesteps": false,
-  "sigma_small": false,
-  "steps": 1000,
-  "timestep_respacing": ""
 }

 {
+  "_class_name": "DDPMScheduler",
   "_diffusers_version": "0.36.0",
+  "num_train_timesteps": 1000,
+  "beta_start": 0.0001,
+  "beta_end": 0.02,
+  "beta_schedule": "linear",
+  "prediction_type": "epsilon",
+  "variance_type": "learned_range",
+  "clip_sample": true,
+  "timestep_spacing": "leading"
 }

ADM-G-256/unet/__pycache__/unet_adm.cpython-312.pyc CHANGED Viewed

Binary files a/ADM-G-256/unet/__pycache__/unet_adm.cpython-312.pyc and b/ADM-G-256/unet/__pycache__/unet_adm.cpython-312.pyc differ

ADM-G-256/unet/modeling_adm.py CHANGED Viewed

@@ -37,7 +37,10 @@ def avg_pool_nd(dims: int, *args, **kwargs):
 class GroupNorm32(nn.GroupNorm):
     def forward(self, x):
-        return super().forward(x.float()).type(x.dtype)
 def normalization(channels: int):
@@ -475,19 +478,20 @@ class EncoderUNetModel(nn.Module):
         self.middle_block.apply(convert_module_to_f32)
     def forward(self, x, timesteps):
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
         results = []
-        h = x.type(self.dtype)
         for module in self.input_blocks:
             h = module(h, emb)
             if self.pool.startswith("spatial"):
-                results.append(h.type(x.dtype).mean(dim=(2, 3)))
         h = self.middle_block(h, emb)
         if self.pool.startswith("spatial"):
-            results.append(h.type(x.dtype).mean(dim=(2, 3)))
             h = torch.cat(results, dim=-1)
             return self.out(h)
-        h = h.type(x.dtype)
         return self.out(h)
@@ -673,12 +677,13 @@ class UNetModel(nn.Module):
     def forward(self, x, timesteps, y: Optional[torch.Tensor] = None):
         assert (y is not None) == (self.num_classes is not None)
         hs = []
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
         if self.num_classes is not None:
             assert y.shape == (x.shape[0],)
             emb = emb + self.label_emb(y)
-        h = x.type(self.dtype)
         for module in self.input_blocks:
             h = module(h, emb)
             hs.append(h)
@@ -686,7 +691,7 @@ class UNetModel(nn.Module):
         for module in self.output_blocks:
             h = torch.cat([h, hs.pop()], dim=1)
             h = module(h, emb)
-        h = h.type(x.dtype)
         return self.out(h)

 class GroupNorm32(nn.GroupNorm):
+    """Group normalization computed in float32 and returned in the input dtype."""
+
     def forward(self, x):
+        """Normalize ``x`` using float32 copies of the input and affine parameters.
+
+        The result is cast back to ``x.dtype`` before being returned.
+        """
+        gamma = None if self.weight is None else self.weight.float()
+        beta = None if self.bias is None else self.bias.float()
+        normed = F.group_norm(x.float(), self.num_groups, weight=gamma, bias=beta, eps=self.eps)
+        return normed.to(dtype=x.dtype)
 def normalization(channels: int):
         self.middle_block.apply(convert_module_to_f32)
     def forward(self, x, timesteps):
+        """Embed the timesteps, run the encoder stack, and apply the output head.
+
+        Activations are cast to the dtype of ``self.time_embed[0].weight``.
+        When ``self.pool`` starts with ``"spatial"``, the spatial mean of the
+        features after every input block and after the middle block is
+        concatenated along the last dimension and fed to ``self.out``.
+        Otherwise only the middle-block output is fed to ``self.out``.
+        """
+        compute_dtype = self.time_embed[0].weight.dtype
+        use_spatial_pool = self.pool.startswith("spatial")
+        t_emb = timestep_embedding(timesteps, self.model_channels)
+        emb = self.time_embed(t_emb.to(dtype=compute_dtype))
+        pooled = []
+        h = x.to(dtype=compute_dtype)
+        for module in self.input_blocks:
+            h = module(h, emb)
+            if use_spatial_pool:
+                pooled.append(h.to(dtype=compute_dtype).mean(dim=(2, 3)))
+        h = self.middle_block(h, emb)
+        if not use_spatial_pool:
+            return self.out(h.to(dtype=compute_dtype))
+        pooled.append(h.to(dtype=compute_dtype).mean(dim=(2, 3)))
+        return self.out(torch.cat(pooled, dim=-1))
     def forward(self, x, timesteps, y: Optional[torch.Tensor] = None):
         assert (y is not None) == (self.num_classes is not None)
         hs = []
+        emb = timestep_embedding(timesteps, self.model_channels).to(dtype=self.time_embed[0].weight.dtype)
+        emb = self.time_embed(emb)
         if self.num_classes is not None:
             assert y.shape == (x.shape[0],)
             emb = emb + self.label_emb(y)
+        h = x.to(dtype=self.time_embed[0].weight.dtype)
         for module in self.input_blocks:
             h = module(h, emb)
             hs.append(h)
         for module in self.output_blocks:
             h = torch.cat([h, hs.pop()], dim=1)
             h = module(h, emb)
+        h = h.to(dtype=self.time_embed[0].weight.dtype)
         return self.out(h)

ADM-G-256/unet/unet_adm.py CHANGED Viewed

@@ -12,7 +12,12 @@ from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
-from modeling_adm import create_adm_unet_model
 @dataclass

 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
+try:
+    from .modeling_adm import create_adm_unet_model
+except ImportError:
+    import importlib
+    create_adm_unet_model = importlib.import_module("modeling_adm").create_adm_unet_model
 @dataclass

ADM-G-512/README.md CHANGED Viewed

@@ -10,6 +10,8 @@ Self-contained ADM-G checkpoint inside [`BiliSakura/ADM-diffusers`](https://hugg
 ![ADM-G-512 demo](demo.png)
 ## Layout
 ```text
@@ -25,23 +27,27 @@ ADM-G-512/
 ## Load
 ```python
-import sys
 from pathlib import Path
-from huggingface_hub import snapshot_download
-repo_dir = Path(snapshot_download("BiliSakura/ADM-diffusers"))
-sys.path.insert(0, str(repo_dir / "ADM-G-512"))
-from pipeline import ADMPipeline
-pipe = ADMPipeline.from_pretrained(".")
-pipe.to("cuda")
-pipe.unet.float()
-pipe.classifier.float()
-pipe.classifier.model.dtype = torch.float32
-images = pipe(
-    class_labels=207,
-    num_inference_steps=250,
-    classifier_guidance_scale=4.0,
-).images
 ```

 ![ADM-G-512 demo](demo.png)
+Settings used for this demo image: `ADM-G-512`, `DDIMScheduler`, `num_inference_steps=50`, `guidance_scale=4.0`, `seed=42`, class `"golden retriever"`.
 ## Layout
 ```text
 ## Load
 ```python
 from pathlib import Path
+import torch
+from diffusers import DDIMScheduler, DiffusionPipeline
+model_dir = Path("./BiliSakura/ADM-diffusers/ADM-G-512")
+pipe = DiffusionPipeline.from_pretrained(
+    str(model_dir),
+    local_files_only=True,
+    custom_pipeline=str(model_dir / "pipeline.py"),
+    torch_dtype=torch.bfloat16,
+)
+pipe = pipe.to("cuda")
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+class_id = pipe.get_label_ids("golden retriever")[0]
+generator = torch.Generator(device="cuda").manual_seed(42)
+out = pipe(
+    class_labels=class_id,
+    guidance_scale=4.0,
+    num_inference_steps=50,
+    generator=generator,
+).images[0]
+out
 ```

ADM-G-512/__pycache__/pipeline.cpython-312.pyc CHANGED Viewed

Binary files a/ADM-G-512/__pycache__/pipeline.cpython-312.pyc and b/ADM-G-512/__pycache__/pipeline.cpython-312.pyc differ

ADM-G-512/classifier/__pycache__/classifier_adm.cpython-312.pyc CHANGED Viewed

Binary files a/ADM-G-512/classifier/__pycache__/classifier_adm.cpython-312.pyc and b/ADM-G-512/classifier/__pycache__/classifier_adm.cpython-312.pyc differ

ADM-G-512/classifier/classifier_adm.py CHANGED Viewed

@@ -3,18 +3,524 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
-from modeling_adm import create_adm_classifier_model
 @dataclass

 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
+import math
+from abc import abstractmethod
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint as torch_checkpoint
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import get_timestep_embedding
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
+NUM_CLASSES = 1000
+def conv_nd(dims: int, *args, **kwargs):
+    """Build a 1D, 2D or 3D convolution layer; all other arguments go to the torch constructor.
+    Raises:
+        ValueError: if `dims` is not 1, 2 or 3.
+    """
+    conv_classes = {1: nn.Conv1d, 2: nn.Conv2d, 3: nn.Conv3d}
+    conv_cls = conv_classes.get(dims)
+    if conv_cls is None:
+        raise ValueError(f"unsupported dimensions: {dims}")
+    return conv_cls(*args, **kwargs)
+def linear(*args, **kwargs):
+    """Create an `nn.Linear` layer; every argument is forwarded unchanged."""
+    return nn.Linear(*args, **kwargs)
+def avg_pool_nd(dims: int, *args, **kwargs):
+    """Build a 1D, 2D or 3D average-pooling layer; all other arguments go to the torch constructor.
+    Raises:
+        ValueError: if `dims` is not 1, 2 or 3.
+    """
+    pool_classes = {1: nn.AvgPool1d, 2: nn.AvgPool2d, 3: nn.AvgPool3d}
+    pool_cls = pool_classes.get(dims)
+    if pool_cls is None:
+        raise ValueError(f"unsupported dimensions: {dims}")
+    return pool_cls(*args, **kwargs)
+class GroupNorm32(nn.GroupNorm):
+    """Group normalization that computes in float32 and returns a tensor in the caller's dtype."""
+    def forward(self, x):
+        input_dtype = x.dtype
+        # The affine parameters may be absent (None); upcast them only when present.
+        weight_fp32 = None if self.weight is None else self.weight.float()
+        bias_fp32 = None if self.bias is None else self.bias.float()
+        normalized = F.group_norm(x.float(), self.num_groups, weight_fp32, bias_fp32, self.eps)
+        return normalized.to(dtype=input_dtype)
+def normalization(channels: int):
+    """Return the normalization layer used throughout this file: a `GroupNorm32` with 32 groups."""
+    return GroupNorm32(num_groups=32, num_channels=channels)
+def zero_module(module: nn.Module):
+    """Set every parameter of `module` to zero in place and return the same module."""
+    # Run outside autograd so the in-place write on leaf parameters is not recorded.
+    with torch.no_grad():
+        for param in module.parameters():
+            param.zero_()
+    return module
+def convert_module_to_f16(module: nn.Module):
+    """Cast the weight and bias of a convolution layer to float16; any other module is left untouched."""
+    if not isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        return
+    module.weight.data = module.weight.data.half()
+    bias = module.bias
+    if bias is not None:
+        bias.data = bias.data.half()
+def convert_module_to_f32(module: nn.Module):
+    """Cast the weight and bias of a convolution layer to float32; any other module is left untouched."""
+    if not isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        return
+    module.weight.data = module.weight.data.float()
+    bias = module.bias
+    if bias is not None:
+        bias.data = bias.data.float()
+class TimestepBlock(nn.Module):
+    """Base class for modules whose `forward` takes a timestep embedding as its second argument."""
+    @abstractmethod
+    def forward(self, x, emb):
+        """Apply the module to `x` given the embedding `emb`; this base implementation always raises."""
+        raise NotImplementedError
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    """Sequential container that hands the timestep embedding only to children that accept it."""
+    def forward(self, x, emb):
+        hidden = x
+        for child in self:
+            # Only TimestepBlock children take the embedding; everything else is called as a plain layer.
+            hidden = child(hidden, emb) if isinstance(child, TimestepBlock) else child(hidden)
+        return hidden
+class Upsample(nn.Module):
+    """Nearest-neighbour 2x upsampling, optionally followed by a 3x3 convolution.
+    Args:
+        channels: expected number of input channels (checked in `forward`).
+        use_conv: whether to apply a convolution after interpolation.
+        dims: 1, 2 or 3; with 3 only the last two axes are upsampled.
+        out_channels: output channels of the convolution; falls back to `channels` when falsy.
+    """
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        if use_conv:
+            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        if self.dims == 3:
+            # Keep the first spatial axis as-is and double only the last two.
+            depth, height, width = x.shape[2], x.shape[3], x.shape[4]
+            upsampled = F.interpolate(x, (depth, height * 2, width * 2), mode="nearest")
+        else:
+            upsampled = F.interpolate(x, scale_factor=2, mode="nearest")
+        return self.conv(upsampled) if self.use_conv else upsampled
+class Downsample(nn.Module):
+    """2x downsampling through a strided 3x3 convolution or, without `use_conv`, average pooling.
+    Args:
+        channels: expected number of input channels (checked in `forward`).
+        use_conv: choose the strided convolution instead of average pooling.
+        dims: 1, 2 or 3; with 3 only the last two axes are downsampled.
+        out_channels: output channels; falls back to `channels` when falsy and must equal it when pooling.
+    """
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        # 3D inputs are reduced along their last two axes only.
+        step = (1, 2, 2) if dims == 3 else 2
+        if not use_conv:
+            # Pooling cannot change the channel count.
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=step, stride=step)
+        else:
+            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=step, padding=1)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        return self.op(x)
+class ResBlock(TimestepBlock):
+    """Residual block conditioned on a timestep embedding, optionally resampling by a factor of 2.
+    Args:
+        channels: number of input channels.
+        emb_channels: width of the embedding passed to `forward`.
+        dropout: dropout probability applied before the final convolution.
+        out_channels: number of output channels; falls back to `channels` when falsy.
+        use_conv: when the channel count changes, use a 3x3 instead of a 1x1 convolution on the skip path.
+        use_scale_shift_norm: use the embedding as a scale and shift after normalization instead of adding it.
+        dims: 1, 2 or 3, selecting the convolution type.
+        use_checkpoint: run through `torch.utils.checkpoint` when the input requires grad.
+        up: upsample both branches inside the block.
+        down: downsample both branches inside the block (only considered when `up` is false).
+    """
+    def __init__(
+        self,
+        channels,
+        emb_channels,
+        dropout,
+        out_channels=None,
+        use_conv=False,
+        use_scale_shift_norm=False,
+        dims=2,
+        use_checkpoint=False,
+        up=False,
+        down=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_checkpoint = use_checkpoint
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            conv_nd(dims, channels, self.out_channels, 3, padding=1),
+        )
+        self.updown = up or down
+        # h_upd resamples the residual branch, x_upd the skip branch; both use non-conv resampling.
+        if up:
+            self.h_upd = Upsample(channels, False, dims)
+            self.x_upd = Upsample(channels, False, dims)
+        elif down:
+            self.h_upd = Downsample(channels, False, dims)
+            self.x_upd = Downsample(channels, False, dims)
+        else:
+            self.h_upd = self.x_upd = nn.Identity()
+        # With scale-shift conditioning the projection emits both a scale and a shift, hence twice the width.
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            linear(emb_channels, 2 * self.out_channels if use_scale_shift_norm else self.out_channels),
+        )
+        # The last convolution is zero-initialised.
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)),
+        )
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        elif use_conv:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
+        else:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+    def forward(self, x, emb):
+        """Apply the block to `x` with embedding `emb`, checkpointing when enabled and `x` requires grad."""
+        if self.use_checkpoint and x.requires_grad:
+            return torch_checkpoint(self._forward, x, emb, use_reentrant=False)
+        return self._forward(x, emb)
+    def _forward(self, x, emb):
+        """Compute `skip_connection(x) + h`, where `h` is the embedding-conditioned residual branch."""
+        if self.updown:
+            # Resample after norm + activation but before the convolution; x is resampled as well
+            # so the skip path sees the same spatial size as h.
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+        else:
+            h = self.in_layers(x)
+        emb_out = self.emb_layers(emb).type(h.dtype)
+        # Append trailing singleton axes until the embedding has as many dimensions as h.
+        while len(emb_out.shape) < len(h.shape):
+            emb_out = emb_out[..., None]
+        if self.use_scale_shift_norm:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
+            h = out_norm(h) * (1 + scale) + shift
+            h = out_rest(h)
+        else:
+            h = h + emb_out
+            h = self.out_layers(h)
+        return self.skip_connection(x) + h
+class QKVAttentionLegacy(nn.Module):
+    """Multi-head attention over a packed QKV tensor that is split per head before Q, K and V are separated."""
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        """Attend over the last axis of `qkv` (shape `[batch, 3 * heads * head_dim, length]`).
+        Returns a tensor of shape `[batch, heads * head_dim, length]`.
+        """
+        batch, width, length = qkv.shape
+        heads = self.n_heads
+        assert width % (3 * heads) == 0
+        head_dim = width // (3 * heads)
+        per_head = qkv.reshape(batch * heads, head_dim * 3, length)
+        query, key, value = per_head.split(head_dim, dim=1)
+        # The factor is applied to both query and key, so their product is scaled by 1 / sqrt(head_dim).
+        scale = 1 / math.sqrt(math.sqrt(head_dim))
+        scores = torch.einsum("bct,bcs->bts", query * scale, key * scale)
+        # Softmax runs in float32 and is cast back to the score dtype.
+        probs = torch.softmax(scores.float(), dim=-1).type(scores.dtype)
+        attended = torch.einsum("bts,bcs->bct", probs, value)
+        return attended.reshape(batch, -1, length)
+class QKVAttention(nn.Module):
+    """Multi-head attention over a packed QKV tensor that is chunked into Q, K and V before the head split."""
+    def __init__(self, n_heads):
+        super().__init__()
+        self.n_heads = n_heads
+    def forward(self, qkv):
+        """Attend over the last axis of `qkv` (shape `[batch, 3 * heads * head_dim, length]`).
+        Returns a tensor of shape `[batch, heads * head_dim, length]`.
+        """
+        batch, width, length = qkv.shape
+        heads = self.n_heads
+        assert width % (3 * heads) == 0
+        head_dim = width // (3 * heads)
+        query, key, value = qkv.chunk(3, dim=1)
+        # The factor is applied to both query and key, so their product is scaled by 1 / sqrt(head_dim).
+        scale = 1 / math.sqrt(math.sqrt(head_dim))
+        query_heads = (query * scale).view(batch * heads, head_dim, length)
+        key_heads = (key * scale).view(batch * heads, head_dim, length)
+        scores = torch.einsum("bct,bcs->bts", query_heads, key_heads)
+        # Softmax runs in float32 and is cast back to the score dtype.
+        probs = torch.softmax(scores.float(), dim=-1).type(scores.dtype)
+        value_heads = value.reshape(batch * heads, head_dim, length)
+        attended = torch.einsum("bts,bcs->bct", probs, value_heads)
+        return attended.reshape(batch, -1, length)
+class AttentionBlock(nn.Module):
+    """Self-attention across all spatial positions of a feature map, added back to the input.
+    Args:
+        channels: number of input and output channels.
+        num_heads: number of heads, used only when `num_head_channels` is -1.
+        num_head_channels: channels per head; when not -1 the head count is `channels // num_head_channels`.
+        use_checkpoint: run through `torch.utils.checkpoint` when the input requires grad.
+        use_new_attention_order: pick `QKVAttention` instead of `QKVAttentionLegacy`.
+    """
+    def __init__(
+        self,
+        channels,
+        num_heads=1,
+        num_head_channels=-1,
+        use_checkpoint=False,
+        use_new_attention_order=False,
+    ):
+        super().__init__()
+        if num_head_channels != -1:
+            assert channels % num_head_channels == 0
+            self.num_heads = channels // num_head_channels
+        else:
+            self.num_heads = num_heads
+        self.use_checkpoint = use_checkpoint
+        self.norm = normalization(channels)
+        self.qkv = conv_nd(1, channels, channels * 3, 1)
+        attention_cls = QKVAttention if use_new_attention_order else QKVAttentionLegacy
+        self.attention = attention_cls(self.num_heads)
+        # The output projection is zero-initialised.
+        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
+    def forward(self, x):
+        """Apply attention to `x`, checkpointing when enabled and `x` requires grad."""
+        if self.use_checkpoint and x.requires_grad:
+            return torch_checkpoint(self._forward, x, use_reentrant=False)
+        return self._forward(x)
+    def _forward(self, x):
+        """Flatten the spatial axes, attend, add the residual and restore the original shape."""
+        batch, channels, *spatial = x.shape
+        flat = x.reshape(batch, channels, -1)
+        attended = self.attention(self.qkv(self.norm(flat)))
+        return (flat + self.proj_out(attended)).reshape(batch, channels, *spatial)
+class AttentionPool2d(nn.Module):
+    """CLIP-style attention pooling used by ADM noisy classifiers.
+    Args:
+        spacial_dim: side length of the square feature map; the positional table has `spacial_dim**2 + 1` slots.
+        embed_dim: number of input channels.
+        num_heads_channels: channels per attention head (`embed_dim // num_heads_channels` heads are used).
+        output_dim: number of output channels; falls back to `embed_dim` when falsy.
+    """
+    def __init__(self, spacial_dim: int, embed_dim: int, num_heads_channels: int, output_dim: int = None):
+        super().__init__()
+        # One positional slot per spatial location plus one for the prepended mean token.
+        token_count = spacial_dim**2 + 1
+        self.positional_embedding = nn.Parameter(torch.randn(embed_dim, token_count) / embed_dim**0.5)
+        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+        self.num_heads = embed_dim // num_heads_channels
+        self.attention = QKVAttention(self.num_heads)
+    def forward(self, x):
+        """Pool `x` of shape `[batch, channels, *spatial]` into `[batch, output channels]`."""
+        batch, channels, *_ = x.shape
+        tokens = x.reshape(batch, channels, -1)
+        # Prepend the spatial mean as an extra token; its output at index 0 is the pooled result.
+        mean_token = tokens.mean(dim=-1, keepdim=True)
+        tokens = torch.cat([mean_token, tokens], dim=-1)
+        tokens = tokens + self.positional_embedding[None, :, :].to(tokens.dtype)
+        pooled = self.c_proj(self.attention(self.qkv_proj(tokens)))
+        return pooled[:, :, 0]
+class EncoderUNetModel(nn.Module):
+    """Noisy image classifier backbone for ADM-G (classifier guidance).
+    The network is the downsampling half of a UNet followed by a pooling head.
+    Args:
+        image_size: input resolution; only used to size the attention-pool positional table.
+        in_channels: number of input image channels.
+        model_channels: base channel width; the timestep embedding is 4x this size.
+        out_channels: size of the output vector produced by the head.
+        num_res_blocks: residual blocks per resolution level.
+        attention_resolutions: downsample factors (1, 2, 4, ...) at which attention blocks are inserted.
+        dropout: dropout probability passed to every `ResBlock`.
+        channel_mult: per-level multipliers applied to `model_channels`.
+        conv_resample: use a strided convolution (instead of pooling) in plain `Downsample` layers.
+        dims: 1, 2 or 3, selecting the convolution type of the trunk.
+        use_checkpoint: enable gradient checkpointing in residual and attention blocks.
+        use_fp16: only recorded as `self.dtype`; no weights are converted here, call `convert_to_fp16()`.
+        num_heads: attention heads, used when `num_head_channels` is -1.
+        num_head_channels: channels per attention head, or -1 to use `num_heads`.
+        use_scale_shift_norm: forwarded to every `ResBlock`.
+        resblock_updown: downsample with a `ResBlock(down=True)` instead of a `Downsample` layer.
+        use_new_attention_order: forwarded to every `AttentionBlock`.
+        pool: head type, one of "adaptive", "attention", "spatial" or "spatial_v2".
+    Raises:
+        NotImplementedError: if `pool` is not one of the supported head types.
+    """
+    def __init__(
+        self,
+        image_size,
+        in_channels,
+        model_channels,
+        out_channels,
+        num_res_blocks,
+        attention_resolutions,
+        dropout=0,
+        channel_mult=(1, 2, 4, 8),
+        conv_resample=True,
+        dims=2,
+        use_checkpoint=False,
+        use_fp16=False,
+        num_heads=1,
+        num_head_channels=-1,
+        use_scale_shift_norm=False,
+        resblock_updown=False,
+        use_new_attention_order=False,
+        pool="adaptive",
+    ):
+        super().__init__()
+        self.model_channels = model_channels
+        self.use_checkpoint = use_checkpoint
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+        ch = int(channel_mult[0] * model_channels)
+        self.input_blocks = nn.ModuleList([TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))])
+        # _feature_size accumulates the channel count of every block output; the "spatial" heads
+        # concatenate one pooled vector per block and need this total as their input width.
+        self._feature_size = ch
+        # ds tracks the current downsample factor relative to the input resolution.
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=int(mult * model_channels),
+                        dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(mult * model_channels)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+            # Every level except the last ends with a 2x downsampling stage.
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True,
+                        )
+                        if resblock_updown
+                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                )
+                ch = out_ch
+                ds *= 2
+                self._feature_size += ch
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+            AttentionBlock(
+                ch,
+                use_checkpoint=use_checkpoint,
+                num_heads=num_heads,
+                num_head_channels=num_head_channels,
+                use_new_attention_order=use_new_attention_order,
+            ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm,
+            ),
+        )
+        self._feature_size += ch
+        self.pool = pool
+        if pool == "adaptive":
+            self.out = nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                nn.AdaptiveAvgPool2d((1, 1)),
+                zero_module(conv_nd(dims, ch, out_channels, 1)),
+                nn.Flatten(),
+            )
+        elif pool == "attention":
+            assert num_head_channels != -1
+            self.out = nn.Sequential(
+                normalization(ch),
+                nn.SiLU(),
+                AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels),
+            )
+        elif pool == "spatial":
+            self.out = nn.Sequential(
+                nn.Linear(self._feature_size, 2048),
+                nn.ReLU(),
+                nn.Linear(2048, out_channels),
+            )
+        elif pool == "spatial_v2":
+            self.out = nn.Sequential(
+                nn.Linear(self._feature_size, 2048),
+                normalization(2048),
+                nn.SiLU(),
+                nn.Linear(2048, out_channels),
+            )
+        else:
+            raise NotImplementedError(f"Unexpected {pool} pooling")
+    def convert_to_fp16(self):
+        """Cast the convolutions of the trunk (`input_blocks`, `middle_block`) to float16.
+        `time_embed` and the `out` head are not touched and keep their current dtype.
+        """
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+    def convert_to_fp32(self):
+        """Cast the convolutions of the trunk (`input_blocks`, `middle_block`) back to float32."""
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+    def forward(self, x, timesteps):
+        """Run the classifier on a batch of noisy inputs.
+        Args:
+            x: input batch; it is cast to the dtype of the first trunk convolution.
+            timesteps: diffusion timesteps, turned into an embedding via `get_timestep_embedding`.
+        Returns:
+            The output of the pooling head `self.out`.
+        """
+        # time_embed and the head are never converted by convert_to_fp16(), so they share this dtype.
+        head_dtype = self.time_embed[0].weight.dtype
+        # The trunk convolutions can be in a different precision after convert_to_fp16(). Feeding them
+        # a tensor in head_dtype would fail with a dtype mismatch, so follow the first input convolution.
+        trunk_dtype = self.input_blocks[0][0].weight.dtype
+        emb = get_timestep_embedding(timesteps, self.model_channels).to(dtype=head_dtype)
+        emb = self.time_embed(emb)
+        spatial_pool = self.pool.startswith("spatial")
+        results = []
+        h = x.to(dtype=trunk_dtype)
+        for module in self.input_blocks:
+            h = module(h, emb)
+            if spatial_pool:
+                # The spatial heads consume the spatial mean of every block output.
+                results.append(h.to(dtype=head_dtype).mean(dim=(2, 3)))
+        h = self.middle_block(h, emb)
+        if spatial_pool:
+            results.append(h.to(dtype=head_dtype).mean(dim=(2, 3)))
+            h = torch.cat(results, dim=-1)
+            return self.out(h)
+        h = h.to(dtype=head_dtype)
+        return self.out(h)
+def _default_channel_mult(image_size: int):
+    """Return the per-level channel multipliers used for a given input resolution.
+    Raises:
+        ValueError: if `image_size` is not 512, 256, 128 or 64.
+    """
+    multipliers_by_size = {
+        512: (0.5, 1, 1, 2, 2, 4, 4),
+        256: (1, 1, 2, 2, 4, 4),
+        128: (1, 1, 2, 3, 4),
+        64: (1, 2, 3, 4),
+    }
+    multipliers = multipliers_by_size.get(image_size)
+    if multipliers is None:
+        raise ValueError(f"unsupported image size: {image_size}")
+    return multipliers
+def create_adm_classifier_model(
+    image_size: int,
+    classifier_width: int = 128,
+    classifier_depth: int = 2,
+    classifier_attention_resolutions: str = "32,16,8",
+    classifier_use_scale_shift_norm: bool = True,
+    classifier_resblock_updown: bool = True,
+    classifier_pool: str = "attention",
+    use_fp16: bool = False,
+    num_classes: int = NUM_CLASSES,
+):
+    """Build the `EncoderUNetModel` classifier for a given input resolution.
+    Args:
+        image_size: input resolution; must be one accepted by `_default_channel_mult`.
+        classifier_width: base channel width (`model_channels`).
+        classifier_depth: residual blocks per level (`num_res_blocks`).
+        classifier_attention_resolutions: comma-separated feature-map sizes that get attention blocks.
+        classifier_use_scale_shift_norm: forwarded as `use_scale_shift_norm`.
+        classifier_resblock_updown: forwarded as `resblock_updown`.
+        classifier_pool: head type, forwarded as `pool`.
+        use_fp16: forwarded as `use_fp16`.
+        num_classes: number of classifier outputs (`out_channels`).
+    """
+    # Resolve the multipliers first so an unsupported image size is reported before anything else.
+    level_multipliers = _default_channel_mult(image_size)
+    resolutions = [int(token) for token in classifier_attention_resolutions.split(",")]
+    # The model expects downsample factors, so convert each feature-map size accordingly.
+    attention_ds = tuple(image_size // resolution for resolution in resolutions)
+    return EncoderUNetModel(
+        image_size=image_size,
+        in_channels=3,
+        model_channels=classifier_width,
+        out_channels=num_classes,
+        num_res_blocks=classifier_depth,
+        attention_resolutions=attention_ds,
+        channel_mult=level_multipliers,
+        use_fp16=use_fp16,
+        num_head_channels=64,
+        use_scale_shift_norm=classifier_use_scale_shift_norm,
+        resblock_updown=classifier_resblock_updown,
+        pool=classifier_pool,
+    )
 @dataclass

ADM-G-512/demo.png CHANGED Viewed

Git LFS Details

SHA256: e6e3afc16ac17292f33ae8f13f28145d33a882ae781a7a42c137687c8f98dea8
Pointer size: 131 Bytes
Size of remote file: 326 kB

Git LFS Details

SHA256: 82ea34d28d5fe28f719a7142da3194e6cfc860db7ac51f0478dba6600e87bf56
Pointer size: 131 Bytes
Size of remote file: 300 kB

ADM-G-512/model_index.json CHANGED Viewed

@@ -2,8 +2,8 @@
   "_class_name": "ADMPipeline",
   "_diffusers_version": "0.36.0",
   "scheduler": [
-    "scheduling_adm",
-    "ADMScheduler"
   ],
   "unet": [
     "unet_adm",

   "_class_name": "ADMPipeline",
   "_diffusers_version": "0.36.0",
   "scheduler": [
+    "diffusers",
+    "DDPMScheduler"
   ],
   "unet": [
     "unet_adm",

ADM-G-512/pipeline.py CHANGED Viewed

@@ -2,208 +2,72 @@
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
-"""Hub custom pipeline: ADMPipeline.
-Load with native Hugging Face diffusers and `trust_remote_code=True`.
-"""
-from __future__ import annotations
-import importlib
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
-import numpy as np
 import torch
-from tqdm.auto import tqdm
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
-from diffusers.utils import BaseOutput, replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
         >>> import torch
         >>> from diffusers import DiffusionPipeline
-        >>> from pipeline import ADMPipeline
-        >>> pipe = ADMPipeline.from_pretrained(".")
-        >>> pipe.to("cuda")
-        >>> # ADM-G (classifier guidance) with numeric class id
-        >>> images = pipe(class_labels=207, classifier_guidance_scale=1.0, num_inference_steps=250).images
-        >>> # Or use human-readable ImageNet labels (English)
-        >>> pipe.id2label[207]
-        >>> class_ids = pipe.get_label_ids("golden retriever")
-        >>> images = pipe(class_labels="golden retriever", classifier_guidance_scale=1.0).images
         ```
 """
-@dataclass
-class ADMPipelineOutput(BaseOutput):
-    """
-    Output class for ADM pipelines.
-    Args:
-        images (`torch.Tensor` or `list[PIL.Image.Image]` or `np.ndarray`):
-            Generated images of shape `(batch_size, num_channels, height, width)` when `output_type="pt"`,
-            or a list of PIL images / NumPy array when post-processed.
-    """
-    images: Union[torch.Tensor, List, np.ndarray]
 class ADMPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for image generation with ADM (Ablated Diffusion Model).
-    Supports class-conditional ADM (labels embedded in the UNet) and **ADM-G** (unconditional UNet + noisy
-    classifier guidance). For ADM-G, pass `classifier_guidance_scale > 0` and provide `class_labels`; the
-    optional `classifier` predicts `p(y | x_t)` and steers sampling.
-    Args:
-        unet ([`ADMUNet2DModel`]):
-            A UNet model to denoise image samples (typically unconditional for ADM-G).
-        scheduler ([`ADMScheduler`]):
-            A scheduler used with the UNet to denoise image samples.
-        classifier ([`ADMClassifierModel`], *optional*):
-            Noisy ImageNet classifier for ADM-G guidance.
-        id2label (`dict[int, str]`, *optional*):
-            ImageNet class id to English label mapping. Values may contain comma-separated synonyms.
-    """
     model_cpu_offload_seq = "classifier->unet"
     _optional_components = ["classifier"]
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path=None, subfolder=None, **kwargs):
-        """Load a self-contained variant folder locally or from the Hub.
-        Examples:
-            ADMPipeline.from_pretrained(".")
-            ADMPipeline.from_pretrained("./ADM-G-256")
-            ADMPipeline.from_pretrained("BiliSakura/ADM-diffusers", subfolder="ADM-G-512")
-        """
-        repo_root = Path(__file__).resolve().parent
-        if pretrained_model_name_or_path in (None, "", "."):
-            variant = repo_root
-        elif (
-            isinstance(pretrained_model_name_or_path, str)
-            and "/" in pretrained_model_name_or_path
-            and not Path(pretrained_model_name_or_path).exists()
-        ):
-            from huggingface_hub import snapshot_download
-            hub_kwargs = dict(kwargs.pop("hub_kwargs", {}))
-            if subfolder:
-                hub_kwargs.setdefault("allow_patterns", [f"{subfolder}/**"])
-            cache_dir = snapshot_download(pretrained_model_name_or_path, **hub_kwargs)
-            variant = Path(cache_dir) / subfolder if subfolder else Path(cache_dir)
-        else:
-            variant = Path(pretrained_model_name_or_path)
-            if not variant.is_absolute():
-                candidate = (Path.cwd() / variant).resolve()
-                variant = candidate if candidate.exists() else (repo_root / variant).resolve()
-            if subfolder:
-                variant = variant / subfolder
-        id2label_override = kwargs.pop("id2label", None)
-        model_kwargs = dict(kwargs)
-        inserted: List[str] = []
-        def _load_component(folder: str, module_name: str, class_name: str):
-            comp_dir = variant / folder
-            module_path = comp_dir / f"{module_name}.py"
-            has_weights = (comp_dir / "config.json").exists() or (comp_dir / "scheduler_config.json").exists()
-            if not module_path.exists() or not has_weights:
-                return None
-            comp_path = str(comp_dir)
-            if comp_path not in sys.path:
-                sys.path.insert(0, comp_path)
-                inserted.append(comp_path)
-            module = importlib.import_module(module_name)
-            component_cls = getattr(module, class_name)
-            return component_cls.from_pretrained(str(comp_dir), **model_kwargs)
-        try:
-            unet = _load_component("unet", "unet_adm", "ADMUNet2DModel")
-            scheduler = _load_component("scheduler", "scheduling_adm", "ADMScheduler")
-            classifier = _load_component("classifier", "classifier_adm", "ADMClassifierModel")
-            if scheduler is None:
-                sched_dir = variant / "scheduler"
-                if (sched_dir / "scheduling_adm.py").exists():
-                    sched_path = str(sched_dir)
-                    if sched_path not in sys.path:
-                        sys.path.insert(0, sched_path)
-                        inserted.append(sched_path)
-                    scheduler = importlib.import_module("scheduling_adm").ADMScheduler()
-            if unet is None and classifier is None:
-                raise ValueError(f"No loadable components found under {variant}")
-            id2label = id2label_override
-            if id2label is None:
-                model_index_path = variant / "model_index.json"
-                if model_index_path.exists():
-                    id2label = cls._read_id2label_from_model_index(model_index_path)
-            return cls(
-                unet=unet,
-                scheduler=scheduler,
-                classifier=classifier,
-                id2label=id2label,
-            )
-        finally:
-            for comp_path in inserted:
-                if comp_path in sys.path:
-                    sys.path.remove(comp_path)
     def __init__(
         self,
         unet,
-        scheduler,
-        classifier=None,
-        id2label: Optional[Dict[Union[int, str], str]] = None,
-    ):
         super().__init__()
         self.register_modules(unet=unet, scheduler=scheduler, classifier=classifier)
         self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)
-        self._id2label = self._normalize_id2label(id2label)
         self.labels = self._build_label2id(self._id2label)
     @staticmethod
-    def _normalize_id2label(id2label: Optional[Dict[Union[int, str], str]]) -> Dict[int, str]:
-        if not id2label:
-            return {}
-        return {int(key): value for key, value in id2label.items()}
-    @staticmethod
-    def _read_id2label_from_model_index(model_index_path: Path) -> Optional[Dict[int, str]]:
-        import json
-        raw = json.loads(model_index_path.read_text(encoding="utf-8"))
-        id2label = raw.get("id2label")
-        if not isinstance(id2label, dict):
-            return None
-        return {int(key): value for key, value in id2label.items()}
-    @staticmethod
-    def _build_label2id(id2label: dict[int, str]) -> dict[str, int]:
-        label2id: dict[str, int] = {}
         for class_id, value in id2label.items():
             for synonym in value.split(","):
                 synonym = synonym.strip()
@@ -212,153 +76,44 @@ class ADMPipeline(DiffusionPipeline):
         return dict(sorted(label2id.items()))
     @property
-    def id2label(self) -> dict[int, str]:
-        """ImageNet class id to English label string (comma-separated synonyms)."""
         return self._id2label
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:
-        r"""
-        Map ImageNet label strings to class ids.
-        Args:
-            label (`str` or `list[str]`):
-                One or more English ImageNet label strings matching a synonym in `id2label`.
-        Returns:
-            `list[int]`: Class ids for [`~ADMPipeline.__call__`].
-        """
-        label2id = self.labels
-        if not label2id:
-            raise ValueError("No English labels loaded. Ensure `id2label` exists in model_index.json.")
-        if isinstance(label, str):
-            label = [label]
-        missing = [item for item in label if item not in label2id]
         if missing:
-            preview = ", ".join(list(label2id.keys())[:8])
-            raise ValueError(
-                f"Unknown English label(s): {missing}. Example valid labels: {preview}, ..."
-            )
-        return [label2id[item] for item in label]
-    @property
-    def do_classifier_guidance(self) -> bool:
-        return self.classifier is not None and getattr(self, "_classifier_guidance_scale", 0.0) > 0
-    def _normalize_class_labels(
-        self,
-        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]],
-    ) -> Optional[Union[int, List[int], torch.Tensor]]:
-        if class_labels is None:
-            return None
-        if isinstance(class_labels, str):
-            return self.get_label_ids(class_labels)[0]
-        if isinstance(class_labels, list) and class_labels and isinstance(class_labels[0], str):
-            return self.get_label_ids(class_labels)
-        return class_labels
-    def check_inputs(
-        self,
-        class_labels: Optional[Union[int, str, List[Union[int, str]], torch.Tensor]],
-        height: Optional[int],
-        width: Optional[int],
-    ):
-        if class_labels is None and self.unet.config.class_cond:
-            raise ValueError("`class_labels` are required for class-conditional ADM checkpoints.")
-        if class_labels is not None and self.classifier is None and not self.unet.config.class_cond:
-            raise ValueError(
-                "This checkpoint is unconditional and has no classifier. Load an ADM-G repo with a "
-                "`classifier/` subfolder, or use a class-conditional UNet."
-            )
-        if height is not None and height % 8 != 0:
-            raise ValueError(f"`height` must be divisible by 8 but is {height}.")
-        if width is not None and width % 8 != 0:
-            raise ValueError(f"`width` must be divisible by 8 but is {width}.")
-    def _prepare_class_labels(
-        self,
-        class_labels: Optional[Union[int, List[int], torch.Tensor]],
-        batch_size: int,
-        device: torch.device,
-    ) -> Optional[torch.Tensor]:
-        if class_labels is None:
-            return None
-        if isinstance(class_labels, int):
-            class_labels = [class_labels]
-        if not torch.is_tensor(class_labels):
-            class_labels = torch.tensor(class_labels, device=device, dtype=torch.long)
-        else:
-            class_labels = class_labels.to(device=device, dtype=torch.long)
-        if class_labels.shape[0] != batch_size:
-            raise ValueError(
-                f"`class_labels` batch ({class_labels.shape[0]}) must match requested batch size ({batch_size})."
-            )
-        return class_labels
-    def _get_classifier_grad(
-        self,
-        sample: torch.Tensor,
-        timestep: torch.Tensor,
-        class_labels: torch.Tensor,
-        classifier_scale: float,
-    ) -> torch.Tensor:
-        return self.classifier.guidance_gradient(
-            sample,
-            timestep,
-            class_labels,
-            classifier_scale=classifier_scale,
-        )
-    def prepare_latents(
-        self,
-        batch_size: int,
-        num_channels: int,
-        height: int,
-        width: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        Prepare initial Gaussian noise for pixel-space sampling.
-        Args:
-            batch_size (`int`):
-                Number of images to generate.
-            num_channels (`int`):
-                Number of image channels (typically 3).
-            height (`int`):
-                Image height in pixels.
-            width (`int`):
-                Image width in pixels.
-            dtype (`torch.dtype`):
-                Data type for the latent tensor.
-            device (`torch.device`):
-                Target device.
-            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
-                RNG for deterministic sampling.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated noise tensor.
-        Returns:
-            `torch.Tensor`:
-                Initial noise of shape `(batch_size, num_channels, height, width)`.
-        """
-        shape = (batch_size, num_channels, height, width)
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device=device, dtype=dtype)
-        return latents
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
@@ -369,142 +124,130 @@ class ADMPipeline(DiffusionPipeline):
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 250,
-        use_ddim: bool = False,
         eta: float = 0.0,
         clip_denoised: bool = True,
-        classifier_guidance_scale: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
-    ) -> Union[ADMPipelineOutput, Tuple]:
         r"""
-        Generate images with ADM.
-        Args:
-            class_labels (`int`, `str`, `list[int]`, `list[str]`, or `torch.Tensor`, *optional*):
-                ImageNet class indices or English label strings. Required for class-conditional UNets and for ADM-G
-                classifier guidance. Strings are resolved via [`~ADMPipeline.get_label_ids`].
-            batch_size (`int`, *optional*, defaults to 1):
-                Number of images to generate when `class_labels` is not provided.
-            height (`int`, *optional*):
-                Height in pixels. Defaults to `unet.config.image_size`.
-            width (`int`, *optional*):
-                Width in pixels. Defaults to `unet.config.image_size`.
-            num_inference_steps (`int`, *optional*, defaults to 250):
-                Number of denoising steps.
-            use_ddim (`bool`, *optional*, defaults to `False`):
-                Use DDIM sampling instead of DDPM.
-            eta (`float`, *optional*, defaults to 0.0):
-                DDIM stochasticity parameter. Only used when `use_ddim=True`.
-            clip_denoised (`bool`, *optional*, defaults to `True`):
-                Clamp predicted `x_0` to `[-1, 1]` inside the scheduler.
-            classifier_guidance_scale (`float`, *optional*, defaults to 0.0):
-                ADM-G guidance strength. Values `> 0` require a loaded `classifier` (OpenAI `classifier_scale`).
-            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
-                RNG for reproducible generation.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated initial noise.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                Output format: `"pil"`, `"np"`, or `"pt"`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Return an [`ADMPipelineOutput`] instead of a tuple.
         Examples:
-        Returns:
-            [`ADMPipelineOutput`] or `tuple`:
-                Generated images.
         """
-        if height is None:
-            height = int(self.unet.config.image_size)
-        if width is None:
-            width = int(self.unet.config.image_size)
-        class_labels = self._normalize_class_labels(class_labels)
-        self.check_inputs(class_labels, height, width)
-        if classifier_guidance_scale > 0 and self.classifier is None:
-            raise ValueError("`classifier_guidance_scale > 0` requires a loaded `classifier` (ADM-G checkpoint).")
-        if classifier_guidance_scale > 0 and class_labels is None:
-            raise ValueError("`class_labels` are required when using classifier guidance.")
-        self._classifier_guidance_scale = classifier_guidance_scale
         device = self._execution_device
-        model_dtype = self.unet.dtype
         if class_labels is not None:
-            if isinstance(class_labels, int):
-                batch_size = 1
-            elif isinstance(class_labels, list):
-                batch_size = len(class_labels)
-            elif torch.is_tensor(class_labels):
-                batch_size = class_labels.shape[0]
-        class_labels = self._prepare_class_labels(class_labels, batch_size, device)
-        latents = self.prepare_latents(
-            batch_size,
-            3,
-            height,
-            width,
-            model_dtype,
-            device,
-            generator,
-            latents,
-        )
-        self.scheduler.set_timesteps(num_inference_steps, device=device, use_ddim=use_ddim)
-        self.scheduler._eta = eta
-        self._num_timesteps = len(self.scheduler.timesteps)
-        unet_class_labels = class_labels if self.unet.config.class_cond else None
-        for t in tqdm(self.scheduler.timesteps, desc="Denoising"):
-            timestep = torch.full((batch_size,), t, device=device, dtype=torch.long)
-            model_timesteps = self.scheduler.scale_timesteps_for_model(timestep)
-            model_output = self.unet(
-                latents,
-                model_timesteps,
-                class_labels=unet_class_labels,
-                return_dict=True,
-            ).sample
             cond_grad = None
-            if self.do_classifier_guidance:
-                cond_grad = self._get_classifier_grad(
-                    latents,
-                    timestep,
-                    class_labels,
-                    classifier_guidance_scale,
                 )
-            latents = self.scheduler.step(
-                model_output,
-                t,
-                latents,
-                generator=generator,
-                clip_denoised=clip_denoised,
-                eta=eta,
-                cond_grad=cond_grad,
-            ).prev_sample
-        image = latents
-        has_nsfw_concept = None
-        if output_type == "latent":
-            image = latents
-        elif output_type == "pt":
-            image = (image / 2 + 0.5).clamp(0, 1)
-        elif output_type in ("pil", "np"):
-            image = (image / 2 + 0.5).clamp(0, 1)
             image = self.image_processor.postprocess(image, output_type=output_type)
         self.maybe_free_model_hooks()
         if not return_dict:
-            return (image, has_nsfw_concept)
-        return ADMPipelineOutput(images=image)

 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 import torch
 from diffusers.image_processor import VaeImageProcessor
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.utils import replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 EXAMPLE_DOC_STRING = """
     Examples:
         ```py
+        >>> from pathlib import Path
         >>> import torch
         >>> from diffusers import DiffusionPipeline
+        >>> model_dir = Path("path/to/BiliSakura/ADM-diffusers/ADM-G-512")
+        >>> pipe = DiffusionPipeline.from_pretrained(
+        ...     str(model_dir),
+        ...     local_files_only=True,
+        ...     custom_pipeline=str(model_dir / "pipeline.py"),
+        ...     torch_dtype=torch.bfloat16,
+        ... )
+        >>> pipe = pipe.to("cuda")
+        >>> class_id = pipe.get_label_ids("golden retriever")[0]
+        >>> image = pipe(class_labels=class_id, guidance_scale=4.0).images[0]
         ```
 """
 class ADMPipeline(DiffusionPipeline):
+    r"""ADM/ADM-G pipeline compatible with Diffusers custom pipeline loading."""
     model_cpu_offload_seq = "classifier->unet"
     _optional_components = ["classifier"]
     def __init__(
         self,
         unet,
+        scheduler: KarrasDiffusionSchedulers,
+        classifier: Optional[Any] = None,  # optional component (see _optional_components); only needed for classifier guidance
+        id2label: Optional[Dict[str, str]] = None,  # class id -> comma-separated label text; keys are cast to int below
+        null_class_id: int = 1000,
+    ) -> None:
         super().__init__()
         self.register_modules(unet=unet, scheduler=scheduler, classifier=classifier)
+        self.register_to_config(null_class_id=int(null_class_id))  # record null_class_id in the pipeline config
         self.image_processor = VaeImageProcessor(vae_scale_factor=1, do_normalize=False)  # used by __call__ to postprocess "pil"/"np" outputs
+        self._id2label = {int(k): v for k, v in (id2label or {}).items()}  # tolerates id2label=None; normalises keys to int
         self.labels = self._build_label2id(self._id2label)  # reverse lookup (label text -> class id) read by get_label_ids
     @staticmethod
+    def _build_label2id(id2label: Dict[int, str]) -> Dict[str, int]:
+        label2id: Dict[str, int] = {}
         for class_id, value in id2label.items():
             for synonym in value.split(","):
                 synonym = synonym.strip()
         return dict(sorted(label2id.items()))
     @property
+    def id2label(self) -> Dict[int, str]:
         return self._id2label  # the int-keyed mapping built in __init__ (the same dict object, not a copy)
     def get_label_ids(self, label: Union[str, List[str]]) -> List[int]:  # resolve one label string or a list of them to class ids
+        if not self.labels:  # empty when the pipeline was built without an id2label mapping
+            raise ValueError("No id2label mapping is available in this checkpoint.")
+        labels = [label] if isinstance(label, str) else label  # a single string is treated as a one-element list
+        missing = [item for item in labels if item not in self.labels]  # exact-match lookup; no case folding or stripping here
         if missing:
+            preview = ", ".join(list(self.labels.keys())[:8])  # show the first 8 known labels as a hint
+            raise ValueError(f"Unknown labels: {missing}. Example valid labels: {preview}, ...")
+        return [self.labels[item] for item in labels]  # always a list, in the same order as the input
+    @staticmethod
+    def prepare_extra_step_kwargs(  # build the optional kwargs forwarded to scheduler.step(...)
+        scheduler: KarrasDiffusionSchedulers,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+        eta: float,
+    ) -> Dict[str, Any]:
+        kwargs: Dict[str, Any] = {}  # stays empty when the scheduler accepts neither argument
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())  # introspect what this scheduler's step() accepts
+        if "eta" in step_params:  # only pass eta to schedulers whose step() declares it
+            kwargs["eta"] = eta
+        if "generator" in step_params:  # likewise for the RNG
+            kwargs["generator"] = generator
+        return kwargs
+    @staticmethod
+    def _is_ddim_like(step_params: Set[str]) -> bool:  # step_params: parameter names of scheduler.step
+        return "eta" in step_params  # heuristic: any scheduler whose step() takes `eta` is treated as DDIM-style
+    @staticmethod
+    def _expand_timestep(timestep, batch: int, device: torch.device) -> torch.Tensor:  # broadcast one timestep to shape (batch,)
+        if not torch.is_tensor(timestep):  # plain Python number -> 1-element long tensor on `device`
+            timestep = torch.tensor([timestep], dtype=torch.long, device=device)
+        elif timestep.ndim == 0:  # 0-dim tensor -> add a leading axis and move to `device` (dtype left unchanged)
+            timestep = timestep[None].to(device=device)
+        return timestep.expand(batch)  # NOTE(review): tensors with ndim >= 1 reach here unchanged, i.e. they are not moved to `device`
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
         height: Optional[int] = None,
         width: Optional[int] = None,
         num_inference_steps: int = 250,
+        guidance_scale: float = 4.0,
+        classifier_guidance_scale: float = 0.0,
         eta: float = 0.0,
         clip_denoised: bool = True,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
         latents: Optional[torch.Tensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
+        Generate samples from the ADM/ADM-G checkpoint.
         Examples:
+            <!-- this section is replaced by replace_example_docstring -->
         """
+        # Stage 1: check inputs
+        if isinstance(class_labels, str):
+            class_labels = self.get_label_ids(class_labels)[0]
+        if isinstance(class_labels, list) and class_labels and isinstance(class_labels[0], str):
+            class_labels = self.get_label_ids(class_labels)
+        native_size = int(getattr(self.unet.config, "image_size", 256))
+        height = native_size if height is None else int(height)
+        width = native_size if width is None else int(width)
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"height and width must be divisible by 8, got ({height}, {width}).")
+        if output_type not in {"pil", "np", "pt", "latent"}:
+            raise ValueError(f"Unsupported output_type: {output_type}")
+        # This checkpoint does not use classifier-free guidance (CFG).
+        # Keep classifier_guidance_scale for compatibility, but treat guidance_scale
+        # as the primary classifier-guidance strength.
+        effective_classifier_guidance_scale = (
+            float(classifier_guidance_scale) if classifier_guidance_scale > 0 else float(guidance_scale)
+        )
+        if class_labels is None and (self.unet.config.class_cond or effective_classifier_guidance_scale > 0):
+            raise ValueError("class_labels are required for class-conditional sampling and ADM-G guidance.")
+        if isinstance(class_labels, int):
+            batch_size = 1
+            class_labels = [class_labels]
+        elif isinstance(class_labels, list):
+            batch_size = len(class_labels)
+        elif torch.is_tensor(class_labels):
+            batch_size = int(class_labels.shape[0])
+        # Stage 2: define call parameters
         device = self._execution_device
+        channels = int(getattr(self.unet.config, "in_channels", 3))
+        dtype = self.unet.dtype
+        # Stage 3: prepare class conditioning
+        class_tensor = None
+        class_input = None
         if class_labels is not None:
+            class_tensor = class_labels if torch.is_tensor(class_labels) else torch.tensor(class_labels, dtype=torch.long)
+            class_tensor = class_tensor.to(device=device, dtype=torch.long).reshape(-1)
+            if class_tensor.shape[0] != batch_size:
+                raise ValueError("class_labels batch must match requested batch_size")
+            if self.unet.config.class_cond:
+                class_input = class_tensor
+        # Stage 4: prepare timesteps
+        scheduler = self.scheduler
+        step_params = set(inspect.signature(scheduler.step).parameters.keys())
+        scheduler.set_timesteps(num_inference_steps, device=device)
+        # Stage 5: prepare latent variables
+        shape = (batch_size, channels, height, width)
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if tuple(latents.shape) != shape:
+                raise ValueError(f"Unexpected latents shape {tuple(latents.shape)}; expected {shape}.")
+            latents = latents.to(device=device, dtype=dtype)
+        latents = latents * scheduler.init_noise_sigma
+        # Stage 6: prepare extra step kwargs
+        extra_step_kwargs = self.prepare_extra_step_kwargs(scheduler, generator, eta)
+        # Stage 7: denoising loop
+        for timestep in self.progress_bar(scheduler.timesteps):
+            model_input = latents
+            model_input = scheduler.scale_model_input(model_input, timestep)
+            timestep_input = self._expand_timestep(timestep, model_input.shape[0], model_input.device)
+            model_output = self.unet(model_input, timestep_input, class_labels=class_input, return_dict=True).sample
             cond_grad = None
+            if effective_classifier_guidance_scale > 0:
+                if self.classifier is None or class_tensor is None:
+                    raise ValueError("guidance_scale requires both classifier and class_labels.")
+                grad_t = self._expand_timestep(timestep, batch_size, latents.device)
+                cond_grad = self.classifier.guidance_gradient(
+                    latents, grad_t, class_tensor, classifier_scale=effective_classifier_guidance_scale
                 )
+            step_model_output = model_output
+            if cond_grad is not None:
+                if self._is_ddim_like(step_params):
+                    eps = model_output[:, :channels] if model_output.shape[1] == 2 * channels else model_output
+                    alpha_bar_t = scheduler.alphas_cumprod[timestep].to(device=latents.device, dtype=latents.dtype)
+                    step_model_output = eps - (1 - alpha_bar_t).sqrt() * cond_grad
+                elif hasattr(scheduler, "_get_variance"):
+                    pred_var = None
+                    if model_output.shape[1] == 2 * channels:
+                        _, pred_var = torch.split(model_output, channels, dim=1)
+                    variance = scheduler._get_variance(int(timestep), predicted_variance=pred_var)
+                    if scheduler.config.variance_type == "learned_range":
+                        variance = torch.exp(variance)
+                    latents = latents + variance * cond_grad
+                else:
+                    raise ValueError(
+                        "guidance_scale is not supported for the current scheduler. "
+                        "Use a DDPM/DDIM-compatible scheduler or disable classifier guidance."
+                    )
+            latents = scheduler.step(step_model_output, timestep, latents, return_dict=True, **extra_step_kwargs).prev_sample
+        image = latents if output_type == "latent" else (latents / 2 + 0.5).clamp(0, 1)
+        if output_type in {"pil", "np"}:
             image = self.image_processor.postprocess(image, output_type=output_type)
         self.maybe_free_model_hooks()
         if not return_dict:
+            return (image,)
+        return ImagePipelineOutput(images=image)

ADM-G-512/scheduler/scheduler_config.json CHANGED Viewed

@@ -1,11 +1,12 @@
 {
-  "_class_name": "ADMScheduler",
   "_diffusers_version": "0.36.0",
-  "learn_sigma": true,
-  "noise_schedule": "linear",
-  "predict_xstart": false,
-  "rescale_timesteps": false,
-  "sigma_small": false,
-  "steps": 1000,
-  "timestep_respacing": ""
 }

 {
+  "_class_name": "DDPMScheduler",
   "_diffusers_version": "0.36.0",
+  "num_train_timesteps": 1000,
+  "beta_start": 0.0001,
+  "beta_end": 0.02,
+  "beta_schedule": "linear",
+  "prediction_type": "epsilon",
+  "variance_type": "learned_range",
+  "clip_sample": true,
+  "timestep_spacing": "leading"
 }

ADM-G-512/unet/modeling_adm.py CHANGED Viewed

@@ -37,7 +37,10 @@ def avg_pool_nd(dims: int, *args, **kwargs):
 class GroupNorm32(nn.GroupNorm):
     def forward(self, x):
-        return super().forward(x.float()).type(x.dtype)
 def normalization(channels: int):
@@ -475,19 +478,20 @@ class EncoderUNetModel(nn.Module):
         self.middle_block.apply(convert_module_to_f32)
     def forward(self, x, timesteps):
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
         results = []
-        h = x.type(self.dtype)
         for module in self.input_blocks:
             h = module(h, emb)
             if self.pool.startswith("spatial"):
-                results.append(h.type(x.dtype).mean(dim=(2, 3)))
         h = self.middle_block(h, emb)
         if self.pool.startswith("spatial"):
-            results.append(h.type(x.dtype).mean(dim=(2, 3)))
             h = torch.cat(results, dim=-1)
             return self.out(h)
-        h = h.type(x.dtype)
         return self.out(h)
@@ -673,12 +677,13 @@ class UNetModel(nn.Module):
     def forward(self, x, timesteps, y: Optional[torch.Tensor] = None):
         assert (y is not None) == (self.num_classes is not None)
         hs = []
-        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
         if self.num_classes is not None:
             assert y.shape == (x.shape[0],)
             emb = emb + self.label_emb(y)
-        h = x.type(self.dtype)
         for module in self.input_blocks:
             h = module(h, emb)
             hs.append(h)
@@ -686,7 +691,7 @@ class UNetModel(nn.Module):
         for module in self.output_blocks:
             h = torch.cat([h, hs.pop()], dim=1)
             h = module(h, emb)
-        h = h.type(x.dtype)
         return self.out(h)

 class GroupNorm32(nn.GroupNorm):
     def forward(self, x):
+        weight = self.weight.float() if self.weight is not None else None  # cast the affine scale to float32 when present
+        bias = self.bias.float() if self.bias is not None else None  # cast the affine shift to float32 when present
+        y = F.group_norm(x.float(), self.num_groups, weight, bias, self.eps)  # normalise in float32 whatever the input/parameter dtype
+        return y.to(dtype=x.dtype)  # hand the result back in the caller's dtype
 def normalization(channels: int):
         self.middle_block.apply(convert_module_to_f32)
     def forward(self, x, timesteps):
+        emb = timestep_embedding(timesteps, self.model_channels).to(dtype=self.time_embed[0].weight.dtype)
+        emb = self.time_embed(emb)
         results = []
+        h = x.to(dtype=self.time_embed[0].weight.dtype)
         for module in self.input_blocks:
             h = module(h, emb)
             if self.pool.startswith("spatial"):
+                results.append(h.to(dtype=self.time_embed[0].weight.dtype).mean(dim=(2, 3)))
         h = self.middle_block(h, emb)
         if self.pool.startswith("spatial"):
+            results.append(h.to(dtype=self.time_embed[0].weight.dtype).mean(dim=(2, 3)))
             h = torch.cat(results, dim=-1)
             return self.out(h)
+        h = h.to(dtype=self.time_embed[0].weight.dtype)
         return self.out(h)
     def forward(self, x, timesteps, y: Optional[torch.Tensor] = None):
         assert (y is not None) == (self.num_classes is not None)
         hs = []
+        emb = timestep_embedding(timesteps, self.model_channels).to(dtype=self.time_embed[0].weight.dtype)
+        emb = self.time_embed(emb)
         if self.num_classes is not None:
             assert y.shape == (x.shape[0],)
             emb = emb + self.label_emb(y)
+        h = x.to(dtype=self.time_embed[0].weight.dtype)
         for module in self.input_blocks:
             h = module(h, emb)
             hs.append(h)
         for module in self.output_blocks:
             h = torch.cat([h, hs.pop()], dim=1)
             h = module(h, emb)
+        h = h.to(dtype=self.time_embed[0].weight.dtype)
         return self.out(h)

ADM-G-512/unet/unet_adm.py CHANGED Viewed

@@ -12,7 +12,12 @@ from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
-from modeling_adm import create_adm_unet_model
 @dataclass

 from diffusers.models.modeling_utils import ModelMixin
 from diffusers.utils import BaseOutput
+try:
+    from .modeling_adm import create_adm_unet_model
+except ImportError:
+    import importlib
+    create_adm_unet_model = importlib.import_module("modeling_adm").create_adm_unet_model
 @dataclass

README.md CHANGED Viewed

@@ -28,7 +28,7 @@ This Hugging Face repo hosts **multiple self-contained checkpoints as subfolders
 ## Available checkpoints
-| Subfolder | Resolution | Classifier scale | OpenAI sources |
 | --- | --- | ---: | --- |
 | [`ADM-G-256/`](ADM-G-256/) | 256×256 | 1.0 | `256x256_diffusion.pt` + `256x256_classifier.pt` |
 | [`ADM-G-512/`](ADM-G-512/) | 512×512 | 4.0 | `512x512_diffusion.pt` + `512x512_classifier.pt` |
@@ -50,52 +50,33 @@ Chinese labels are still preserved in the main source repo under `src/labels/id2
 ![ADM-G-512 demo](ADM-G-512/demo.png)
-## Load from Hugging Face
 ```python
-import sys
 from pathlib import Path
 import torch
-from huggingface_hub import snapshot_download
-repo_dir = Path(snapshot_download("BiliSakura/ADM-diffusers"))
-variant = "ADM-G-512"  # or "ADM-G-256"
-sys.path.insert(0, str(repo_dir / variant))
-from pipeline import ADMPipeline
-pipe = ADMPipeline.from_pretrained(".")
-pipe.to("cuda")
-pipe.unet.float()
-pipe.classifier.float()
-pipe.classifier.model.dtype = torch.float32
-images = pipe(
-    class_labels=207,
-    num_inference_steps=250,
-    classifier_guidance_scale=4.0 if variant == "ADM-G-512" else 1.0,
-).images
-# Human-readable ImageNet labels (English)
-print(pipe.id2label[207])          # "golden retriever"
-pipe.get_label_ids("golden retriever")  # [207]
-images = pipe(class_labels="golden retriever", classifier_guidance_scale=1.0).images
-```
-## Load from a local clone
-```python
-import sys
-from pathlib import Path
-repo = Path("BiliSakura/ADM-diffusers").resolve()
-variant = "ADM-G-256"
-sys.path.insert(0, str(repo / variant))
-from pipeline import ADMPipeline
-pipe = ADMPipeline.from_pretrained(".")
-pipe.to("cuda")
 ```
 ## Repo layout

 ## Available checkpoints
+| Subfolder | Resolution | Guidance scale | OpenAI sources |
 | --- | --- | ---: | --- |
 | [`ADM-G-256/`](ADM-G-256/) | 256×256 | 1.0 | `256x256_diffusion.pt` + `256x256_classifier.pt` |
 | [`ADM-G-512/`](ADM-G-512/) | 512×512 | 4.0 | `512x512_diffusion.pt` + `512x512_classifier.pt` |
 ![ADM-G-512 demo](ADM-G-512/demo.png)
+Settings used for this demo image: `ADM-G-512`, `DDIMScheduler`, `num_inference_steps=50`, `guidance_scale=4.0`, `seed=42`, class `"golden retriever"`.
 ```python
 from pathlib import Path
 import torch
+from diffusers import DDIMScheduler, DiffusionPipeline
+model_dir = Path("./BiliSakura/ADM-diffusers/ADM-G-512")
+pipe = DiffusionPipeline.from_pretrained(
+    str(model_dir),
+    local_files_only=True,
+    custom_pipeline=str(model_dir / "pipeline.py"),
+    torch_dtype=torch.bfloat16,
+)
+pipe = pipe.to("cuda")
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+class_id = pipe.get_label_ids("golden retriever")[0]
+generator = torch.Generator(device="cuda").manual_seed(42)
+out = pipe(
+    class_labels=class_id,
+    guidance_scale=4.0,
+    num_inference_steps=50,
+    generator=generator,
+).images[0]
+out
 ```
 ## Repo layout