BiliSakura commited on
Commit
2ccd4c6
·
verified ·
1 Parent(s): 4ad5f15

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,3 +1,87 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DiffusionSat Custom Pipelines
2
+
3
+ Custom community pipelines for loading DiffusionSat checkpoints directly with `diffusers.DiffusionPipeline.from_pretrained()`.
4
+
5
+ > See [Diffusers Community Pipeline Documentation](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview)
6
+
7
+ ## Available Pipelines
8
+
9
+ This directory contains two custom pipelines:
10
+
11
+ 1. **`pipeline_diffusionsat.py`**: Standard text-to-image pipeline with DiffusionSat metadata support.
12
+ 2. **`pipeline_diffusionsat_controlnet.py`**: ControlNet pipeline with DiffusionSat metadata and conditional metadata support.
13
+
14
+ ## Setup
15
+
16
+ The checkpoint folder (`ckpt/diffusionsat/`) should contain the standard diffusers components (unet, vae, scheduler, etc.). You can reference these pipeline files directly from this directory or copy them to your checkpoint folder.
17
+
18
+ ## Usage
19
+
20
+ ### 1. Text-to-Image Pipeline
21
+
22
+ Use `pipeline_diffusionsat.py` for standard generation.
23
+
24
+ ```python
25
+ import torch
26
+ from diffusers import DiffusionPipeline
27
+
28
+ # Load pipeline
29
+ pipe = DiffusionPipeline.from_pretrained(
30
+ "path/to/ckpt/diffusionsat",
31
+ custom_pipeline="./custom_pipelines/pipeline_diffusionsat.py", # Path to this file
32
+ torch_dtype=torch.float16,
33
+ trust_remote_code=True,
34
+ )
35
+ pipe = pipe.to("cuda")
36
+
37
+ # Optional: Metadata (normalized lat, lon, timestamp, GSD, etc.)
38
+ # metadata = [0.5, -0.3, 0.7, 0.2, 0.1, 0.0, 0.5]
39
+
40
+ # Generate
41
+ image = pipe(
42
+ "satellite image of farmland",
43
+ metadata=None, # Optional
44
+ num_inference_steps=30,
45
+ ).images[0]
46
+ ```
47
+
48
+ ### 2. ControlNet Pipeline
49
+
50
+ Use `pipeline_diffusionsat_controlnet.py` for ControlNet generation.
51
+
52
+ ```python
53
+ import torch
54
+ from diffusers import DiffusionPipeline, ControlNetModel
55
+ from diffusers.utils import load_image
56
+
57
+ # 1. Load ControlNet
58
+ controlnet = ControlNetModel.from_pretrained(
59
+ "path/to/ckpt/diffusionsat/controlnet",
60
+ torch_dtype=torch.float16
61
+ )
62
+
63
+ # 2. Load Pipeline with ControlNet
64
+ pipe = DiffusionPipeline.from_pretrained(
65
+ "path/to/ckpt/diffusionsat",
66
+ controlnet=controlnet,
67
+ custom_pipeline="./custom_pipelines/pipeline_diffusionsat_controlnet.py", # Path to this file
68
+ torch_dtype=torch.float16,
69
+ trust_remote_code=True,
70
+ )
71
+ pipe = pipe.to("cuda")
72
+
73
+ # 3. Prepare Control Image
74
+ control_image = load_image("path/to/conditioning_image.png")
75
+
76
+ # 4. Generate
77
+ # metadata: Target image metadata (optional)
78
+ # cond_metadata: Conditioning image metadata (optional)
79
+
80
+ image = pipe(
81
+ "satellite image of farmland",
82
+ image=control_image,
83
+ metadata=None,
84
+ cond_metadata=None,
85
+ num_inference_steps=30,
86
+ ).images[0]
87
+ ```
config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "ControlNetModel",
3
+ "_diffusers_version": "0.17.0",
4
+ "_name_or_path": "/data/jiabo/diffusionsat/testoutput/checkpoint-1",
5
+ "act_fn": "silu",
6
+ "attention_head_dim": [
7
+ 5,
8
+ 10,
9
+ 20,
10
+ 20
11
+ ],
12
+ "block_out_channels": [
13
+ 320,
14
+ 640,
15
+ 1280,
16
+ 1280
17
+ ],
18
+ "class_embed_type": null,
19
+ "conditioning_embedding_out_channels": [
20
+ 16,
21
+ 32,
22
+ 96,
23
+ 256
24
+ ],
25
+ "conditioning_in_channels": 3,
26
+ "conditioning_scale": 1,
27
+ "controlnet_conditioning_channel_order": "rgb",
28
+ "cross_attention_dim": 1024,
29
+ "down_block_types": [
30
+ "CrossAttnDownBlock2D",
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "DownBlock2D"
34
+ ],
35
+ "downsample_padding": 1,
36
+ "flip_sin_to_cos": true,
37
+ "freq_shift": 0,
38
+ "global_pool_conditions": false,
39
+ "in_channels": 4,
40
+ "layers_per_block": 2,
41
+ "mid_block_scale_factor": 1,
42
+ "norm_eps": 1e-05,
43
+ "norm_num_groups": 32,
44
+ "num_class_embeds": null,
45
+ "num_metadata": 7,
46
+ "only_cross_attention": false,
47
+ "projection_class_embeddings_input_dim": null,
48
+ "resnet_time_scale_shift": "default",
49
+ "upcast_attention": true,
50
+ "use_linear_projection": true,
51
+ "use_metadata": true
52
+ }
controlnet/config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": ["controlnet", "ControlNetModel"],
3
+ "_diffusers_version": "0.17.0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": [
6
+ 5,
7
+ 10,
8
+ 20,
9
+ 20
10
+ ],
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280,
15
+ 1280
16
+ ],
17
+ "class_embed_type": null,
18
+ "conditioning_embedding_out_channels": [
19
+ 16,
20
+ 32,
21
+ 96,
22
+ 256
23
+ ],
24
+ "conditioning_in_channels": 3,
25
+ "conditioning_scale": 1,
26
+ "controlnet_conditioning_channel_order": "rgb",
27
+ "cross_attention_dim": 1024,
28
+ "down_block_types": [
29
+ "CrossAttnDownBlock2D",
30
+ "CrossAttnDownBlock2D",
31
+ "CrossAttnDownBlock2D",
32
+ "DownBlock2D"
33
+ ],
34
+ "downsample_padding": 1,
35
+ "flip_sin_to_cos": true,
36
+ "freq_shift": 0,
37
+ "global_pool_conditions": false,
38
+ "in_channels": 4,
39
+ "layers_per_block": 2,
40
+ "mid_block_scale_factor": 1,
41
+ "norm_eps": 1e-05,
42
+ "norm_num_groups": 32,
43
+ "num_class_embeds": null,
44
+ "num_metadata": 7,
45
+ "only_cross_attention": false,
46
+ "projection_class_embeddings_input_dim": null,
47
+ "resnet_time_scale_shift": "default",
48
+ "upcast_attention": true,
49
+ "use_linear_projection": true,
50
+ "use_metadata": true
51
+ }
controlnet/controlnet.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ControlNet wrapper that reuses diffusers implementation and adds metadata."""
2
+ from typing import Any, Dict, Optional, Tuple, Union
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from diffusers.models.controlnets.controlnet import (
9
+ ControlNetConditioningEmbedding as HFConditioningEmbedding,
10
+ ControlNetModel as HFControlNetModel,
11
+ ControlNetOutput,
12
+ zero_module,
13
+ )
14
+ from diffusers.utils import logging
15
+
16
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
17
+
18
+
19
class ControlNetConditioningEmbedding(HFConditioningEmbedding):
    """Adapter to allow variable downsample stride via `scale` while reusing upstream layers.

    When ``scale == 1`` this is byte-for-byte the upstream embedding. Otherwise the
    paired conv blocks are rebuilt so that some stages use stride 1 instead of the
    upstream's fixed stride-2 downsampling; ``conv_in`` and ``conv_out`` from the
    base initialization are kept unchanged.
    """

    def __init__(
        self,
        conditioning_embedding_channels: int,
        conditioning_channels: int = 3,
        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
        scale: int = 1,
    ):
        # Initialize base, then optionally override blocks to respect custom stride.
        super().__init__(
            conditioning_embedding_channels=conditioning_embedding_channels,
            conditioning_channels=conditioning_channels,
            block_out_channels=block_out_channels,
        )
        if scale != 1:
            blocks = nn.ModuleList([])
            current_scale = scale
            for i in range(len(block_out_channels) - 1):
                channel_in = block_out_channels[i]
                channel_out = block_out_channels[i + 1]
                # Channel-preserving 3x3 conv, mirroring the upstream block pair layout.
                blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
                # NOTE(review): stages downsample (stride 2) while the accumulated scale
                # is below 8 and stop downsampling once it reaches 8 — presumably so the
                # embedding's total downsample factor matches the VAE's /8 regardless of
                # the starting `scale`. Confirm against the DiffusionSat training code.
                stride = 2 if current_scale < 8 else 1
                blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=stride))
                if current_scale != 8:
                    current_scale = int(current_scale * 2)
            # Swap in the rebuilt stack; parameters of the discarded upstream blocks
            # are simply dropped (they were only created during super().__init__).
            self.blocks = blocks
47
+
48
+
49
class ControlNetModel(HFControlNetModel):
    """Thin wrapper around `diffusers.ControlNetModel` with metadata embeddings.

    DiffusionSat extends the stock ControlNet in two ways:

    * ``metadata_embedding`` — one ``TimestepEmbedding`` per metadata scalar;
      their outputs are summed and added to the timestep embedding in ``forward``.
    * when ``conditioning_scale != 1`` the conditioning embedding is replaced by
      the local :class:`ControlNetConditioningEmbedding` so the downsample stride
      pattern can differ from upstream.
    """

    def __init__(
        self,
        *args,
        conditioning_in_channels: int = 3,
        conditioning_scale: int = 1,
        use_metadata: bool = True,
        num_metadata: int = 7,
        **kwargs,
    ):
        # Map alias to upstream argument (`conditioning_in_channels` is the name
        # used in DiffusionSat configs, `conditioning_channels` upstream).
        kwargs.setdefault("conditioning_channels", conditioning_in_channels)

        super().__init__(*args, **kwargs)

        # Track custom config entries for save/load parity.
        self.register_to_config(
            use_metadata=use_metadata, num_metadata=num_metadata, conditioning_scale=conditioning_scale
        )

        self.use_metadata = use_metadata
        self.num_metadata = num_metadata

        if use_metadata:
            # Reuse the dimensions of the existing time embedding MLP so each
            # metadata scalar is embedded exactly like a timestep.
            timestep_input_dim = self.time_embedding.linear_1.in_features
            time_embed_dim = self.time_embedding.linear_2.out_features
            self.metadata_embedding = nn.ModuleList(
                [
                    self._build_metadata_embedding(timestep_input_dim, time_embed_dim)
                    for _ in range(num_metadata)
                ]
            )
        else:
            self.metadata_embedding = None

        # Optionally replace conditioning embedding to honor `conditioning_scale` stride tweaks.
        if conditioning_scale != 1:
            # BUG FIX: upstream `blocks` alternates (channel-preserving conv,
            # downsampling conv), so `blocks[1::2]` yields only
            # block_out_channels[1:]. The first width — which equals
            # `conv_in.out_channels` — was previously dropped, shifting every
            # channel of the rebuilt embedding. Prepend it explicitly.
            self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
                conditioning_embedding_channels=self.controlnet_cond_embedding.conv_out.out_channels,
                conditioning_channels=conditioning_in_channels,
                block_out_channels=(
                    self.controlnet_cond_embedding.conv_in.out_channels,
                    *(layer.out_channels for layer in self.controlnet_cond_embedding.blocks[1::2]),
                ),
                scale=conditioning_scale,
            )

    @staticmethod
    def _build_metadata_embedding(timestep_input_dim: int, time_embed_dim: int) -> nn.Module:
        """Build one per-scalar metadata embedding MLP (same shape as the time embedding)."""
        from diffusers.models.embeddings import TimestepEmbedding

        return TimestepEmbedding(timestep_input_dim, time_embed_dim)

    def _encode_metadata(
        self, metadata: Optional[torch.Tensor], dtype: torch.dtype
    ) -> Optional[torch.Tensor]:
        """Project `(batch, num_metadata)` scalars to a summed embedding vector.

        Returns ``None`` when the model was built without metadata support.
        Raises ``ValueError`` if metadata is required but missing or mis-shaped.
        """
        if self.metadata_embedding is None:
            return None
        if metadata is None:
            raise ValueError("metadata must be provided when use_metadata=True")
        if metadata.dim() != 2 or metadata.shape[1] != self.num_metadata:
            raise ValueError(f"Invalid metadata shape {metadata.shape}, expected (batch, {self.num_metadata})")

        md_bsz = metadata.shape[0]
        # Sinusoidal projection of every scalar at once, then split per metadata slot.
        projected = self.time_proj(metadata.view(-1)).view(md_bsz, self.num_metadata, -1).to(dtype=dtype)

        # Each slot has its own MLP; the per-slot embeddings are summed.
        md_emb = projected.new_zeros((md_bsz, projected.shape[-1]))
        for idx, md_embed in enumerate(self.metadata_embedding):
            md_emb = md_emb + md_embed(projected[:, idx, :])
        return md_emb

    def forward(
        self,
        sample: torch.Tensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        controlnet_cond: torch.Tensor,
        conditioning_scale: float = 1.0,
        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guess_mode: bool = False,
        metadata: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[ControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]:
        """Upstream `ControlNetModel.forward` with metadata added to the timestep embedding.

        Args mirror the upstream signature; `metadata` is the only addition and is
        required (non-None) when the model was built with ``use_metadata=True``.
        """
        # Start from upstream logic, inserting metadata into the timestep embeddings.

        channel_order = self.config.controlnet_conditioning_channel_order
        if channel_order == "bgr":
            controlnet_cond = torch.flip(controlnet_cond, dims=[1])
        elif channel_order != "rgb":
            raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")

        # Convert a 0/1 mask to an additive attention bias.
        if attention_mask is not None:
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # Broadcast scalar / 0-dim timesteps to a per-sample 1-D tensor.
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # mps/npu lack float64/int64 kernels, hence the narrower dtypes.
            is_mps = sample.device.type == "mps"
            is_npu = sample.device.type == "npu"
            if isinstance(timestep, float):
                dtype = torch.float32 if (is_mps or is_npu) else torch.float64
            else:
                dtype = torch.int32 if (is_mps or is_npu) else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps).to(dtype=sample.dtype)
        emb = self.time_embedding(t_emb, timestep_cond)

        # NOTE(review): `get_class_embed` / `get_aug_embed` must exist on the
        # installed diffusers `ControlNetModel`; confirm with the pinned version.
        class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
        if class_emb is not None:
            if self.config.class_embed_type == "timestep":
                class_emb = class_emb.to(dtype=sample.dtype)
            emb = emb + class_emb

        aug_emb = self.get_aug_embed(
            emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs or {}
        )
        if aug_emb is not None:
            emb = emb + aug_emb

        # DiffusionSat addition: fold metadata into the conditioning embedding.
        md_emb = self._encode_metadata(metadata=metadata, dtype=sample.dtype)
        if md_emb is not None:
            emb = emb + md_emb

        sample = self.conv_in(sample)
        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
        sample = sample + controlnet_cond

        # Down blocks, collecting residuals for the paired UNet.
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
            down_block_res_samples += res_samples

        if self.mid_block is not None:
            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
                sample = self.mid_block(
                    sample,
                    emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                )
            else:
                sample = self.mid_block(sample, emb)

        # Zero-initialized projection convs on every residual.
        controlnet_down_block_res_samples = ()
        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
            down_block_res_sample = controlnet_block(down_block_res_sample)
            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
        down_block_res_samples = controlnet_down_block_res_samples

        mid_block_res_sample = self.controlnet_mid_block(sample)

        if guess_mode and not self.config.global_pool_conditions:
            # Guess mode: exponentially increasing scales, strongest at the mid block.
            scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) * conditioning_scale
            down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
            mid_block_res_sample = mid_block_res_sample * scales[-1]
        else:
            down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
            mid_block_res_sample = mid_block_res_sample * conditioning_scale

        if self.config.global_pool_conditions:
            down_block_res_samples = [
                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
            ]
            mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)

        if not return_dict:
            return (down_block_res_samples, mid_block_res_sample)

        return ControlNetOutput(
            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
        )
controlnet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bd5f6b9aea04714f331cd94d721c8adb8b378a2774a9805e6f0a369e33aacd7
3
+ size 1514372328
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "feature_extractor_type": "CLIPFeatureExtractor",
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "CLIPImageProcessor",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 224
27
+ }
28
+ }
model_index.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": ["pipeline_diffusionsat_controlnet", "DiffusionSatControlNetPipeline"],
3
+ "_diffusers_version": "0.17.0",
4
+ "controlnet": [
5
+ "controlnet",
6
+ "ControlNetModel"
7
+ ],
8
+ "feature_extractor": [
9
+ "transformers",
10
+ "CLIPImageProcessor"
11
+ ],
12
+ "requires_safety_checker": false,
13
+ "safety_checker": [
14
+ null,
15
+ null
16
+ ],
17
+ "scheduler": [
18
+ "diffusers",
19
+ "DDIMScheduler"
20
+ ],
21
+ "text_encoder": [
22
+ "transformers",
23
+ "CLIPTextModel"
24
+ ],
25
+ "tokenizer": [
26
+ "transformers",
27
+ "CLIPTokenizer"
28
+ ],
29
+ "unet": [
30
+ "sat_unet",
31
+ "SatUNet"
32
+ ],
33
+ "vae": [
34
+ "diffusers",
35
+ "AutoencoderKL"
36
+ ]
37
+ }
pipeline_diffusionsat.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Self-contained DiffusionSat text-to-image pipeline that can be loaded directly
3
+ from the checkpoint folder without importing the project package.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Any, Callable, Dict, List, Optional, Union
9
+
10
+ import torch
11
+ from packaging import version
12
+ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
13
+
14
+ from diffusers.configuration_utils import FrozenDict
15
+ from diffusers.models import AutoencoderKL
16
+ from diffusers.schedulers import KarrasDiffusionSchedulers
17
+ from diffusers.utils import (
18
+ deprecate,
19
+ logging,
20
+ randn_tensor,
21
+ replace_example_docstring,
22
+ is_accelerate_available,
23
+ )
24
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
25
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
26
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
27
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
28
+ StableDiffusionPipeline as DiffusersStableDiffusionPipeline,
29
+ )
30
+
31
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
32
+
33
+ EXAMPLE_DOC_STRING = """
34
+ Examples:
35
+ ```py
36
+ >>> import torch
37
+ >>> from diffusers import DiffusionPipeline
38
+
39
+ >>> pipe = DiffusionPipeline.from_pretrained("path/to/ckpt/diffusionsat", torch_dtype=torch.float16)
40
+ >>> pipe = pipe.to("cuda")
41
+
42
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
43
+ >>> image = pipe(prompt).images[0]
44
+ ```
45
+ """
46
+
47
+
48
class DiffusionSatPipeline(DiffusionPipeline):
    """
    Pipeline for text-to-image generation using the DiffusionSat UNet with optional metadata.

    Mirrors `diffusers`' StableDiffusionPipeline but forwards an extra `metadata`
    tensor to the UNet on every denoising step (zero-filled when the caller
    supplies none), as expected by DiffusionSat checkpoints.
    """

    # Components allowed to be None when loading (e.g. `safety_checker=None`).
    _optional_components = ["safety_checker", "feature_extractor"]

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: Any,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
        requires_safety_checker: bool = True,
    ):
        super().__init__()

        # Legacy-config fixup: old scheduler configs shipped `steps_offset != 1`.
        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
            deprecation_message = (
                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
                " file"
            )
            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
            new_config = dict(scheduler.config)
            new_config["steps_offset"] = 1
            scheduler._internal_dict = FrozenDict(new_config)

        # Legacy-config fixup: `clip_sample` must be False for SD-style latents.
        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
            deprecation_message = (
                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
            )
            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
            new_config = dict(scheduler.config)
            new_config["clip_sample"] = False
            scheduler._internal_dict = FrozenDict(new_config)

        if safety_checker is None and requires_safety_checker:
            logger.warning(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
            )

        if safety_checker is not None and feature_extractor is None:
            # BUG FIX: this message was a plain string, so "{self.__class__}"
            # was printed literally; it is now an f-string as intended.
            raise ValueError(
                f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
            )

        # Legacy-config fixup: pre-0.9.0 SD checkpoints stored `sample_size < 64`.
        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
            version.parse(unet.config._diffusers_version).base_version
        ) < version.parse("0.9.0.dev0")
        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
            deprecation_message = (
                "The configuration file of the unet has set the default `sample_size` to smaller than"
                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
                " in the config might lead to incorrect results in future versions. If you have downloaded this"
                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
                " the `unet/config.json` file"
            )
            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
            new_config = dict(unet.config)
            new_config["sample_size"] = 64
            unet._internal_dict = FrozenDict(new_config)

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.register_to_config(requires_safety_checker=requires_safety_checker)

    # Borrow helper implementations from diffusers' StableDiffusionPipeline for convenience.
    enable_vae_slicing = DiffusersStableDiffusionPipeline.enable_vae_slicing
    disable_vae_slicing = DiffusersStableDiffusionPipeline.disable_vae_slicing
    enable_sequential_cpu_offload = DiffusersStableDiffusionPipeline.enable_sequential_cpu_offload
    _execution_device = DiffusersStableDiffusionPipeline._execution_device
    _encode_prompt = DiffusersStableDiffusionPipeline._encode_prompt
    run_safety_checker = DiffusersStableDiffusionPipeline.run_safety_checker
    decode_latents = DiffusersStableDiffusionPipeline.decode_latents
    prepare_extra_step_kwargs = DiffusersStableDiffusionPipeline.prepare_extra_step_kwargs
    check_inputs = DiffusersStableDiffusionPipeline.check_inputs
    prepare_latents = DiffusersStableDiffusionPipeline.prepare_latents

    def prepare_metadata(
        self, batch_size, metadata, do_classifier_free_guidance, device, dtype,
    ):
        """Normalize user-supplied metadata into the tensor the UNet expects.

        Args:
            batch_size: Final latent batch size (prompts * images-per-prompt),
                before classifier-free-guidance duplication.
            metadata: ``None``, a flat list / 1-D tensor shared by the whole
                batch, or a ``(rows, num_metadata)`` tensor (one row per prompt).
            do_classifier_free_guidance: Whether the batch is doubled for CFG.
            device: Target device.
            dtype: Target dtype (normally ``prompt_embeds.dtype``).

        Returns:
            A ``(batch, num_metadata)`` tensor — prefixed with a zero
            "unconditional" half when CFG is enabled — or ``None`` when the
            UNet takes no metadata.
        """
        has_metadata = getattr(self.unet.config, "use_metadata", False)
        num_metadata = getattr(self.unet.config, "num_metadata", 0)

        # Auto-fill zeros when the UNet requires metadata but none was given.
        if metadata is None and has_metadata and num_metadata > 0:
            metadata = torch.zeros((batch_size, num_metadata), device=device, dtype=dtype)

        if metadata is None:
            return None

        md = torch.tensor(metadata) if not torch.is_tensor(metadata) else metadata
        if len(md.shape) == 1:
            # Single metadata vector: share it across the whole batch.
            md = md.unsqueeze(0).expand(batch_size, -1)
        elif md.shape[0] != batch_size and md.shape[0] > 0 and batch_size % md.shape[0] == 0:
            # One row per prompt: repeat rows to cover num_images_per_prompt copies.
            md = md.repeat_interleave(batch_size // md.shape[0], dim=0)
        md = md.to(device=device, dtype=dtype)

        if do_classifier_free_guidance:
            # Unconditional half gets zero metadata, mirroring the empty prompt.
            md = torch.cat([torch.zeros_like(md), md])

        return md

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        metadata: Optional[List[float]] = None,
    ):
        r"""
        Generate images from text, optionally conditioned on DiffusionSat metadata.

        BUG FIX: `__call__` previously had no docstring, but
        `@replace_example_docstring` reads `fn.__doc__` and requires an
        `Examples:` section — the decorator would fail at class-definition time.

        Args:
            prompt: Prompt(s) to guide generation; required unless `prompt_embeds` is given.
            metadata: Optional metadata values of length `unet.config.num_metadata`
                (zero-filled when omitted and the UNet uses metadata). Other
                arguments match `StableDiffusionPipeline.__call__`.

        Examples:

        Returns:
            `StableDiffusionPipelineOutput` if `return_dict=True`, else a
            `(images, nsfw_content_detected)` tuple.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # CFG is enabled exactly when guidance_scale > 1 (matching upstream SD).
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.in_channels if hasattr(self.unet, "in_channels") else self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs.
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 6.5: Prepare metadata (auto-zero filled when missing).
        # BUG FIX: was called with `batch_size`, which mismatched the latent /
        # prompt-embedding batch (batch_size * num_images_per_prompt) and tripped
        # the shape assertion below whenever num_images_per_prompt > 1.
        input_metadata = self.prepare_metadata(
            batch_size * num_images_per_prompt, metadata, do_classifier_free_guidance, device, prompt_embeds.dtype
        )
        if input_metadata is not None:
            assert input_metadata.shape[-1] == getattr(self.unet.config, "num_metadata", input_metadata.shape[-1])
            assert input_metadata.shape[0] == prompt_embeds.shape[0]

        # 7. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # Duplicate latents for the (uncond, cond) halves under CFG.
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    metadata=input_metadata,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        # 8. Post-processing: latent passthrough, PIL, or raw numpy.
        if output_type == "latent":
            image = latents
            has_nsfw_concept = None
        elif output_type == "pil":
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
            image = self.numpy_to_pil(image)
        else:
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
301
+
302
+
303
+ __all__ = ["DiffusionSatPipeline"]
pipeline_diffusionsat_controlnet.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Self-contained DiffusionSat ControlNet pipeline that can be loaded directly from
3
+ the checkpoint folder without importing the project package.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import os
9
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
+
11
+ import einops
12
+ import numpy as np
13
+ import PIL.Image
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from torch import nn
17
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
18
+
19
+ from diffusers.loaders import TextualInversionLoaderMixin
20
+ from diffusers.models import AutoencoderKL
21
+ from diffusers.schedulers import KarrasDiffusionSchedulers
22
+ from diffusers.utils import (
23
+ PIL_INTERPOLATION,
24
+ logging,
25
+ randn_tensor,
26
+ replace_example_docstring,
27
+ is_accelerate_available,
28
+ is_accelerate_version,
29
+ )
30
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
31
+ from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
32
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
33
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
34
+ StableDiffusionPipeline as DiffusersStableDiffusionPipeline,
35
+ )
36
+ from diffusers.pipelines.controlnet.pipeline_controlnet import (
37
+ StableDiffusionControlNetPipeline as DiffusersControlNetPipeline,
38
+ )
39
+
40
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
41
+
42
+ EXAMPLE_DOC_STRING = """
43
+ Examples:
44
+ ```py
45
+ >>> # !pip install opencv-python transformers accelerate
46
+ >>> from diffusers import DiffusionPipeline
47
+ >>> from diffusers.utils import load_image
48
+ >>> import numpy as np
49
+ >>> import torch
50
+ >>> import cv2
51
+ >>> from PIL import Image
52
+ >>>
53
+ >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")
54
+ >>> image = np.array(image)
55
+ >>> image = cv2.Canny(image, 100, 200)
56
+ >>> image = image[:, :, None]
57
+ >>> image = np.concatenate([image, image, image], axis=2)
58
+ >>> canny_image = Image.fromarray(image)
59
+ >>>
60
+ >>> pipe = DiffusionPipeline.from_pretrained("path/to/ckpt/diffusionsat", torch_dtype=torch.float16)
61
+ >>> pipe = pipe.to("cuda")
62
+ >>> pipe.enable_xformers_memory_efficient_attention()
63
+ >>> generator = torch.manual_seed(0)
64
+ >>> image = pipe(
65
+ ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image
66
+ ... ).images[0]
67
+ ```
68
+ """
69
+
70
+
71
class DiffusionSatControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin):
    """
    ControlNet-aware pipeline for DiffusionSat. This is a mostly direct copy of
    the project pipeline to avoid importing the `diffusionsat` package when
    loading from the checkpoint folder. Minimal tweaks:
    - auto-fills metadata/cond_metadata with zeros when the model expects them.
    """

    # Components that may legitimately be missing from the checkpoint folder.
    _optional_components = ["safety_checker", "feature_extractor"]

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: Any,
        controlnet: Any,
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        """Register all sub-models with the pipeline.

        Args:
            vae: Autoencoder that maps images to/from the latent space.
            text_encoder: CLIP text encoder producing prompt embeddings.
            tokenizer: CLIP tokenizer matching ``text_encoder``.
            unet: Denoising UNet (expected to accept a ``metadata`` kwarg).
            controlnet: Single ControlNet, or a list/tuple of ControlNets which
                is wrapped into a ``MultiControlNetModel``.
            scheduler: Any Karras-style diffusion scheduler.
            safety_checker: NSFW classifier; may be ``None`` (a warning is logged).
            feature_extractor: Image processor feeding the safety checker.
            requires_safety_checker: Stored in the config; controls the warning.

        Raises:
            ValueError: if ``safety_checker`` is set but ``feature_extractor`` is not.
        """
        super().__init__()

        if safety_checker is None and requires_safety_checker:
            logger.warning(
                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
                " it only for use-cases that involve analyzing network behavior or auditing its results."
            )

        if safety_checker is not None and feature_extractor is None:
            raise ValueError(
                # BUG FIX: this message was a plain string, so "{self.__class__}"
                # was emitted literally; it is now an f-string.
                f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
            )

        # Support MultiControlNetModel-like objects without importing the project module.
        if isinstance(controlnet, (list, tuple)):
            # defer to diffusers' MultiControlNetModel if available
            from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel

            controlnet = MultiControlNetModel(controlnet)

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            controlnet=controlnet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )
        # Spatial factor between pixel space and the VAE latent space
        # (derived from the number of VAE down-blocks).
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.register_to_config(requires_safety_checker=requires_safety_checker)

    # Reuse helpers from diffusers baseline pipelines rather than re-copying them.
    enable_vae_slicing = DiffusersStableDiffusionPipeline.enable_vae_slicing
    disable_vae_slicing = DiffusersStableDiffusionPipeline.disable_vae_slicing
    enable_vae_tiling = DiffusersStableDiffusionPipeline.enable_vae_tiling
    disable_vae_tiling = DiffusersStableDiffusionPipeline.disable_vae_tiling
    enable_sequential_cpu_offload = DiffusersControlNetPipeline.enable_sequential_cpu_offload
    enable_model_cpu_offload = DiffusersControlNetPipeline.enable_model_cpu_offload
    _execution_device = DiffusersStableDiffusionPipeline._execution_device
    _encode_prompt = DiffusersStableDiffusionPipeline._encode_prompt
    run_safety_checker = DiffusersStableDiffusionPipeline.run_safety_checker
    decode_latents = DiffusersStableDiffusionPipeline.decode_latents
    prepare_extra_step_kwargs = DiffusersStableDiffusionPipeline.prepare_extra_step_kwargs
    check_inputs = DiffusersControlNetPipeline.check_inputs
    check_image = DiffusersControlNetPipeline.check_image
    prepare_image = DiffusersControlNetPipeline.prepare_image
    prepare_latents = DiffusersStableDiffusionPipeline.prepare_latents
146
+
147
+ def prepare_metadata(self, batch_size, metadata, ndims, do_classifier_free_guidance, device, dtype):
148
+ has_metadata = getattr(self.unet.config, "use_metadata", False)
149
+ num_metadata = getattr(self.unet.config, "num_metadata", 0)
150
+
151
+ if metadata is None and has_metadata and num_metadata > 0:
152
+ shape = (batch_size, num_metadata) if ndims == 2 else (batch_size, num_metadata, 1)
153
+ metadata = torch.zeros(shape, device=device, dtype=dtype)
154
+
155
+ if metadata is None:
156
+ return None
157
+
158
+ md = torch.as_tensor(metadata)
159
+ if ndims == 2:
160
+ assert (len(md.shape) == 1 and batch_size == 1) or (len(md.shape) == 2 and batch_size > 1)
161
+ if len(md.shape) == 1:
162
+ md = md.unsqueeze(0).expand(batch_size, -1)
163
+ elif ndims == 3:
164
+ assert (len(md.shape) == 2 and batch_size == 1) or (len(md.shape) == 3 and batch_size > 1)
165
+ if len(md.shape) == 2:
166
+ md = md.unsqueeze(0).expand(batch_size, -1, -1)
167
+
168
+ if do_classifier_free_guidance:
169
+ md = torch.cat([torch.zeros_like(md), md])
170
+
171
+ md = md.to(device=device, dtype=dtype)
172
+ return md
173
+
174
+ def _default_height_width(self, height, width, image):
175
+ while isinstance(image, list):
176
+ image = image[0]
177
+
178
+ if height is None:
179
+ if isinstance(image, PIL.Image.Image):
180
+ height = image.height
181
+ elif isinstance(image, torch.Tensor):
182
+ height = image.shape[2]
183
+ height = (height // 8) * 8
184
+
185
+ if width is None:
186
+ if isinstance(image, PIL.Image.Image):
187
+ width = image.width
188
+ elif isinstance(image, torch.Tensor):
189
+ width = image.shape[3]
190
+ width = (width // 8) * 8
191
+
192
+ return height, width
193
+
194
    # override DiffusionPipeline
    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        safe_serialization: bool = False,
        variant: Optional[str] = None,
    ):
        """Serialize all registered components into ``save_directory``.

        Kept only for interface parity with the project pipeline; it defers
        entirely to ``DiffusionPipeline.save_pretrained``.
        """
        # For single or multi controlnet, rely on default save logic.
        super().save_pretrained(save_directory, safe_serialization=safe_serialization, variant=variant)
203
+
204
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
        guess_mode: bool = False,
        metadata: Optional[List[float]] = None,
        cond_metadata: Optional[List[float]] = None,
        is_temporal: bool = False,
        conditioning_downsample: bool = True,
    ):
        r"""Generate images from a prompt and ControlNet conditioning image(s).

        DiffusionSat-specific arguments on top of the standard diffusers
        ControlNet signature:
            metadata: per-sample metadata for the UNet (auto zero-filled when
                omitted and the model expects it); ``cond_metadata`` is the
                analogous metadata for the ControlNet conditioning image(s).
            is_temporal: reshape a channel-stacked conditioning tensor into a
                (b, c, t, h, w) sequence for a temporal ControlNet.
            conditioning_downsample: when ``False``, the conditioning image is
                prepared at 1/8 of the output resolution (latent-sized input).

        NOTE(review): ``replace_example_docstring`` substitutes
        ``EXAMPLE_DOC_STRING`` into the ``Examples:`` section below — confirm
        the installed diffusers version tolerates the docstring layout.

        Examples:

        Returns:
            `StableDiffusionPipelineOutput` or, when ``return_dict=False``, a
            tuple ``(images, nsfw_content_detected)``.
        """
        # 0. Default height and width to unet
        height, width = self._default_height_width(height, width, image)
        cond_height, cond_width = height, width
        if not conditioning_downsample:
            # Conditioning is consumed at latent resolution instead of pixel resolution.
            cond_height, cond_width = height // 8, width // 8

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            image,
            height,
            width,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            controlnet_conditioning_scale,
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            # Caller supplied precomputed embeddings instead of a prompt.
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # CFG is effectively disabled for guidance_scale <= 1.
        do_classifier_free_guidance = guidance_scale > 1.0

        from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel

        if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
            # Broadcast a scalar scale to every sub-ControlNet.
            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )

        # 4. Prepare image
        # torch.compile wraps the model; the real module then lives on `_orig_mod`.
        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
        )
        is_multi_cond = isinstance(image, list)

        # Only preprocess here when the ControlNet has a standard conditioning
        # embedding; otherwise the raw input is forwarded as-is.
        if (
            hasattr(self.controlnet, "controlnet_cond_embedding")
            or is_compiled
            and hasattr(self.controlnet._orig_mod, "controlnet_cond_embedding")
        ):
            image = self.prepare_image(
                image=image,
                width=cond_width,
                height=cond_height,
                batch_size=batch_size * num_images_per_prompt,
                num_images_per_prompt=num_images_per_prompt,
                device=device,
                dtype=self.controlnet.dtype,
                do_classifier_free_guidance=do_classifier_free_guidance,
                guess_mode=guess_mode,
            )

        # 5. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 6. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 7. Prepare extra step kwargs.
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # CUSTOM metadata handling (auto-zero filled)
        input_metadata = self.prepare_metadata(batch_size, metadata, 2, do_classifier_free_guidance, device, prompt_embeds.dtype)
        # Multi-image conditioning carries one metadata vector per frame (3-D).
        ndims_cond = 3 if is_multi_cond else 2
        cond_metadata = self.prepare_metadata(
            batch_size, cond_metadata, ndims_cond, do_classifier_free_guidance, device, prompt_embeds.dtype
        )
        if input_metadata is not None:
            assert len(input_metadata.shape) == 2 and input_metadata.shape[-1] == getattr(self.unet.config, "num_metadata", input_metadata.shape[-1])
        if cond_metadata is not None:
            assert len(cond_metadata.shape) == ndims_cond and cond_metadata.shape[1] == getattr(self.unet.config, "num_metadata", cond_metadata.shape[1])
            if is_multi_cond and not is_temporal and not isinstance(self.controlnet, MultiControlNetModel):
                # One metadata column per RGB conditioning frame stacked into conv_in.
                assert cond_metadata.shape[2] == self.controlnet.controlnet_cond_embedding.conv_in.in_channels / 3
        if input_metadata is not None:
            assert input_metadata.shape[0] == prompt_embeds.shape[0]

        if is_temporal:
            # Unstack channel-concatenated frames into an explicit time axis.
            num_cond = cond_metadata.shape[-1] if cond_metadata is not None else image.shape[1] // self.controlnet.config.conditioning_in_channels
            image = einops.rearrange(image, 'b (t c) h w -> b c t h w', t=num_cond)
        elif isinstance(self.controlnet, MultiControlNetModel) and cond_metadata is not None:
            # Split stacked frames into per-ControlNet lists (one entry per net).
            num_cond = cond_metadata.shape[-1] if cond_metadata is not None else image.shape[1] // self.controlnet.config.conditioning_in_channels
            image = einops.rearrange(image, 'b (t c) h w -> t b c h w', t=num_cond)
            image = [im for im in image]
            cond_metadata = einops.rearrange(cond_metadata, 'b m t -> t b m')
            cond_metadata = [cond_md for cond_md in cond_metadata]

        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # Duplicate latents for the uncond/cond halves under CFG.
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                if guess_mode and do_classifier_free_guidance:
                    # Guess mode: ControlNet only sees the conditional branch.
                    controlnet_latent_model_input = latents
                    controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
                else:
                    controlnet_latent_model_input = latent_model_input
                    controlnet_prompt_embeds = prompt_embeds

                down_block_res_samples, mid_block_res_sample = self.controlnet(
                    controlnet_latent_model_input,
                    t,
                    encoder_hidden_states=controlnet_prompt_embeds,
                    controlnet_cond=image,
                    metadata=input_metadata,
                    cond_metadata=cond_metadata,
                    conditioning_scale=controlnet_conditioning_scale,
                    guess_mode=guess_mode,
                    return_dict=False,
                )

                if guess_mode and do_classifier_free_guidance:
                    # Zero residuals for the unconditional half so only the
                    # conditional branch receives ControlNet guidance.
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    metadata=input_metadata,
                    cross_attention_kwargs=cross_attention_kwargs,
                    down_block_additional_residuals=down_block_res_samples,
                    mid_block_additional_residual=mid_block_res_sample,
                    return_dict=False,
                )[0]

                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                # Tick the bar once the warmup steps are consumed (or on the last step).
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            # Free GPU memory before VAE decode under model offloading.
            self.unet.to("cpu")
            self.controlnet.to("cpu")
            torch.cuda.empty_cache()

        # NOTE: `image` is reused below as the OUTPUT image variable.
        if output_type == "latent":
            image = latents
            has_nsfw_concept = None
        elif output_type == "pil":
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
            image = self.numpy_to_pil(image)
        else:
            # numpy output
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
423
+
424
+
425
+ __all__ = ["DiffusionSatControlNetPipeline"]
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.17.0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "v_prediction",
12
+ "rescale_betas_zero_snr": false,
13
+ "sample_max_value": 1.0,
14
+ "set_alpha_to_one": false,
15
+ "skip_prk_steps": true,
16
+ "steps_offset": 1,
17
+ "thresholding": false,
18
+ "timestep_spacing": "leading",
19
+ "trained_betas": null
20
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "stabilityai/stable-diffusion-2-1",
3
+ "architectures": [
4
+ "CLIPTextModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_size": 1024,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 23,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 512,
22
+ "torch_dtype": "float16",
23
+ "transformers_version": "4.31.0",
24
+ "vocab_size": 49408
25
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
3
+ size 680820392
tokenizer/config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "ControlNetModel",
3
+ "_diffusers_version": "0.17.0",
4
+ "_name_or_path": "/data/jiabo/diffusionsat/testoutput/checkpoint-1",
5
+ "act_fn": "silu",
6
+ "attention_head_dim": [
7
+ 5,
8
+ 10,
9
+ 20,
10
+ 20
11
+ ],
12
+ "block_out_channels": [
13
+ 320,
14
+ 640,
15
+ 1280,
16
+ 1280
17
+ ],
18
+ "class_embed_type": null,
19
+ "conditioning_embedding_out_channels": [
20
+ 16,
21
+ 32,
22
+ 96,
23
+ 256
24
+ ],
25
+ "conditioning_in_channels": 3,
26
+ "conditioning_scale": 1,
27
+ "controlnet_conditioning_channel_order": "rgb",
28
+ "cross_attention_dim": 1024,
29
+ "down_block_types": [
30
+ "CrossAttnDownBlock2D",
31
+ "CrossAttnDownBlock2D",
32
+ "CrossAttnDownBlock2D",
33
+ "DownBlock2D"
34
+ ],
35
+ "downsample_padding": 1,
36
+ "flip_sin_to_cos": true,
37
+ "freq_shift": 0,
38
+ "global_pool_conditions": false,
39
+ "in_channels": 4,
40
+ "layers_per_block": 2,
41
+ "mid_block_scale_factor": 1,
42
+ "norm_eps": 1e-05,
43
+ "norm_num_groups": 32,
44
+ "num_class_embeds": null,
45
+ "num_metadata": 7,
46
+ "only_cross_attention": false,
47
+ "projection_class_embeddings_input_dim": null,
48
+ "resnet_time_scale_shift": "default",
49
+ "upcast_attention": true,
50
+ "use_linear_projection": true,
51
+ "use_metadata": true
52
+ }
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "!",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": {
4
+ "__type": "AddedToken",
5
+ "content": "<|startoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false
10
+ },
11
+ "clean_up_tokenization_spaces": true,
12
+ "do_lower_case": true,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "<|endoftext|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "errors": "replace",
22
+ "model_max_length": 77,
23
+ "pad_token": "<|endoftext|>",
24
+ "tokenizer_class": "CLIPTokenizer",
25
+ "unk_token": {
26
+ "__type": "AddedToken",
27
+ "content": "<|endoftext|>",
28
+ "lstrip": false,
29
+ "normalized": true,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": ["sat_unet", "SatUNet"],
3
+ "_diffusers_version": "0.17.0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": [
6
+ 5,
7
+ 10,
8
+ 20,
9
+ 20
10
+ ],
11
+ "block_out_channels": [
12
+ 320,
13
+ 640,
14
+ 1280,
15
+ 1280
16
+ ],
17
+ "center_input_sample": false,
18
+ "class_embed_type": null,
19
+ "conv_in_kernel": 3,
20
+ "conv_out_kernel": 3,
21
+ "cross_attention_dim": 1024,
22
+ "down_block_types": [
23
+ "CrossAttnDownBlock2D",
24
+ "CrossAttnDownBlock2D",
25
+ "CrossAttnDownBlock2D",
26
+ "DownBlock2D"
27
+ ],
28
+ "downsample_padding": 1,
29
+ "dual_cross_attention": false,
30
+ "flip_sin_to_cos": true,
31
+ "freq_shift": 0,
32
+ "in_channels": 4,
33
+ "layers_per_block": 2,
34
+ "mid_block_scale_factor": 1,
35
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
36
+ "norm_eps": 1e-05,
37
+ "norm_num_groups": 32,
38
+ "num_class_embeds": null,
39
+ "num_metadata": 7,
40
+ "only_cross_attention": false,
41
+ "out_channels": 4,
42
+ "resnet_time_scale_shift": "default",
43
+ "sample_size": 96,
44
+ "time_cond_proj_dim": null,
45
+ "time_embedding_type": "positional",
46
+ "timestep_post_act": null,
47
+ "up_block_types": [
48
+ "UpBlock2D",
49
+ "CrossAttnUpBlock2D",
50
+ "CrossAttnUpBlock2D",
51
+ "CrossAttnUpBlock2D"
52
+ ],
53
+ "upcast_attention": true,
54
+ "use_linear_projection": true,
55
+ "use_metadata": true
56
+ }
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef6c0264f8eb5085b08e5f16631e1ad8ba078f28d94c902276f8dfc603e3eb80
3
+ size 1760615624
unet/sat_unet.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Satellite UNet wrapper with metadata support on top of diffusers."""
2
+ from typing import Any, Dict, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from diffusers.models.unets.unet_2d_condition import (
8
+ UNet2DConditionModel,
9
+ UNet2DConditionOutput,
10
+ )
11
+ from diffusers.utils import logging
12
+
13
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
14
+
15
+
16
class SatUNet(UNet2DConditionModel):
    """Thin wrapper around `diffusers.UNet2DConditionModel` with metadata embeddings.

    Adds `num_metadata` independent per-field embeddings whose summed output is
    added to the timestep embedding inside `forward` (via `_encode_metadata`).
    """

    _supports_gradient_checkpointing = True

    def __init__(self, *args, use_metadata: bool = True, num_metadata: int = 7, **kwargs):
        """Build the base UNet, then attach one embedding module per metadata field.

        Args:
            use_metadata: whether to create (and later require) metadata embeddings.
            num_metadata: number of scalar metadata fields per sample.
        """
        super().__init__(*args, **kwargs)

        # Track custom config entries for save/load parity with the base model.
        self.register_to_config(use_metadata=use_metadata, num_metadata=num_metadata)

        self.use_metadata = use_metadata
        self.num_metadata = num_metadata

        if use_metadata:
            # Re-use the same dimensions as the base time embedding so the
            # metadata embedding can be summed with the timestep embedding.
            timestep_input_dim = self.time_embedding.linear_1.in_features
            time_embed_dim = self.time_embedding.linear_2.out_features
            self.metadata_embedding = nn.ModuleList(
                [self._build_metadata_embedding(timestep_input_dim, time_embed_dim) for _ in range(num_metadata)]
            )
        else:
            # Sentinel: _encode_metadata short-circuits to None when disabled.
            self.metadata_embedding = None
39
+
40
+ @staticmethod
41
+ def _build_metadata_embedding(timestep_input_dim: int, time_embed_dim: int) -> nn.Module:
42
+ from diffusers.models.embeddings import TimestepEmbedding
43
+
44
+ return TimestepEmbedding(timestep_input_dim, time_embed_dim)
45
+
46
+ def _encode_metadata(
47
+ self, metadata: Optional[torch.Tensor], dtype: torch.dtype
48
+ ) -> Optional[torch.Tensor]:
49
+ if self.metadata_embedding is None:
50
+ return None
51
+
52
+ if metadata is None:
53
+ raise ValueError("metadata must be provided when use_metadata=True")
54
+
55
+ if metadata.dim() != 2 or metadata.shape[1] != self.num_metadata:
56
+ raise ValueError(f"Invalid metadata shape {metadata.shape}, expected (batch, {self.num_metadata})")
57
+
58
+ md_bsz = metadata.shape[0]
59
+ # Reuse the same projection used for timestep encoding to stay aligned with base embeddings.
60
+ projected = self.time_proj(metadata.view(-1)).view(md_bsz, self.num_metadata, -1).to(dtype=dtype)
61
+
62
+ md_emb = projected.new_zeros((md_bsz, projected.shape[-1]))
63
+ for idx, md_embed in enumerate(self.metadata_embedding):
64
+ md_emb = md_emb + md_embed(projected[:, idx, :])
65
+
66
+ return md_emb
67
+
68
+ def forward(
69
+ self,
70
+ sample: torch.Tensor,
71
+ timestep: Union[torch.Tensor, float, int],
72
+ encoder_hidden_states: torch.Tensor,
73
+ class_labels: Optional[torch.Tensor] = None,
74
+ timestep_cond: Optional[torch.Tensor] = None,
75
+ attention_mask: Optional[torch.Tensor] = None,
76
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
77
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
78
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
79
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
80
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
81
+ encoder_attention_mask: Optional[torch.Tensor] = None,
82
+ metadata: Optional[torch.Tensor] = None,
83
+ return_dict: bool = True,
84
+ ) -> Union[UNet2DConditionOutput, Tuple]:
85
+ # Largely mirrors `UNet2DConditionModel.forward` with a metadata injection on the timestep embedding.
86
+
87
+ default_overall_up_factor = 2**self.num_upsamplers
88
+ forward_upsample_size = False
89
+ upsample_size = None
90
+
91
+ for dim in sample.shape[-2:]:
92
+ if dim % default_overall_up_factor != 0:
93
+ forward_upsample_size = True
94
+ break
95
+
96
+ if attention_mask is not None:
97
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
98
+ attention_mask = attention_mask.unsqueeze(1)
99
+
100
+ if encoder_attention_mask is not None:
101
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
102
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
103
+
104
+ if self.config.center_input_sample:
105
+ sample = 2 * sample - 1.0
106
+
107
+ t_emb = self.get_time_embed(sample=sample, timestep=timestep)
108
+ emb = self.time_embedding(t_emb, timestep_cond)
109
+
110
+ class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
111
+ if class_emb is not None:
112
+ if self.config.class_embeddings_concat:
113
+ emb = torch.cat([emb, class_emb], dim=-1)
114
+ else:
115
+ emb = emb + class_emb
116
+
117
+ aug_emb = self.get_aug_embed(
118
+ emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs or {}
119
+ )
120
+ if self.config.addition_embed_type == "image_hint" and aug_emb is not None:
121
+ aug_emb, hint = aug_emb
122
+ sample = torch.cat([sample, hint], dim=1)
123
+
124
+ emb = emb + aug_emb if aug_emb is not None else emb
125
+
126
+ md_emb = self._encode_metadata(metadata=metadata, dtype=sample.dtype)
127
+ if md_emb is not None:
128
+ emb = emb + md_emb
129
+
130
+ if self.time_embed_act is not None:
131
+ emb = self.time_embed_act(emb)
132
+
133
+ encoder_hidden_states = self.process_encoder_hidden_states(
134
+ encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs or {}
135
+ )
136
+
137
+ sample = self.conv_in(sample)
138
+
139
+ if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
140
+ cross_attention_kwargs = cross_attention_kwargs.copy()
141
+ gligen_args = cross_attention_kwargs.pop("gligen")
142
+ cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
143
+
144
+ if cross_attention_kwargs is not None:
145
+ cross_attention_kwargs = cross_attention_kwargs.copy()
146
+ lora_scale = cross_attention_kwargs.pop("scale", 1.0)
147
+ else:
148
+ lora_scale = 1.0
149
+
150
+ from diffusers.utils import USE_PEFT_BACKEND, scale_lora_layers, unscale_lora_layers, deprecate
151
+
152
+ if USE_PEFT_BACKEND:
153
+ scale_lora_layers(self, lora_scale)
154
+
155
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
156
+ is_adapter = down_intrablock_additional_residuals is not None
157
+ if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
158
+ deprecate(
159
+ "T2I should not use down_block_additional_residuals",
160
+ "1.3.0",
161
+ "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated "
162
+ "and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used "
163
+ "for ControlNet. Please use `down_intrablock_additional_residuals` instead.",
164
+ standard_warn=False,
165
+ )
166
+ down_intrablock_additional_residuals = down_block_additional_residuals
167
+ is_adapter = True
168
+
169
+ down_block_res_samples = (sample,)
170
+ for downsample_block in self.down_blocks:
171
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
172
+ additional_residuals: Dict[str, torch.Tensor] = {}
173
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
174
+ additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
175
+
176
+ sample, res_samples = downsample_block(
177
+ hidden_states=sample,
178
+ temb=emb,
179
+ encoder_hidden_states=encoder_hidden_states,
180
+ attention_mask=attention_mask,
181
+ cross_attention_kwargs=cross_attention_kwargs,
182
+ encoder_attention_mask=encoder_attention_mask,
183
+ **additional_residuals,
184
+ )
185
+ else:
186
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
187
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
188
+ sample += down_intrablock_additional_residuals.pop(0)
189
+
190
+ down_block_res_samples += res_samples
191
+
192
+ if is_controlnet:
193
+ new_down_block_res_samples = ()
194
+
195
+ for down_block_res_sample, down_block_additional_residual in zip(
196
+ down_block_res_samples, down_block_additional_residuals
197
+ ):
198
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
199
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
200
+
201
+ down_block_res_samples = new_down_block_res_samples
202
+
203
+ if self.mid_block is not None:
204
+ if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
205
+ sample = self.mid_block(
206
+ sample,
207
+ emb,
208
+ encoder_hidden_states=encoder_hidden_states,
209
+ attention_mask=attention_mask,
210
+ cross_attention_kwargs=cross_attention_kwargs,
211
+ encoder_attention_mask=encoder_attention_mask,
212
+ )
213
+ else:
214
+ sample = self.mid_block(sample, emb)
215
+
216
+ if (
217
+ is_adapter
218
+ and len(down_intrablock_additional_residuals) > 0
219
+ and sample.shape == down_intrablock_additional_residuals[0].shape
220
+ ):
221
+ sample += down_intrablock_additional_residuals.pop(0)
222
+
223
+ if is_controlnet:
224
+ sample = sample + mid_block_additional_residual
225
+
226
+ for i, upsample_block in enumerate(self.up_blocks):
227
+ is_final_block = i == len(self.up_blocks) - 1
228
+
229
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
230
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
231
+
232
+ if not is_final_block and forward_upsample_size:
233
+ upsample_size = down_block_res_samples[-1].shape[2:]
234
+
235
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
236
+ sample = upsample_block(
237
+ hidden_states=sample,
238
+ temb=emb,
239
+ res_hidden_states_tuple=res_samples,
240
+ encoder_hidden_states=encoder_hidden_states,
241
+ cross_attention_kwargs=cross_attention_kwargs,
242
+ upsample_size=upsample_size,
243
+ attention_mask=attention_mask,
244
+ encoder_attention_mask=encoder_attention_mask,
245
+ )
246
+ else:
247
+ sample = upsample_block(
248
+ hidden_states=sample,
249
+ temb=emb,
250
+ res_hidden_states_tuple=res_samples,
251
+ upsample_size=upsample_size,
252
+ )
253
+
254
+ if self.conv_norm_out:
255
+ sample = self.conv_norm_out(sample)
256
+ sample = self.conv_act(sample)
257
+ sample = self.conv_out(sample)
258
+
259
+ if USE_PEFT_BACKEND:
260
+ unscale_lora_layers(self, lora_scale)
261
+
262
+ if not return_dict:
263
+ return (sample,)
264
+
265
+ return UNet2DConditionOutput(sample=sample)
vae/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.17.0",
4
+ "_name_or_path": "stabilityai/stable-diffusion-2-1",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "in_channels": 3,
19
+ "latent_channels": 4,
20
+ "layers_per_block": 2,
21
+ "norm_num_groups": 32,
22
+ "out_channels": 3,
23
+ "sample_size": 768,
24
+ "scaling_factor": 0.18215,
25
+ "up_block_types": [
26
+ "UpDecoderBlock2D",
27
+ "UpDecoderBlock2D",
28
+ "UpDecoderBlock2D",
29
+ "UpDecoderBlock2D"
30
+ ]
31
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
3
+ size 167335342