opendiffusionai
/

stablediffusionxl_t5

@@ -1,115 +1,185 @@
-# pipeline.py
-# subclass SD pipeline to replace CLIP-L with T5
-import torch
-import torch.nn as nn
-from transformers import AutoTokenizer, AutoModel
-from transformers import T5Tokenizer, T5EncoderModel
-from diffusers import StableDiffusionPipeline
-from diffusers.utils import logging
-logger = logging.get_logger(__name__)
-T5_NAME="mcmonkey/google_t5-v1_1-xxl_encoderonly"
 class LinearWithDtype(nn.Linear):
     @property
     def dtype(self):
         return self.weight.dtype
-class StableDiffusionT5Pipeline(StableDiffusionPipeline):
-    # override this so we can auto-init text_encoder
-    #_optional_components = ["safety_checker", "feature_extractor", "image_encoder", "text_encoder"]
-    # t5_projection not really optional, but needed it here to stop internal whining
-    _optional_components = StableDiffusionPipeline._optional_components + ["text_encoder", "t5_projection"]
     def __init__(
         self,
-        vae,
-        text_encoder,
-        tokenizer,
-        unet,
-        scheduler,
-        safety_checker=None,
-        feature_extractor=None,
-        image_encoder=None,
-        requires_safety_checker=True,
-        t5_projection: LinearWithDtype=None,
     ):
-        self.tokenizer = (
-            tokenizer
-            if tokenizer is not None
-            else T5Tokenizer.from_pretrained(T5_NAME,torch_dtype=unet.dtype)
-        )
-        if text_encoder is None:
-            self.text_encoder = T5EncoderModel.from_pretrained(T5_NAME, torch_dtype=unet.dtype)
         else:
-            self.text_encoder     = text_encoder
-        super().__init__(
             vae=vae,
-            tokenizer=self.tokenizer,
-            text_encoder=self.text_encoder,
             unet=unet,
             scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
             image_encoder=image_encoder,
-            requires_safety_checker=requires_safety_checker,
         )
-        if t5_projection is None:
-            self.t5_projection = LinearWithDtype(4096, 768).to(unet.device, dtype=unet.dtype)
-        else:
-            self.t5_projection = t5_projection
-        # Ensure everything is properly registered for to("cuda")
-        self.register_modules(t5_projection=self.t5_projection)
     def encode_prompt(
         self,
         prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt = None,
-        prompt_embeds = None,
-        negative_prompt_embeds = None,
-        lora_scale = None,
-        clip_skip = None,  # ignore with T5!!
-        **kwargs,
     ):
-        def _tok(text):
-            out = self.tokenizer(
                 text,
                 return_tensors="pt",
                 padding="max_length",
                 max_length=self.tokenizer.model_max_length,
                 truncation=True,
-            )
-            return out.input_ids.to(device=device, dtype=torch.long), out.attention_mask.to(device)
-        pos_ids, pos_mask = _tok(prompt)
-        pos_hidden = self.text_encoder(pos_ids, attention_mask=pos_mask).last_hidden_state
-        pos_embeds = self.t5_projection(pos_hidden)
-        if do_classifier_free_guidance:
-            neg_prompt = negative_prompt if negative_prompt is not None else ""
-            neg_ids, neg_mask = _tok(neg_prompt)
-            neg_hidden = self.text_encoder(neg_ids, attention_mask=neg_mask).last_hidden_state
-            neg_embeds = self.t5_projection(neg_hidden)
-            # Expand for multiple images per prompt
-            pos_embeds = pos_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            neg_embeds = neg_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            return [neg_embeds, pos_embeds]
         else:
-            pos_embeds = pos_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            return pos_embeds

+# Copyright Philip Brown, ppbrown@github
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Note: At this time, the intent is to use the T5 encoder mentioned
+# below, with zero changes.
+# Therefore, the model deliberately does not store the T5 encoder model bytes,
+# (Since they are not unique!)
+# but instead takes advantage of huggingface hub cache loading
+T5_NAME  = "mcmonkey/google_t5-v1_1-xxl_encoderonly"
+from diffusers import StableDiffusionXLPipeline, DiffusionPipeline
+from transformers import T5Tokenizer, T5EncoderModel
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from diffusers.schedulers import KarrasDiffusionSchedulers
+from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
+from typing import Optional
+import torch.nn as nn, torch, types
+import torch.nn as nn
 class LinearWithDtype(nn.Linear):
     @property
     def dtype(self):
         return self.weight.dtype
+class StableDiffusionXL_T5Pipeline(StableDiffusionXLPipeline):
+    _expected_modules = [
+        "vae", "unet", "scheduler", "tokenizer",
+        "image_encoder", "feature_extractor",
+        "t5_encoder", "t5_projection", "t5_pooled_projection",
+    ]
+    _optional_components = [
+        "image_encoder", "feature_extractor",
+        "t5_encoder", "t5_projection", "t5_pooled_projection",
+    ]
     def __init__(
         self,
+        vae: AutoencoderKL,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        tokenizer: T5Tokenizer,
+        t5_encoder=None,
+        t5_projection=None,
+        t5_pooled_projection=None,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        feature_extractor: CLIPImageProcessor = None,
+        force_zeros_for_empty_prompt: bool = True,
+        add_watermarker: Optional[bool] = None,
     ):
+        DiffusionPipeline.__init__(self)
+        if t5_encoder is None:
+            self.t5_encoder     = T5EncoderModel.from_pretrained(T5_NAME,
+                                torch_dtype=unet.dtype)
+        else:
+            self.t5_encoder     = t5_encoder
+        # ----- build T5 4096 => 2048 dim projection -----
+        if t5_projection is None:
+            self.t5_projection  = LinearWithDtype(4096, 2048)   # trainable
         else:
+            self.t5_projection  = t5_projection
+        self.t5_projection.to(dtype=unet.dtype)
+        # ----- build T5 4096 => 1280 dim projection -----
+        if t5_pooled_projection is None:
+            self.t5_pooled_projection  = LinearWithDtype(4096, 1280)   # trainable
+        else:
+            self.t5_pooled_projection  = t5_pooled_projection
+        self.t5_pooled_projection.to(dtype=unet.dtype)
+        print("dtype of Linear is ",self.t5_projection.dtype)
+        self.register_modules(
             vae=vae,
             unet=unet,
             scheduler=scheduler,
+            tokenizer=tokenizer,
+            t5_encoder=self.t5_encoder,
+            t5_projection=self.t5_projection,
+            t5_pooled_projection=self.t5_pooled_projection,
             image_encoder=image_encoder,
+            feature_extractor=feature_extractor,
+        )
+        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.default_sample_size = (
+            self.unet.config.sample_size
+            if hasattr(self, "unet") and self.unet is not None and hasattr(self.unet.config, "sample_size")
+            else 128
         )
+        self.watermark = None
+        # Parts of original SDXL class complain if these attributes are not
+        # at least PRESENT
+        self.text_encoder = self.text_encoder_2 = None
+    # ------------------------------------------------------------------------
+    #  Encode a text prompt
+    #  Use + 4096 => 2048 projection for standard embeds, but
+    #   4096 => 1280 for pooled embeds, because that's what the unet requires.
+    #  Returns exactly four tensors in the order SDXL's __call__ expects.
+    # ------------------------------------------------------------------------
     def encode_prompt(
         self,
         prompt,
+        num_images_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str | None = None,
+        **_,
     ):
+        """
+        Returns
+        -------
+        prompt_embeds                : Tensor [B, T, 2048]
+        negative_prompt_embeds       : Tensor [B, T, 2048] | None
+        pooled_prompt_embeds         : Tensor [B, 1280]
+        negative_pooled_prompt_embeds: Tensor [B, 1280]    | None
+        where B = batch * num_images_per_prompt
+        """
+        # --- helper to tokenize on the pipeline's device ----------------
+        def _tok(text: str):
+            tok_out = self.tokenizer(
                 text,
                 return_tensors="pt",
                 padding="max_length",
                 max_length=self.tokenizer.model_max_length,
                 truncation=True,
+            ).to(self.device)
+            return tok_out.input_ids, tok_out.attention_mask
+        # ---------- positive stream -------------------------------------
+        ids, mask = _tok(prompt)
+        h_pos = self.t5_encoder(ids, attention_mask=mask).last_hidden_state   # [b, T, 4096]
+        tok_pos = self.t5_projection(h_pos)                                   # [b, T, 2048]
+        pool_pos = self.t5_pooled_projection(h_pos.mean(dim=1))               # [b, 1280]
+        # expand for multiple images per prompt
+        tok_pos   = tok_pos.repeat_interleave(num_images_per_prompt, 0)
+        pool_pos  = pool_pos.repeat_interleave(num_images_per_prompt, 0)
+        # ---------- negative / CFG stream --------------------------------
+        if do_classifier_free_guidance:
+            neg_text = "" if negative_prompt is None else negative_prompt
+            ids_n, mask_n = _tok(neg_text)
+            h_neg = self.t5_encoder(ids_n, attention_mask=mask_n).last_hidden_state
+            tok_neg = self.t5_projection(h_neg)
+            pool_neg = self.t5_pooled_projection(h_neg.mean(dim=1))
+            tok_neg  = tok_neg.repeat_interleave(num_images_per_prompt, 0)
+            pool_neg = pool_neg.repeat_interleave(num_images_per_prompt, 0)
         else:
+            tok_neg = pool_neg = None
+        return tok_pos, tok_neg, pool_pos, pool_neg