Eyalgut committed on
Commit 8baa624 · verified · 1 Parent(s): 36c6ec5

Update pipeline_bria.py

Files changed (1): pipeline_bria.py (+128 −28)
pipeline_bria.py CHANGED

@@ -1,4 +1,4 @@
-from diffusers.pipelines.flux.pipeline_flux import FluxPipeline, retrieve_timesteps
+from diffusers.pipelines.flux.pipeline_flux import FluxPipeline, retrieve_timesteps, calculate_shift
 from typing import Any, Callable, Dict, List, Optional, Union

 import torch
@@ -25,7 +25,9 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
 from transformer_bria import BriaTransformer2DModel
 from bria_utils import get_t5_prompt_embeds, get_original_sigmas, is_ng_none
-
+from diffusers.utils.torch_utils import randn_tensor
+import diffusers
+import numpy as np
 if is_torch_xla_available():
     import torch_xla.core.xla_model as xm

@@ -78,10 +80,6 @@ class BriaPipeline(FluxPipeline):
         [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
     """

-    # model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder->transformer->vae"
-    # _optional_components = []
-    # _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
-
     def __init__(
         self,
         transformer: BriaTransformer2DModel,
@@ -109,6 +107,11 @@ class BriaPipeline(FluxPipeline):
         self.text_encoder = self.text_encoder.to(dtype=T5_PRECISION)
         for block in self.text_encoder.encoder.block:
             block.layer[-1].DenseReluDense.wo.to(dtype=torch.float32)
+
+        if self.vae.config.shift_factor is None:
+            self.vae.config.shift_factor = 0
+            self.vae.to(dtype=torch.float32)
+

     def encode_prompt(
         self,
@@ -326,10 +329,10 @@ class BriaPipeline(FluxPipeline):

         Examples:

-        Returns:
-            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        Returns:
+            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
+            images.
         """

         height = height or self.default_sample_size * self.vae_scale_factor
@@ -382,16 +385,7 @@ class BriaPipeline(FluxPipeline):
         if self.do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)

-        # 4. Prepare timesteps
-        # Sample from training sigmas
-        if isinstance(self.scheduler, DDIMScheduler) or isinstance(self.scheduler, EulerAncestralDiscreteScheduler):
-            timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, None)
-        else:
-            sigmas = get_original_sigmas(num_train_timesteps=self.scheduler.config.num_train_timesteps, num_inference_steps=num_inference_steps)
-            timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps, sigmas=sigmas)
-
-        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
-        self._num_timesteps = len(timesteps)
+

         # 5. Prepare latent variables
         num_channels_latents = self.transformer.config.in_channels // 4  # due to patch=2, we divide by 4
@@ -406,9 +400,42 @@ class BriaPipeline(FluxPipeline):
             latents,
         )

+        if isinstance(self.scheduler, FlowMatchEulerDiscreteScheduler) and self.scheduler.config['use_dynamic_shifting']:
+            sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+            image_seq_len = latents.shape[1]  # Shift by height - Why just height?
+            print(f"Using dynamic shift in pipeline with sequence length {image_seq_len}")
+
+            mu = calculate_shift(
+                image_seq_len,
+                self.scheduler.config.base_image_seq_len,
+                self.scheduler.config.max_image_seq_len,
+                self.scheduler.config.base_shift,
+                self.scheduler.config.max_shift,
+            )
+            timesteps, num_inference_steps = retrieve_timesteps(
+                self.scheduler,
+                num_inference_steps,
+                device,
+                timesteps,
+                sigmas,
+                mu=mu,
+            )
+        else:
+            # 4. Prepare timesteps
+            # Sample from training sigmas
+            if isinstance(self.scheduler, DDIMScheduler) or isinstance(self.scheduler, EulerAncestralDiscreteScheduler):
+                timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, None)
+            else:
+                sigmas = get_original_sigmas(num_train_timesteps=self.scheduler.config.num_train_timesteps, num_inference_steps=num_inference_steps)
+                timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps, sigmas=sigmas)
+
+        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+        self._num_timesteps = len(timesteps)
+
         # Support different diffusers versions
-        if len(latent_image_ids.shape) == 2:
-            text_ids = text_ids.squeeze()
+        if diffusers.__version__ >= '0.32.0':
+            latent_image_ids = latent_image_ids[0]
+            text_ids = text_ids[0]

         # 6. Denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -452,12 +479,6 @@ class BriaPipeline(FluxPipeline):
                 latents_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

-
-                # if latents.std().item() > 2:
-                #     print('Warning')
-
-                # print(t.item(), latents.mean().item(), latents.std().item())
-
                 if latents.dtype != latents_dtype:
                     if torch.backends.mps.is_available():
                         # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
@@ -554,9 +575,88 @@ class BriaPipeline(FluxPipeline):
         self.text_encoder = self.text_encoder.to(dtype=T5_PRECISION)
         for block in self.text_encoder.encoder.block:
             block.layer[-1].DenseReluDense.wo.to(dtype=torch.float32)
+
+        if self.vae.config.shift_factor == 0 and self.vae.dtype != torch.float32:
+            self.vae.to(dtype=torch.float32)
+

         return self

+
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        # VAE applies 8x compression on images but we must also account for packing which requires
+        # latent height and width to be divisible by 2.
+        height = 2 * (int(height) // self.vae_scale_factor)
+        width = 2 * (int(width) // self.vae_scale_factor)
+
+        shape = (batch_size, num_channels_latents, height, width)
+
+        if latents is not None:
+            latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+            return latents.to(device=device, dtype=dtype), latent_image_ids
+
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
+
+        latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
+
+        return latents, latent_image_ids
+
+    @staticmethod
+    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
+        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
+        latents = latents.permute(0, 2, 4, 1, 3, 5)
+        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
+
+        return latents
+
+    @staticmethod
+    def _unpack_latents(latents, height, width, vae_scale_factor):
+        batch_size, num_patches, channels = latents.shape
+
+        height = height // vae_scale_factor
+        width = width // vae_scale_factor
+
+        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
+        latents = latents.permute(0, 3, 1, 4, 2, 5)
+
+        latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
+
+        return latents
+
+    @staticmethod
+    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
+        latent_image_ids = torch.zeros(height, width, 3)
+        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
+        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
+
+        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
+
+        latent_image_ids = latent_image_ids.repeat(batch_size, 1, 1, 1)
+        latent_image_ids = latent_image_ids.reshape(
+            batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
+        )
+
+        return latent_image_ids.to(device=device, dtype=dtype)
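The __init__ and to() changes keep the T5 text encoder in reduced precision while pinning each block's final feed-forward output projection (DenseReluDense.wo) to float32. A minimal standalone sketch of that trick; "t5-small" is purely a stand-in checkpoint, not the encoder this pipeline ships with:

import torch
from transformers import T5EncoderModel

# Run the encoder in bfloat16, but keep the last feed-forward projection of
# every block in float32 for numerical stability. "t5-small" is a stand-in.
encoder = T5EncoderModel.from_pretrained("t5-small", torch_dtype=torch.bfloat16)
for block in encoder.encoder.block:
    block.layer[-1].DenseReluDense.wo.to(dtype=torch.float32)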
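The new dynamic-shifting branch computes the flow-matching shift parameter mu as a linear interpolation over the image token count, then hands it to retrieve_timesteps. A standalone sketch of that computation; the default bounds below mirror common Flux scheduler configs and are assumptions, not values from this commit (the pipeline reads the real ones from self.scheduler.config):

import numpy as np

def calculate_shift_sketch(image_seq_len, base_seq_len=256, max_seq_len=4096,
                           base_shift=0.5, max_shift=1.16):
    # Linear interpolation: sequences of base_seq_len tokens get base_shift,
    # sequences of max_seq_len tokens get max_shift.
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b

num_inference_steps = 30
sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
mu = calculate_shift_sketch(4096)  # e.g. a 1024x1024 image packed into 64x64 patches
print(f"mu={mu:.3f}, sigmas[:3]={sigmas[:3]}")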
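One caveat in the compatibility guard: diffusers.__version__ >= '0.32.0' compares strings lexicographically, so e.g. '0.9.0' >= '0.32.0' evaluates to True. A more robust variant of the same branch, sketched with the packaging library (the tensors are placeholders for illustration):

import torch
import diffusers
from packaging import version

latent_image_ids = torch.zeros(2, 4096, 3)  # placeholder ids with a batch dim
text_ids = torch.zeros(2, 128, 3)

# Parse versions instead of comparing strings lexicographically.
if version.parse(diffusers.__version__) >= version.parse("0.32.0"):
    latent_image_ids = latent_image_ids[0]
    text_ids = text_ids[0]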
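_pack_latents and _unpack_latents are exact inverses: packing folds each 2x2 latent patch into the channel dimension, turning (B, C, H, W) into a (B, H/2 * W/2, 4C) token sequence for the transformer. A self-contained round-trip check (sizes chosen arbitrarily):

import torch

B, C, H, W = 1, 4, 64, 64  # latent-space sizes, arbitrary
latents = torch.randn(B, C, H, W)

# Pack: fold each 2x2 spatial patch into the channel axis.
packed = latents.view(B, C, H // 2, 2, W // 2, 2)
packed = packed.permute(0, 2, 4, 1, 3, 5).reshape(B, (H // 2) * (W // 2), C * 4)

# Unpack: invert the permutation to recover the original layout.
unpacked = packed.view(B, H // 2, W // 2, C, 2, 2)
unpacked = unpacked.permute(0, 3, 1, 4, 2, 5).reshape(B, C, H, W)

assert torch.equal(latents, unpacked)  # packing is lossless
print(packed.shape)  # torch.Size([1, 1024, 16])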
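Putting it together, a hypothetical end-to-end call; the checkpoint id is a placeholder rather than a model from this commit, and the arguments follow the Flux-style API that BriaPipeline inherits:

import torch
from pipeline_bria import BriaPipeline  # the file changed in this commit

# "<org>/<checkpoint>" is a placeholder model id, not taken from this commit.
pipe = BriaPipeline.from_pretrained("<org>/<checkpoint>", torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = pipe(
    prompt="a photo of a red bicycle leaning on a brick wall",
    num_inference_steps=30,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
image.save("out.png")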