Upload folder using huggingface_hub
- aligner.py +861 -0
- app.py +210 -0
- empty_pooled_clip.pt +3 -0
- pipeline.py +641 -0
- requirements.txt +8 -0
- requirements.txt.py +8 -0
- text_encoder.py +1188 -0
aligner.py ADDED
@@ -0,0 +1,861 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from refiner import Qwen2Connector

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim=2560, num_heads=20):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Linear projections for Q, K, V
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)

        # Output projection
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        self.scale = self.head_dim ** -0.5

    def forward(self, x, mask=None, return_attention=True):
        """
        Args:
            x: Input tensor of shape [b, seq_len, embed_dim]
            mask: Attention mask of shape [b, seq_len], where 1 means attend, 0 means ignore
            return_attention: Whether to return attention weights

        Returns:
            output: [b, seq_len, embed_dim]
            attn_weights: [b*num_heads, seq_len, seq_len] (if return_attention=True)
        """
        b, seq_len, embed_dim = x.shape

        # Project to Q, K, V
        Q = self.q_proj(x)  # [b, seq_len, embed_dim]
        K = self.k_proj(x)  # [b, seq_len, embed_dim]
        V = self.v_proj(x)  # [b, seq_len, embed_dim]

        # Reshape and transpose for multi-head attention:
        # [b, seq_len, embed_dim] -> [b, seq_len, num_heads, head_dim] -> [b, num_heads, seq_len, head_dim]
        Q = Q.view(b, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(b, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(b, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Reshape for batch computation: [b, num_heads, seq_len, head_dim] -> [b*num_heads, seq_len, head_dim]
        Q = Q.reshape(b * self.num_heads, seq_len, self.head_dim)
        K = K.reshape(b * self.num_heads, seq_len, self.head_dim)
        V = V.reshape(b * self.num_heads, seq_len, self.head_dim)

        # Compute attention scores: Q @ K^T
        attn_scores = torch.bmm(Q, K.transpose(1, 2)) * self.scale  # [b*num_heads, seq_len, seq_len]

        # Apply mask if provided
        if mask is not None:
            # Key mask (column masking): which keys can be attended to
            key_mask = mask.unsqueeze(1).unsqueeze(2)  # [b, 1, 1, seq_len]

            # Query mask (row masking): which queries are valid
            query_mask = mask.unsqueeze(1).unsqueeze(3)  # [b, 1, seq_len, 1]

            # Combine both masks: a position can attend only if BOTH query and key are valid
            # Shape: [b, 1, seq_len, seq_len]
            final_mask = query_mask.bool() & key_mask.bool()  # Broadcasting handles the dimensions

            # Expand to all heads and reshape
            final_mask = final_mask.expand(b, self.num_heads, seq_len, seq_len)
            final_mask = final_mask.reshape(b * self.num_heads, seq_len, seq_len)

            attn_scores = attn_scores.masked_fill(~final_mask, float('-inf'))

        # Apply softmax
        attn_weights = F.softmax(attn_scores, dim=-1)  # [b*num_heads, seq_len, seq_len]

        # Handle NaN from softmax (when an entire row is -inf)
        attn_weights = torch.nan_to_num(attn_weights, nan=0.0)

        # Apply attention to values
        attn_output = torch.bmm(attn_weights, V)  # [b*num_heads, seq_len, head_dim]

        # Reshape back: [b*num_heads, seq_len, head_dim] -> [b, num_heads, seq_len, head_dim]
        attn_output = attn_output.view(b, self.num_heads, seq_len, self.head_dim)

        # Transpose and reshape: [b, num_heads, seq_len, head_dim] -> [b, seq_len, num_heads, head_dim] -> [b, seq_len, embed_dim]
        attn_output = attn_output.transpose(1, 2).contiguous().view(b, seq_len, embed_dim)

        # Final output projection
        output = self.out_proj(attn_output)  # [b, seq_len, embed_dim]

        if return_attention:
            return output, attn_weights  # attn_weights is [b*num_heads, seq_len, seq_len]
        else:
            return output

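
# Example (illustrative sketch, not part of the original file): a sanity check
# of the masking semantics above. Padded keys receive zero attention weight,
# and fully masked query rows are cleaned up by nan_to_num.
# attn_check = MultiHeadSelfAttention(embed_dim=2560, num_heads=20)
# x = torch.randn(2, 8, 2560)
# mask = torch.ones(2, 8)
# mask[:, 6:] = 0  # last two tokens are padding
# out, weights = attn_check(x, mask=mask, return_attention=True)
# assert out.shape == (2, 8, 2560)
# assert weights.shape == (2 * 20, 8, 8)
# assert torch.allclose(weights[..., 6:].sum(), torch.tensor(0.0))  # padded keys get no weight
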
class ConceptAligner222(nn.Module):
    def __init__(self, custom_pool=1, input_dim=2560, hidden_size=2560):
        super().__init__()
        if input_dim == 2560:
            hidden_size = 2560
            self.num_heads = 20
            self.model_class = 'gemma3'
            depth = 2
            identity_mapping = False

        elif input_dim == 4096:
            hidden_size = 3072
            self.num_heads = 24
            self.model_class = 't5'
            depth = 1
            identity_mapping = True

        self.text_connector = Qwen2Connector(in_channels=input_dim, hidden_size=hidden_size, heads_num=self.num_heads,
                                             depth=depth, identity_init=identity_mapping)
        self.final_proj = nn.Sequential(nn.Linear(hidden_size, 4096), nn.SiLU(), nn.Linear(4096, 4096))
        self.resampler = MultiHeadSelfAttention(embed_dim=hidden_size, num_heads=self.num_heads)
        empty_pooled_clip = torch.load('empty_pooled_clip.pt', map_location='cpu')
        self.register_buffer('empty_pooled_clip', empty_pooled_clip)
        self.learnable_scale_norm = nn.Parameter(torch.ones([1, 1, 1]) * 0.01, requires_grad=True)
        self.proj_norm = nn.LayerNorm(hidden_size)
        self.custom_pool = custom_pool
        if self.custom_pool:
            self.clip_proj = nn.Sequential(nn.Linear(hidden_size, hidden_size * 3), nn.SiLU(),
                                           nn.Linear(hidden_size * 3, 768))
            self.clip_norm = nn.LayerNorm(768)
            print('Using custom pooling for CLIP features.')

    @property
    def dtype(self):
        """Return the dtype of the model parameters."""
        # return next(self.parameters()).dtype
        return torch.bfloat16

    @property
    def device(self):
        """Return the device of the model parameters."""
        # return next(self.parameters()).device
        return self.empty_pooled_clip.device

    def forward(self, text_features, text_mask, is_training=False, img_seq_len=1024):
        # NOTE: model_class is set to 'gemma3' or 't5' in __init__, so the
        # comparison against 'gemma' below never matches and mean_start_id is always 0.
        text_features = self.text_connector(text_features, mask=text_mask,
                                            mean_start_id=2 if self.model_class == 'gemma' else 0)
        text_features = self.proj_norm(text_features)
        aligned_features, attn = self.resampler(text_features, mask=text_mask, return_attention=True)
        if is_training:
            learnable_scale = torch.clip(self.learnable_scale_norm, -1.0, 1.0)
            visual_concepts = aligned_features + learnable_scale * torch.randn_like(aligned_features)
        else:
            visual_concepts = aligned_features
        prompt_embeds = self.final_proj(visual_concepts)
        # prompt_embeds = text_features
        if self.custom_pool:
            mean_features = (aligned_features * text_mask.unsqueeze(-1)).sum(dim=1) / (
                    text_mask.sum(dim=1, keepdim=True) + 1e-8)
            pooled_prompt_embeds = self.clip_proj(mean_features)
            pooled_prompt_embeds = self.clip_norm(pooled_prompt_embeds)
        else:
            pooled_prompt_embeds = self.empty_pooled_clip.expand(text_features.shape[0], -1)
        dtype = prompt_embeds.dtype
        device = prompt_embeds.device
        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)

        total_seq_len = img_seq_len + prompt_embeds.shape[1]
        text_seq_len = text_mask.shape[1]
        attention_mask = torch.zeros(
            len(text_features), 1, 1, total_seq_len,
            device=text_mask.device,
            dtype=text_mask.dtype
        )
        # Fill in the text portion: where text_mask == 0, push scores toward -inf
        attention_mask[:, :, :, :text_seq_len] = (1 - text_mask).unsqueeze(1).unsqueeze(2) * -10000.0

        entropy = -(attn * torch.log(attn + 1e-8)).sum(dim=-1)
        mask_expanded = text_mask.unsqueeze(1).repeat(1, self.num_heads, 1)
        mask_expanded = mask_expanded.reshape(len(text_features) * self.num_heads, text_seq_len)
        valid_entropy = entropy[mask_expanded.bool()]

        return prompt_embeds, attention_mask, pooled_prompt_embeds, text_ids, valid_entropy
        # return prompt_embeds, pooled_prompt_embeds, text_ids, None

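
# Example (illustrative sketch, not part of the original file): the masked mean
# pooling used by the custom_pool branch above, shown standalone. Padding
# positions contribute nothing, and 1e-8 guards against an all-zero mask.
# feats = torch.randn(2, 5, 16)
# mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], dtype=feats.dtype)
# mean_feats = (feats * mask.unsqueeze(-1)).sum(dim=1) / (mask.sum(dim=1, keepdim=True) + 1e-8)
# assert torch.allclose(mean_feats[0], feats[0, :3].mean(dim=0), atol=1e-5)  # row 0 averages 3 valid tokens
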
class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


class AdaLayerNorm(nn.Module):
    def __init__(self, embedding_dim: int, time_embedding_dim=4096):
        super().__init__()

        if time_embedding_dim is None:
            time_embedding_dim = embedding_dim

        self.silu = nn.SiLU()
        self.linear = nn.Linear(time_embedding_dim, 2 * embedding_dim, bias=True)
        nn.init.normal_(self.linear.weight, mean=0, std=0.02)
        nn.init.zeros_(self.linear.bias)
        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)

    def forward(
            self, x: torch.Tensor, timestep_embedding: torch.Tensor
    ) -> torch.Tensor:
        emb = self.linear(self.silu(timestep_embedding))
        shift, scale = emb.unsqueeze(1).chunk(2, dim=-1)
        x = self.norm(x) * (1 + scale) + shift
        return x

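
# Example (illustrative sketch, not part of the original file): RMSNorm rescales
# each vector to unit RMS before the learned gain; unlike LayerNorm it does not
# subtract the mean.
# rms = RMSNorm(dim=8)
# v = torch.randn(4, 8)
# row_rms = rms(v).pow(2).mean(-1).sqrt()
# assert torch.allclose(row_rms, torch.ones(4), atol=1e-3)  # default weight is ones
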
class GateMLP(nn.Module):
    def __init__(self, gate_mode='soft', input_dim=64, hidden_dim=1024):
        super().__init__()
        self.gate_mode = gate_mode
        # NOTE: the clamp below is immediately overridden, so hidden_dim is
        # effectively fixed at 512 regardless of the argument.
        hidden_dim = max(input_dim, min(hidden_dim, 512))
        hidden_dim = 512
        self.input_norm = nn.LayerNorm(4096)

        self.norm0 = nn.LayerNorm(input_dim)
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.activation1 = nn.GELU()
        self.linear2 = nn.Linear(hidden_dim + 4096, hidden_dim)
        self.activation2 = nn.GELU()
        self.linear3 = nn.Linear(hidden_dim + 4096, hidden_dim)
        self.activation3 = nn.GELU()
        self.final_linear = nn.Linear(hidden_dim, 1)

        nn.init.xavier_uniform_(self.linear1.weight)
        nn.init.zeros_(self.linear1.bias)

        nn.init.xavier_uniform_(self.linear2.weight)
        nn.init.zeros_(self.linear2.bias)

        nn.init.xavier_uniform_(self.linear3.weight)
        nn.init.zeros_(self.linear3.bias)

        nn.init.zeros_(self.final_linear.weight)
        bias_val = 0.0 if 'soft' in gate_mode else 1.0
        nn.init.constant_(self.final_linear.bias, bias_val)

    def forward(self, x):
        # x: [b, num_heads, seq_len, head_dim]; y pools all heads into a
        # [b, seq_len, num_heads*head_dim] context, broadcast back to every head.
        y = x.transpose(1, 2).flatten(2)
        y = self.input_norm(y.detach()).unsqueeze(1).repeat(1, x.shape[1], 1, 1)
        x = self.linear1(self.norm0(x.detach()))
        x = self.activation1(x)
        x = self.linear2(torch.cat([x, y], dim=-1))
        x = self.activation2(x)
        x = self.linear3(torch.cat([x, y], dim=-1))
        x = self.activation3(x)
        x = self.final_linear(x)
        return x

class CrossAttentionWithInfluence(nn.Module):
    def __init__(self, d_model=4096, num_heads=32, gate_mode='hard'):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.gate_mode = gate_mode

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Linear projections for Q, K, V (Q/K are disabled; only V is used)
        # self.q_proj = nn.Linear(d_model, d_model)
        # self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        # nn.init.normal_(self.q_proj.weight, mean=0, std=0.02)
        # nn.init.normal_(self.k_proj.weight, mean=0, std=0.02)
        # nn.init.zeros_(self.q_proj.bias)
        # nn.init.zeros_(self.k_proj.bias)
        nn.init.eye_(self.out_proj.weight)
        nn.init.zeros_(self.out_proj.bias)
        nn.init.eye_(self.v_proj.weight)
        nn.init.zeros_(self.v_proj.bias)

        self.mask_mlp = GateMLP(input_dim=d_model // num_heads, hidden_dim=1024, gate_mode=gate_mode)

        self.scale = self.head_dim ** -0.5
        # self.learnable_scale_norm = nn.Parameter(torch.ones([1, 1, 1, 1]) * 0.01, requires_grad=True)

        self.rec_mlp = nn.Sequential(nn.Linear(4096, 4096), nn.SiLU(),
                                     nn.Linear(4096, 4096), nn.SiLU(),
                                     nn.Linear(4096, 4096)
                                     )

    def forward(self, x, y, y_mask, temperature=None, threshold=None, topk=None):
        """
        Args:
            x: shared embedding [b, 300, 4096]
            y: changing embedding [b, 300, 4096]

        Returns:
            output: [b, 300, 4096]
            influence: [b, 300, num_heads] - per-head gate for each token
            meaningful_gates, soft_meaningful_gate: gate values at valid (masked-in) positions
            rec_diagonal, tgt_diagonal: reconstruction pair (predicted vs. target)
        """
        b, seq_len_x, d_model = x.shape
        b, seq_len_y, d_model_y = y.shape

        # Disabled cross-attention path, kept for reference:
        # Q = self.q_proj(x)  # [b, 300, 4096]
        # K = self.k_proj(x)  # [b, 300, 4096]
        # Q = Q.view(b, Q.shape[1], self.num_heads, self.head_dim).transpose(1, 2)  # [b, 32, 300, 128]
        # K = K.view(b, K.shape[1], self.num_heads, self.head_dim).transpose(1, 2)  # [b, 32, 300, 128]

        V = self.v_proj(y)  # [b, 300, 4096]
        shared_V = self.v_proj(x)  # [b, 300, 4096]

        textual_concepts = V.view(b, V.shape[1], self.num_heads, self.head_dim).transpose(1, 2)  # [b, 32, 300, 128]
        shared_concepts = shared_V.view(b, shared_V.shape[1], self.num_heads, self.head_dim).transpose(1, 2)  # [b, 32, 300, 128]
        expand_y_mask = y_mask.unsqueeze(1).unsqueeze(-1)  # [b, 1, 300, 1]

        # Disabled attention-score computation, kept for reference:
        # attn_scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale  # [b, 32, 300, 300]
        # attn_weights = F.softmax(attn_scores, dim=-1)  # [b, 32, 300, 300]
        # attn_output = torch.matmul(attn_weights, textual_concepts)  # [b, 32, 300, 128]

        diagonal_influence = self.mask_mlp(textual_concepts)
        if 'soft' in self.gate_mode:
            diagonal_influence = 2 * torch.sigmoid(diagonal_influence * temperature)  # [b, 32, 300, 1]
            diagonal_influence = (diagonal_influence > 0.1).to(
                diagonal_influence.dtype) * diagonal_influence  # Thresholding
            soft_influence = diagonal_influence
        else:
            soft_influence = torch.sigmoid(diagonal_influence * temperature)
            if threshold is None:
                threshold = 0.5
            else:
                print('Using custom threshold for influence gating:', threshold)
            hard_influence = (soft_influence >= threshold)
            diagonal_influence = hard_influence + soft_influence - soft_influence.detach()  # Straight-through estimator

        if topk is not None:
            print(diagonal_influence.shape, ' <<< shape before topk ')
            top_k_values, top_k_indices = torch.topk(diagonal_influence, topk, dim=1)
            result = torch.zeros_like(diagonal_influence)
            result.scatter_(1, top_k_indices, top_k_values)
            diagonal_influence = result
            print('Applied top-k sparsification on influence gates with k=', topk)

        # Blend the changing and shared concepts per head according to the gate
        diagonal_output = textual_concepts * diagonal_influence + shared_concepts * (
                1 - diagonal_influence)  # [b, 32, 300, 128]
        da, db, dc, dd = diagonal_output.shape
        rec_diagonal = self.rec_mlp(diagonal_output.transpose(1, 2).flatten(2)[y_mask.bool()].to(x.dtype))
        tgt_diagonal = y[y_mask.bool()]

        diagonal_output = expand_y_mask * diagonal_output + (1 - expand_y_mask) * shared_concepts  # [b, 32, 300, 128]

        mask_bool_expanded = expand_y_mask.expand_as(diagonal_influence).bool()  # [b, 32, 300, 1]
        meaningful_gates = diagonal_influence[mask_bool_expanded]
        soft_meaningful_gate = soft_influence[mask_bool_expanded]

        # full_output = self.learnable_scale_norm * attn_output + diagonal_output  # [b, 32, 300, 128]
        full_output = diagonal_output.to(x.dtype)

        # Reshape back
        full_output = full_output.transpose(1, 2).contiguous().view(b, y.shape[1], d_model)  # [b, 300, 4096]

        # Final output projection
        output = self.out_proj(full_output)  # [b, 300, 4096]

        return output, diagonal_influence.squeeze(-1).transpose(1, 2), meaningful_gates, soft_meaningful_gate, rec_diagonal, tgt_diagonal

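
# Example (illustrative sketch, not part of the original file): the
# straight-through estimator used above. The forward pass sees the hard 0/1
# gate; the backward pass flows gradients through the soft sigmoid.
# logits = torch.randn(4, requires_grad=True)
# soft = torch.sigmoid(logits * 2.5)        # differentiable gate
# hard = (soft >= 0.5).to(soft.dtype)       # non-differentiable decision
# gate = hard + soft - soft.detach()        # value == hard, gradient == d(soft)
# gate.sum().backward()
# assert torch.equal(gate.detach(), hard)
# assert logits.grad is not None            # gradient reached the logits
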
def init_weights_gaussian(model, mean=0.0, std=0.02):
    """
    Initialize all nn.Linear layers in the model:
    - weights with Gaussian(mean, std)
    - biases to 0
    """
    for m in model.modules():
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=mean, std=std)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0.0)

class ConceptAligner(nn.Module):
    def __init__(self, per_dim=4):
        super().__init__()
        empty_pooled_clip = torch.load('empty_pooled_clip.pt', map_location='cpu')
        self.register_buffer('empty_pooled_clip', empty_pooled_clip)

        test_eps = torch.randn([1, 300, per_dim], dtype=torch.bfloat16).to('cpu') * 0.7
        self.register_buffer('test_eps', test_eps)

        self.init_proj = nn.Sequential(nn.Linear(768, 300 * 16), nn.SiLU())
        self.proj = nn.Sequential(nn.Linear(16, 1024), nn.SiLU(),
                                  nn.Linear(1024, 1024), nn.SiLU())
        self.text_proj = nn.Sequential(nn.Linear(4096, 1024), nn.SiLU(),
                                       nn.Linear(1024, 1024), nn.SiLU())
        self.proj_mu = nn.Sequential(nn.Linear(1024, per_dim))
        self.proj_logvar = nn.Sequential(nn.Linear(1024, per_dim))

        self.eps_proj = nn.Sequential(nn.Linear(per_dim, 1024), nn.SiLU(),
                                      nn.LayerNorm(1024),
                                      nn.Linear(1024, 4096))

        init_weights_gaussian(self, mean=0.0, std=0.02)
        torch.nn.init.constant_(self.eps_proj[-1].weight, 0.0)
        torch.nn.init.constant_(self.eps_proj[-1].bias, 0.0)

    @property
    def dtype(self):
        """Return the dtype of the model parameters."""
        # return next(self.parameters()).dtype
        return torch.bfloat16

    @property
    def device(self):
        """Return the device of the model parameters."""
        # return next(self.parameters()).device
        return self.empty_pooled_clip.device

    def forward(self, text_features, image_features=None, eps=None):
        # Disabled early-return shortcut, kept for reference:
        # return text_features, None, self.empty_pooled_clip.expand(text_features.shape[0], -1), torch.zeros(text_features.shape[1], 3).to(device=text_features.device, dtype=text_features.dtype), {'mu': torch.zeros([1, 300, 1], device=text_features.device, dtype=text_features.dtype), 'logvar': torch.zeros([1, 300, 1], device=text_features.device, dtype=text_features.dtype)}

        dtype = text_features.dtype
        device = text_features.device

        if image_features is not None:
            visual_hidden = self.proj(self.init_proj(image_features).view(len(image_features), 300, -1))
            text_hidden = self.text_proj(text_features.detach())
            hidden = visual_hidden - text_hidden
            mu = self.proj_mu(hidden)
            logvar = self.proj_logvar(hidden)
            eps = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)  # reparameterization trick
        else:
            if eps is None:
                eps = self.test_eps.to(device=device, dtype=dtype)
            mu = torch.zeros_like(eps)
            logvar = torch.zeros_like(eps)

        proj_eps = self.eps_proj(eps)
        prompt_embeds = text_features + proj_eps
        pooled_prompt_embeds = self.empty_pooled_clip.expand(text_features.shape[0], -1)

        text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
        aux_info = {
            'mu': mu,
            'logvar': logvar,
            'eps': eps
        }

        return prompt_embeds, None, pooled_prompt_embeds, text_ids, aux_info

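
# Example (illustrative sketch, not part of the original file): the
# reparameterization trick used in ConceptAligner.forward. Sampling
# eps = mu + sigma * noise keeps the sample differentiable w.r.t. mu and logvar.
# mu = torch.zeros(2, 3, requires_grad=True)
# logvar = torch.zeros(2, 3, requires_grad=True)
# sample = mu + torch.exp(0.5 * logvar) * torch.randn(2, 3)
# sample.sum().backward()
# assert mu.grad is not None and logvar.grad is not None
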
if __name__ == '__main__':
    from transformers import AutoProcessor
    from diffusers import FluxPipeline
    import os
    from PIL import Image

    def create_image_grid(images, cols):
        rows = (len(images) + cols - 1) // cols
        w, h = images[0].size
        grid = Image.new('RGB', (cols * w, rows * h))
        for i, img in enumerate(images):
            grid.paste(img, (i % cols * w, i // cols * h))
        return grid

    dim = 4096
    num_heads = 32
    dtype = torch.bfloat16
    model = ConceptAligner().to('cuda').to(dtype)
    x = torch.randn([5, 300, dim]).to('cuda').to(dtype)
    y = torch.randn([5, 300, dim]).to('cuda').to(dtype)
    i = torch.randn([5, 768]).to('cuda').to(dtype)
    y[1] = y[0]
    m = torch.ones([5, 300]).to('cuda').to(dtype)
    m[:3, :128] = 0
    prompt_embeds, _, pooled_prompt_embeds, text_ids, aux_info = model(x, i)
    print(prompt_embeds.shape, pooled_prompt_embeds.shape, text_ids.shape)
    print(prompt_embeds.shape, ' ', pooled_prompt_embeds.shape, ' ', text_ids.shape)
    for k in aux_info:
        print(k, aux_info[k].shape, aux_info[k].min(), aux_info[k].max(), aux_info[k].mean())

    from text_encoder import LoraT5Embedder
    from datasets import load_dataset
    dataset = load_dataset("facebook/emu_edit_test_set", split='validation[:200]')
    item = dataset[0:4]
    another_item = dataset[0:4]
    from diffusers.models.normalization import RMSNorm  # NOTE: shadows the RMSNorm defined above
    clip_processor = AutoProcessor.from_pretrained("./clip-vit-large-patch14")
    clip_images = clip_processor(images=item['image'], return_tensors="pt").pixel_values.to('cuda:0').to(dtype)
    texts = []
    texts.append("""A heartwarming 3D rendered scene of
an elderly farmer and a tiny orange
kitten. The farmer, with a gentle smile,
walks alongside the kitten in a lush,
green garden filled with thriving plants,
showcasing a fruitful harvest. The
intricate details of the overalls and the
farmer's worn, weathered face tell a
story of years spent tending to the land. the farmer is wearing a blue shirt""")
    texts.append("""A unique, intricately detailed creature
resembling a reptile, possibly a lizard or
a gecko. It has a vibrant blue and green
scaled body, with large, round, and
expressive eyes that are a deep shade of
blue. The backdrop is a
soft, blurred forest setting, suggesting a
serene and mystical ambiance. the creature is wearing a golden crown""")
    texts.append("""Deep in the enchanted forest lives a woman
who is the moon fairy. Her long blonde hair
shines in the starlight, tangled with her flowers
that glow with a soft blue glow. Her eyes are
the color of the night and shine with the magic
of the night. The fairy wears a dress made of
moon petals, woven with threads of moonlight
that shine with an iridescent glow, a crown of
stars adorns her head, shining with the light of
the full moon that illuminates the forest. Her
wings are translucent like glass, with a pale
glow reminiscent of the glow of the moon. HD,
6K, photo, cinematic, poster""")

    texts.append(
        """In the image, a fluffy white cat sits peacefully on a windowsill surrounded by potted green plants. Sunlight filters through sheer white curtains, casting soft golden patterns across its fur. The window reveals a clear blue sky outside, with the silhouettes of trees swaying gently in the distance. The cat’s posture is calm and elegant, its tail curled neatly around its paws. The atmosphere is serene and homey, capturing a tranquil afternoon moment of quiet observation.""")

    text_encoder = LoraT5Embedder(device='cuda').to(dtype)
    text_features, _, _, _, image_features, _ = text_encoder(texts, clip_images)
    print(text_features.shape, image_features.shape, ' >>>>>>>>> text input')
    images = []
    pipe = FluxPipeline.from_pretrained("./FLUX.1-dev", dtype=torch.bfloat16, text_encoder=None).to(torch.bfloat16)
    pipe.to('cuda')

    for txt_feat, img_feat in zip(text_features, image_features):
        prompt_embeds, _, pooled_prompt_embeds, text_ids, aux_info = model(txt_feat.unsqueeze(0), img_feat.unsqueeze(0))
        image = pipe(
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            height=512,
            width=512,
            guidance_scale=3.5,
            num_inference_steps=20,
            max_sequence_length=512,
            generator=torch.Generator("cuda").manual_seed(1995),
        ).images[0]
        images.append(image)

    aligned_image = create_image_grid(images, cols=len(images) // 2)
    os.makedirs('samples', exist_ok=True)
    aligned_image.save("samples/image.jpg")  # was "samples/image%.jpg", a stray format character

    raise SystemExit

    # ------------------------------------------------------------------
    # NOTE: everything below is unreachable scratch code (it follows the
    # `raise SystemExit` above) and references attributes such as
    # `model.influence_net` that ConceptAligner does not define.
    # ------------------------------------------------------------------
    influence_matrix = aux_info['influence']
    bin_influence_matrix = (influence_matrix > 0.1).float()
    mean_alive = bin_influence_matrix.sum(dim=-1).mean()
    max_alive = bin_influence_matrix.sum(dim=-1).max()
    min_alive = bin_influence_matrix.sum(dim=-1).min()
    max_token_alive = ((bin_influence_matrix.sum(dim=-1) > 0).float().sum(dim=-1)).max()
    mean_token_alive = ((bin_influence_matrix.sum(dim=-1) > 0).float().sum(dim=-1)).mean()
    min_token_alive = ((bin_influence_matrix.sum(dim=-1) > 0).float().sum(dim=-1)).min()

    print(
        f"Mean alive heads per token: {mean_alive:.2f}, Max alive heads per token: {max_alive:.2f}, Min alive heads per token: {min_alive:.2f}")
    print(
        f"Mean alive tokens: {mean_token_alive:.2f}, Max alive tokens: {max_token_alive:.2f}, Min alive tokens: {min_token_alive:.2f}")

    import os

    CHECKPOINT_PATH = 'runs/00393/checkpoint-6000'
    from safetensors.torch import load_file

    # Load adapter (model_1.safetensors)
    adapter_path = os.path.join(CHECKPOINT_PATH, "model_1.safetensors")
    if os.path.exists(adapter_path):
        adapter_state = load_file(adapter_path)
        model.load_state_dict(adapter_state, strict=True)
        print("Adapter loaded successfully!")

    print(model.influence_net.v_proj.weight, ' <<< weight ')
    print(model.influence_net.v_proj.bias, ' <<< bias ')
    print(model.influence_net.out_proj.weight, ' <<< out weight ')
    print(model.influence_net.out_proj.bias, ' <<< out bias ')
    print(model.influence_net.mask_mlp.linear3.weight, ' <<< gate weight 3 ')
    print(model.influence_net.mask_mlp.linear3.bias, ' <<< gate bias ')

    z = torch.randn([3, num_heads, 300, 4096 // num_heads]).to('cuda').to(dtype)
    gate_values = model.influence_net.mask_mlp(z)
    gate_values = 2 * torch.sigmoid(gate_values)

    print(gate_values, ' <<< gate values ', gate_values.shape, ' ', torch.mean(gate_values))

    from diffusers import FluxPipeline
    from PIL import Image

    reserved_memory = torch.cuda.memory_reserved(0) / (1024 ** 3)
    print(f"Reserved GPU memory: {reserved_memory:.2f} GB")

    from transformers import T5EncoderModel, T5Tokenizer, CLIPTokenizer, CLIPTextModel
    import torch
    from text_encoder import LoraT5Embedder

    text_encoder = LoraT5Embedder(device='cuda').to(torch.bfloat16)
    texts = []
    texts.append("""A heartwarming 3D rendered scene of
an elderly farmer and a tiny orange
kitten. The farmer, with a gentle smile,
walks alongside the kitten in a lush,
green garden filled with thriving plants,
showcasing a fruitful harvest. The
intricate details of the overalls and the
farmer's worn, weathered face tell a
story of years spent tending to the land. the farmer is wearing a blue shirt""")
    texts.append("""A unique, intricately detailed creature
resembling a reptile, possibly a lizard or
a gecko. It has a vibrant blue and green
scaled body, with large, round, and
expressive eyes that are a deep shade of
blue. The backdrop is a
soft, blurred forest setting, suggesting a
serene and mystical ambiance. the creature is wearing a golden crown""")
    texts.append("""Deep in the enchanted forest lives a woman
who is the moon fairy. Her long blonde hair
shines in the starlight, tangled with her flowers
that glow with a soft blue glow. Her eyes are
the color of the night and shine with the magic
of the night. The fairy wears a dress made of
moon petals, woven with threads of moonlight
that shine with an iridescent glow, a crown of
stars adorns her head, shining with the light of
the full moon that illuminates the forest. Her
wings are translucent like glass, with a pale
glow reminiscent of the glow of the moon. HD,
6K, photo, cinematic, poster""")

    texts.append(
        """In the image, a fluffy white cat sits peacefully on a windowsill surrounded by potted green plants. Sunlight filters through sheer white curtains, casting soft golden patterns across its fur. The window reveals a clear blue sky outside, with the silhouettes of trees swaying gently in the distance. The cat’s posture is calm and elegant, its tail curled neatly around its paws. The atmosphere is serene and homey, capturing a tranquil afternoon moment of quiet observation.""")
    texts.append(
        """In the image, a majestic white horse gallops across a misty meadow at sunrise. Its mane and tail flow freely in the golden light, and the air glows softly with early morning haze. The horse’s body is bare, revealing the natural curve of its muscles and the sheen of its coat. Dew sparkles on the grass beneath its hooves, and the distant trees fade into pale gold mist. The scene conveys freedom, grace, and quiet power.""")
    INDEX = 0
    text = texts[INDEX]
    with torch.no_grad():
        floral_embeds, _, _, _, _, attn_mask = text_encoder(text)
        print(attn_mask.shape, ' >>>> ', attn_mask)
        print(floral_embeds.shape, shared_embeds.shape, ' >>>> floral ')  # NOTE: shared_embeds is undefined at this point
        nopad_floral_embeds, nopad_shared_embeds, nopad_attn_mask = text_encoder(text, padding=False)
        print(floral_embeds.shape, shared_embeds.shape, ' >>>> floral ')

    """
    _, _, _, _, aux_info = model(floral_embeds, shared_embeds, attn_mask, is_training=False)
    print(aux_info['meaningful_influence'].shape, ' <<< influence shape ', aux_info['meaningful_influence'][:100], ' ', torch.mean(aux_info['meaningful_influence']))
    floral_embeds, shared_embeds, attn_mask = text_encoder([""], padding='max_length')
    _, _, _, _, aux_info = model(floral_embeds, shared_embeds, attn_mask, is_training=False)
    print(aux_info['meaningful_influence'].shape, ' <<< empty influence shape ', aux_info['meaningful_influence'], ' ', torch.mean(aux_info['meaningful_influence']))
    raise SystemExit
    """

    text2s = []
    text2s.append("""A heartwarming 3D rendered scene of
an elderly farmer and a tiny orange
kitten. The farmer, with a gentle smile,
walks alongside the kitten in a lush,
green garden filled with thriving plants,
showcasing a fruitful harvest. The
intricate details of the overalls and the
farmer's worn, weathered face tell a
story of years spent tending to the land. the farmer is wearing a red shirt""")
    text2s.append("""A unique, intricately detailed creature
resembling a reptile, possibly a lizard or
a gecko. It has a vibrant blue and green
scaled body, with large, round, and
expressive eyes that are a deep shade of
blue. The backdrop is a
soft, blurred forest setting, suggesting a
serene and mystical ambiance. the creature is wearing a floral crown""")
    text2s.append("""Deep in the enchanted forest lives a woman
who is the moon fairy. Her long black hair
shines in the starlight, tangled with her flowers
that glow with a soft blue glow. Her eyes are
the color of the night and shine with the magic
of the night. The fairy wears a dress made of
moon petals, woven with threads of moonlight
that shine with an iridescent glow, a crown of
stars adorns her head, shining with the light of
the full moon that illuminates the forest. Her
wings are translucent like glass, with a pale
glow reminiscent of the glow of the moon. HD,
6K, photo, cinematic, poster""")
    text2s.append(
        """In the image, a fluffy white cat sits peacefully on a windowsill surrounded by potted green plants. Sunlight filters through sheer white curtains, casting soft golden patterns across its fur. The window reveals a gray, rainy sky outside, with raindrops streaking down the glass and blurred trees beyond. The cat’s posture is calm and elegant, its tail curled neatly around its paws. The atmosphere is serene and introspective, capturing a cozy moment of quiet observation during a rainy afternoon.""")
    text2s.append(
        """In the image, a majestic white horse gallops across a misty meadow at sunrise. Its mane and tail flow freely in the golden light, and the air glows softly with early morning haze. The horse’s body is adorned with a bright red saddle, contrasting sharply against its white coat. Dew sparkles on the grass beneath its hooves, and the distant trees fade into pale gold mist. The scene conveys freedom, grace, and a striking touch of color that adds visual drama.""")
    text2 = text2s[INDEX]

    with torch.no_grad():
        golden_embeds, shared_embeds, golden_mask = text_encoder(text2, padding='max_length')
        print(golden_embeds.shape, shared_embeds.shape, ' >>>> golden ')
        nopad_golden_embeds, nopad_shared_embeds, nopad_golden_mask = text_encoder(text2, padding=False)
        print(golden_embeds.shape, shared_embeds.shape, ' >>>> golden ')

    batch_encoding = text_encoder.t5_tokenizer(
        text,
        truncation=True,
        max_length=text_encoder.max_length,
        return_tensors="pt",
    )

    input_ids = batch_encoding["input_ids"][0]  # Get the token IDs

    # Convert token IDs back to tokens to see what they are
    tokens_floral = text_encoder.t5_tokenizer.convert_ids_to_tokens(input_ids)

    batch_encoding = text_encoder.t5_tokenizer(
        text2,
        truncation=True,
        max_length=text_encoder.max_length,
        return_tensors="pt",
    )

    input_ids = batch_encoding["input_ids"][0]  # Get the token IDs
    tokens_golden = text_encoder.t5_tokenizer.convert_ids_to_tokens(input_ids)

    # Find the indices of specific words
    def find_token_indices(tokens, word):
        """Find all indices where a word or its token appears."""
        indices = []
        # The T5 tokenizer might split words or add special characters
        word_token = text_encoder.t5_tokenizer.encode(word, add_special_tokens=False)[0]
        word_token_str = text_encoder.t5_tokenizer.convert_ids_to_tokens([word_token])[0]

        for i, token in enumerate(tokens):
            if token == word_token_str or word.lower() in token.lower():
                indices.append(i)
        return indices

    key1s = ['blue', 'golden', 'blonde', 'clear', 'horse']
    key2s = ['red', 'floral', 'black', 'rainy', 'red']

    # Find the last index for the first keyword
    blue_indices = find_token_indices(tokens_floral, key1s[INDEX])[-1]
    print(f"Indices for 'blue': {blue_indices}")

    # Find the last index for the second keyword
    red_indices = find_token_indices(tokens_golden, key2s[INDEX])[-1]
    print(f"Indices for 'red': {red_indices}")

    pipe = FluxPipeline.from_pretrained("./FLUX.1-dev", dtype=torch.bfloat16, text_encoder=None).to(torch.bfloat16)
    pipe.to('cuda')
    adapter_path = os.path.join(CHECKPOINT_PATH, "model.safetensors")
    if os.path.exists(adapter_path):
        adapter_state = load_file(adapter_path)
        pipe.transformer.load_state_dict(adapter_state, strict=True)
        print("Transformer loaded successfully!")

    images = []
    empty_pooled_clip = torch.load('empty_pooled_clip.pt', map_location='cpu').to('cuda').to(torch.bfloat16)

    print("Generating image with concatenation...")
    images = []
    # for cur_prompt_embed in [floral_embeds, nopad_floral_embeds, inter_embed, golden_embeds, nopad_golden_embeds]:
    # for (start_dim, end_dim) in [(0, 4096), (1024, 4096), (2048, 4096), (1024, 2048)]:

    for emb in ['floral', 'golden']:
        for temp in [2.5]:
            for thr in [-1, 0.5, 0.75, 0.85, 0.95]:
                for topk in [None]:
                    print('>>>> Temperature: ', temp, topk)
                    if 'floral' in emb:
                        inter_embed, _, _, _, new_aux_info = model(floral_embeds, shared_embeds, attn_mask,
                                                                   is_training=False, temperature=temp,
                                                                   threshold=thr, topk=topk)
                    else:
                        inter_embed, _, _, _, new_aux_info = model(golden_embeds, shared_embeds, golden_mask,
                                                                   is_training=False, temperature=temp,
                                                                   threshold=thr, topk=topk)

                    print(new_aux_info['influence'][:, blue_indices].shape, ' >>>> influence ',
                          new_aux_info['influence'][:, blue_indices])
                    print(new_aux_info['meaningful_influence'], ' >>>> meaningful influence ',
                          torch.mean(new_aux_info['meaningful_influence']))

                    # inter_embed = torch.clone(floral_embeds)
                    # inter_embed[:, blue_indices] = shared_embeds[:, blue_indices]
                    # inter_embed[:, blue_indices, start_dim:end_dim] = floral_embeds[:, blue_indices, start_dim:end_dim]

                    image = pipe(
                        prompt_embeds=inter_embed,
                        pooled_prompt_embeds=empty_pooled_clip,
                        height=512,
                        width=512,
                        guidance_scale=3.5,
                        num_inference_steps=20,
                        max_sequence_length=512,
                        generator=torch.Generator("cuda").manual_seed(1995),
                    ).images[0]
                    images.append(image)
    aligned_image = create_image_grid(images, cols=len(images) // 2)
    os.makedirs('samples', exist_ok=True)
    aligned_image.save("samples/image%s.jpg" % INDEX)
app.py ADDED
@@ -0,0 +1,210 @@
| 1 |
+
import torch
|
| 2 |
+
import os
|
| 3 |
+
from huggingface_hub import hf_hub_download
|
| 4 |
+
from safetensors.torch import load_file
|
| 5 |
+
from aligner import ConceptAligner
|
| 6 |
+
from text_encoder import LoraT5Embedder
|
| 7 |
+
from pipeline import CustomFluxKontextPipeline
|
| 8 |
+
from diffusers import FluxTransformer2DModel, FlowMatchEulerDiscreteScheduler, AutoencoderKL
|
| 9 |
+
from peft import LoraConfig
|
| 10 |
+
import gradio as gr
|
| 11 |
+
|
| 12 |
+
# Configuration
|
| 13 |
+
MODEL_REPO = "Shaoan/ConceptAligner-Weights" # Your model repo
|
| 14 |
+
CHECKPOINT_DIR = "./checkpoint"
|
| 15 |
+
|
| 16 |
+
def download_checkpoint():
|
| 17 |
+
"""Download checkpoint files from HF model repo"""
|
| 18 |
+
print("Downloading checkpoint files...")
|
| 19 |
+
|
| 20 |
+
files = [
|
| 21 |
+
"model.safetensors",
|
| 22 |
+
"model_1.safetensors",
|
| 23 |
+
"model_2.safetensors"
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
|
| 27 |
+
|
| 28 |
+
for filename in files:
|
| 29 |
+
local_path = os.path.join(CHECKPOINT_DIR, filename)
|
| 30 |
+
if not os.path.exists(local_path):
|
| 31 |
+
print(f" Downloading {filename}...")
|
| 32 |
+
hf_hub_download(
|
| 33 |
+
repo_id=MODEL_REPO,
|
| 34 |
+
filename=filename,
|
| 35 |
+
local_dir=CHECKPOINT_DIR,
|
| 36 |
+
local_dir_use_symlinks=False
|
| 37 |
+
)
|
| 38 |
+
print(f" ✓ {filename} downloaded")
|
| 39 |
+
|
| 40 |
+
print("✓ All checkpoint files ready!")
|
| 41 |
+
|
| 42 |
+
class ConceptAlignerModel:
|
| 43 |
+
def __init__(self):
|
| 44 |
+
# Download checkpoint first
|
| 45 |
+
download_checkpoint()
|
| 46 |
+
|
| 47 |
+
self.checkpoint_path = CHECKPOINT_DIR
|
| 48 |
+
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 49 |
+
self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 50 |
+
|
| 51 |
+
self.previous_image = None
|
| 52 |
+
self.previous_prompt = None
|
| 53 |
+
|
| 54 |
+
print(f"\n{'='*60}")
|
| 55 |
+
print(f"Loading ConceptAligner Model")
|
| 56 |
+
print(f"Device: {self.device}")
|
| 57 |
+
print(f"{'='*60}")
|
| 58 |
+
|
| 59 |
+
self.setup_models()
|
| 60 |
+
|
| 61 |
+
def setup_models(self):
|
| 62 |
+
"""Load all models"""
|
| 63 |
+
# Load ConceptAligner
|
| 64 |
+
print(f" Loading ConceptAligner...")
|
| 65 |
+
self.model = ConceptAligner().to(self.device).to(self.dtype)
|
| 66 |
+
adapter_path = os.path.join(self.checkpoint_path, "model_1.safetensors")
|
| 67 |
+
adapter_state = load_file(adapter_path)
|
| 68 |
+
self.model.load_state_dict(adapter_state, strict=True)
|
| 69 |
+
print(f" ✓ Adapter loaded")
|
| 70 |
+
|
| 71 |
+
# Load T5 encoder
|
| 72 |
+
print(f" Loading T5 encoder...")
|
| 73 |
+
self.text_encoder = LoraT5Embedder(device=self.device).to(self.dtype)
|
| 74 |
+
adapter_path = os.path.join(self.checkpoint_path, "model_2.safetensors")
|
| 75 |
+
adapter_state = load_file(adapter_path)
|
| 76 |
+
if "t5_encoder.shared.weight" in adapter_state and "t5_encoder.encoder.embed_tokens.weight" not in adapter_state:
|
| 77 |
+
adapter_state["t5_encoder.encoder.embed_tokens.weight"] = adapter_state["t5_encoder.shared.weight"]
|
| 78 |
+
        self.text_encoder.load_state_dict(adapter_state, strict=True)
        print(f" ✓ T5 Adapter loaded")

        # Load VAE
        print(f" Loading VAE...")
        vae = AutoencoderKL.from_pretrained(
            'black-forest-labs/FLUX.1-dev',
            subfolder="vae",
            torch_dtype=self.dtype
        ).to(self.device)

        # Load transformer
        print(f" Loading transformer...")
        transformer = FluxTransformer2DModel.from_pretrained(
            'black-forest-labs/FLUX.1-dev',
            subfolder="transformer",
            torch_dtype=self.dtype
        )

        target_modules = [
            "attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0",
            "attn.add_k_proj", "attn.add_q_proj", "attn.add_v_proj", "attn.to_add_out",
            "ff.net.0.proj", "ff.net.2", "ff_context.net.0.proj", "ff_context.net.2",
            "proj_mlp", "proj_out", "norm.linear", "norm1.linear"
        ]

        transformer_lora_config = LoraConfig(
            r=256,
            lora_alpha=256,
            lora_dropout=0.0,
            init_lora_weights="gaussian",
            target_modules=target_modules,
        )
        transformer.add_adapter(transformer_lora_config)
        transformer.context_embedder.requires_grad_(True)

        # Load fine-tuned transformer
        transformer_path = os.path.join(self.checkpoint_path, "model.safetensors")
        transformer_state = load_file(transformer_path)
        transformer.load_state_dict(transformer_state, strict=True)
        print(f" ✓ Fine-tuned transformer loaded")

        transformer = transformer.to(self.device)

        # Load or download empty pooled clip
        empty_clip_path = "empty_pooled_clip.pt"
        if not os.path.exists(empty_clip_path):
            print(" Downloading empty_pooled_clip.pt...")
            hf_hub_download(
                repo_id=MODEL_REPO,
                filename="empty_pooled_clip.pt",
                local_dir=".",
                local_dir_use_symlinks=False
            )

        self.empty_pooled_clip = torch.load(empty_clip_path, map_location=self.device).to(self.dtype)

        # Create pipeline
        noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
            'black-forest-labs/FLUX.1-dev', subfolder="scheduler"
        )

        self.pipe = CustomFluxKontextPipeline(
            scheduler=noise_scheduler,
            aligner=self.model.to(self.device).to(self.dtype),
            transformer=transformer.to(self.device).to(self.dtype),
            vae=vae.to(self.device).to(self.dtype),
            text_embedder=self.text_encoder.to(self.device).to(self.dtype),
        ).to(self.device)

        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated(0) / 1024**3
            reserved = torch.cuda.memory_reserved(0) / 1024**3
            print(f" ✓ Pipeline ready on {self.device}")
            print(f" 📊 GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")
        else:
            print(f" ✓ Pipeline ready on {self.device}")

    @torch.no_grad()
    def generate_image(
        self,
        prompt,
        threshold=0.0,
        topk=0,
        height=512,
        width=512,
        guidance_scale=3.5,
        true_cf_scale=1.0,
        num_inference_steps=20,
        seed=1995
    ):
        """Generate image and return previous + current for comparison"""
        if not prompt.strip():
            return self.previous_image, None, self.previous_prompt or ""

        try:
            generator = torch.Generator(device=self.device).manual_seed(int(seed))

            current_image = self.pipe(
                prompt=prompt,
                guidance_scale=guidance_scale,
                true_cfg_scale=true_cf_scale,
                max_sequence_length=512,
                num_inference_steps=num_inference_steps,
                height=height,
                width=width,
                generator=generator,
            ).images[0]

            prev_image = self.previous_image
            prev_prompt = self.previous_prompt or "No previous generation"

            self.previous_image = current_image
            self.previous_prompt = prompt

            return prev_image, current_image, prev_prompt

        except Exception as e:
            import traceback
            error_msg = f"❌ Error: {str(e)}\n{traceback.format_exc()}"
            print(error_msg)
            return self.previous_image, None, self.previous_prompt or ""

    def reset_history(self):
        """Clear generation history"""
        self.previous_image = None
        self.previous_prompt = None
        return None, None, "No previous generation"


# Initialize model
print("Initializing ConceptAligner model...")
model = ConceptAlignerModel()
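A minimal smoke-test sketch for the handler defined above (the prompt string is illustrative; `model` is the object instantiated on the last line, and `generate_image` returns the previous image, the current image, and the previous prompt, so the "previous" slot is None on the first call):

    # Illustrative only; not part of app.py.
    prev, cur, prev_prompt = model.generate_image(
        prompt="a red bicycle leaning against a brick wall",
        height=512, width=512,
        guidance_scale=3.5, num_inference_steps=20, seed=1995,
    )
    if cur is not None:
        cur.save("sample.png")  # current generation; prev is None on the first run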
empty_pooled_clip.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92acbe688a00c835deb9b645fe673e16af2ceef9cd749a8b838e67dea23d76b2
size 3183
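The LFS pointer above tracks a small tensor, plausibly the pooled CLIP text embedding of the empty prompt (FLUX takes its pooled projection from CLIP ViT-L/14, whose pooled output is 768-dimensional). A minimal sketch of how such a file could be produced under that assumption; the checkpoint in this repo was uploaded as-is, so this is illustrative rather than the author's actual script:

    # Illustrative sketch: pooled CLIP-L embedding of the empty string.
    import torch
    from transformers import CLIPTextModel, CLIPTokenizer

    tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    enc = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
    with torch.no_grad():
        ids = tok([""], padding="max_length", max_length=77, return_tensors="pt").input_ids
        pooled = enc(ids).pooler_output  # shape (1, 768)
    torch.save(pooled, "empty_pooled_clip.pt")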
pipeline.py
ADDED
@@ -0,0 +1,641 @@
import inspect
from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
import torch
from transformers import (
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
    T5EncoderModel,
    T5TokenizerFast,
)
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, FluxTransformer2DModel
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import (
    USE_PEFT_BACKEND,
    deprecate,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
    scale_lora_layers,
    unscale_lora_layers,
)
from diffusers.utils.torch_utils import randn_tensor
from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
from diffusers import FluxKontextPipeline

PREFERRED_KONTEXT_RESOLUTIONS = [
    (672, 1568),
    (688, 1504),
    (720, 1456),
    (752, 1392),
    (800, 1328),
    (832, 1248),
    (880, 1184),
    (944, 1104),
    (1024, 1024),
    (1104, 944),
    (1184, 880),
    (1248, 832),
    (1328, 800),
    (1392, 752),
    (1456, 720),
    (1504, 688),
    (1568, 672),
]


def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu
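A quick sanity check of this shift schedule: with the defaults above, a 1024x1024 generation packs to 64x64 = 4096 latent tokens and gets mu = max_shift = 1.15, while a 512x512 generation (1024 tokens) gets mu = 0.5 + 0.65/3840 * 768 = 0.63. As an illustrative check (not part of the pipeline):

    assert abs(calculate_shift(4096) - 1.15) < 1e-6   # 1024x1024 -> 64x64 packed tokens
    assert abs(calculate_shift(1024) - 0.63) < 1e-6   # 512x512  -> 32x32 packed tokens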

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")

from diffusers import FluxKontextPipeline
from typing import Union, List, Optional
import torch


class CustomFluxKontextPipeline(FluxKontextPipeline):
    r"""
    Custom Flux Kontext pipeline with a wrapper text embedder.
    """

    model_cpu_offload_seq = "text_embedder->image_encoder->transformer->vae"

    def __init__(
        self,
        scheduler,
        vae,
        text_embedder,  # Your custom text embedder wrapper
        transformer,
        aligner,
        image_encoder=None,
        feature_extractor=None,
    ):
        # Don't call super().__init__() since parent expects text_encoder parameters
        # Instead, manually register modules
        from diffusers import DiffusionPipeline
        DiffusionPipeline.__init__(self)

        self.register_modules(
            vae=vae,
            text_embedder=text_embedder,
            transformer=transformer,
            scheduler=scheduler,
            aligner=aligner,
            image_encoder=image_encoder,
            feature_extractor=feature_extractor,
        )

        # Initialize the necessary attributes from parent
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.latent_channels = self.vae.config.latent_channels
        from diffusers.image_processor import VaeImageProcessor
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
        self.default_sample_size = 128
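For the FLUX.1-dev VAE, `block_out_channels` has four entries, so `vae_scale_factor` resolves to 2**3 = 8; the image processor is built with 16 because FLUX additionally packs 2x2 latent patches into single tokens. A quick illustrative check, assuming the VAE loaded in app.py and a constructed `pipe`:

    # Illustrative: FLUX's VAE downsamples by 8, latents are packed 2x2 into tokens.
    assert 2 ** (len(pipe.vae.config.block_out_channels) - 1) == 8
    # A 1024x1024 image -> 128x128 latents -> 64x64 = 4096 packed tokens.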
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        prompt_2: Optional[Union[str, List[str]]] = None,
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_sequence_length: int = 512,
        lora_scale: Optional[float] = None,
        temperature=None,
        threshold=None,
    ):
        device = device or self._execution_device

        if prompt_embeds is None:
            # Use your custom text embedder
            qwen_embeds, clip_image_embeds, perturbed_qwen_embeds, replace_ids, t5_tokenizer, batch_encoding = self.text_embedder(prompt)
            prompt_embeds, prompt_attention_mask, pooled_prompt_embeds, text_ids, _ = self.aligner(qwen_embeds)
            prompt_embeds = prompt_embeds.to(device=device)
            pooled_prompt_embeds = pooled_prompt_embeds.to(device=device)
            text_ids = text_ids.to(device=device)
        else:
            # When embeddings are provided, create text_ids
            dtype = self.transformer.dtype
            text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
            # No separate attention mask when precomputed embeddings are supplied
            prompt_attention_mask = None

        # Duplicate for num_images_per_prompt
        if num_images_per_prompt > 1:
            prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
            pooled_prompt_embeds = pooled_prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
            text_ids = text_ids.repeat(num_images_per_prompt, 1)

        return prompt_embeds, prompt_attention_mask, pooled_prompt_embeds, text_ids
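Unlike the stock FluxKontextPipeline.encode_prompt, which returns three values (prompt_embeds, pooled_prompt_embeds, text_ids), this override also returns the T5 attention mask so padding positions can be masked inside the transformer via joint_attention_kwargs in the denoising loop below. A rough consumer-side sketch (prompt text and names are indicative only; `pipe` is a CustomFluxKontextPipeline instance):

    embeds, attn_mask, pooled, txt_ids = pipe.encode_prompt(
        prompt="a watercolor fox",           # illustrative prompt
        device=pipe._execution_device,
    )
    # attn_mask marks real tokens vs. padding; it is later forwarded as
    # joint_attention_kwargs={'attention_mask': attn_mask} to the transformer.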
    @torch.no_grad()
    def __call__(
        self,
        image: Optional[PipelineImageInput] = None,
        prompt: Union[str, List[str]] = None,
        prompt_2: Optional[Union[str, List[str]]] = None,
        negative_prompt: Union[str, List[str]] = "",
        negative_prompt_2: Optional[Union[str, List[str]]] = None,
        true_cfg_scale: float = 1.0,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 28,
        sigmas: Optional[List[float]] = None,
        guidance_scale: float = 3.5,
        num_images_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        negative_ip_adapter_image: Optional[PipelineImageInput] = None,
        negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 512,
        max_area: int = 1024 ** 2,
        _auto_resize: bool = True,
        temperature=None,
        threshold=None,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
                numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a
                list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or
                a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image
                latents as `image`, but if passing latents directly it is not encoded again.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` will
                be used instead.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
                not greater than `1`).
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
            true_cfg_scale (`float`, *optional*, defaults to 1.0):
                When > 1.0 and a `negative_prompt` is provided, enables true classifier-free guidance.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. This is set to 1024 by default for the best results.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. This is set to 1024 by default for the best results.
            num_inference_steps (`int`, *optional*, defaults to 28):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            guidance_scale (`float`, *optional*, defaults to 3.5):
                Embedded guidance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages
                a model to generate images more aligned with the prompt at the expense of lower image quality.

                Guidance-distilled models approximate true classifier-free guidance for `guidance_scale` > 1. Refer to
                the [paper](https://huggingface.co/papers/2210.03142) to learn more.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            negative_ip_adapter_image:
                (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int` defaults to 512):
                Maximum sequence length to use with the `prompt`.
            max_area (`int`, defaults to `1024 ** 2`):
                The maximum area of the generated image in pixels. The height and width will be adjusted to fit this
                area while maintaining the aspect ratio.

        Examples:

        Returns:
            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
            images.
        """
        height = height or self.default_sample_size * self.vae_scale_factor
        width = width or self.default_sample_size * self.vae_scale_factor

        original_height, original_width = height, width
        aspect_ratio = width / height

        """
        width = round((max_area * aspect_ratio) ** 0.5)
        height = round((max_area / aspect_ratio) ** 0.5)
        multiple_of = self.vae_scale_factor * 2
        width = width // multiple_of * multiple_of
        height = height // multiple_of * multiple_of

        if height != original_height or width != original_width:
            print(
                f"Generation `height` and `width` have been adjusted to {height} and {width} to fit the model requirements."
            )
        """

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            prompt_2,
            height,
            width,
            negative_prompt=negative_prompt,
            negative_prompt_2=negative_prompt_2,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            max_sequence_length=max_sequence_length,
        )

        self._guidance_scale = guidance_scale
        self._joint_attention_kwargs = joint_attention_kwargs
        self._current_timestep = None
        self._interrupt = False

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        lora_scale = (
            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
        )
        has_neg_prompt = negative_prompt is not None or (
            negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
        )
        do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
        (
            prompt_embeds,
            prompt_attention_mask,
            pooled_prompt_embeds,
            text_ids,
        ) = self.encode_prompt(
            prompt=prompt,
            prompt_2=prompt_2,
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
            lora_scale=lora_scale,
            temperature=temperature,
            threshold=threshold,
        )
        (
            negative_prompt_embeds,
            negative_prompt_attention_mask,
            negative_pooled_prompt_embeds,
            negative_text_ids,
        ) = self.encode_prompt(
            prompt=negative_prompt,
            prompt_2=negative_prompt_2,
            prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=negative_pooled_prompt_embeds,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
            lora_scale=lora_scale,
            temperature=temperature,
            threshold=threshold,
        )

        pooled_prompt_embeds = negative_pooled_prompt_embeds

        # 3. Preprocess image
        if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
            img = image[0] if isinstance(image, list) else image
            """
            image_height, image_width = self.image_processor.get_default_height_width(img)
            aspect_ratio = image_width / image_height
            if _auto_resize:
                # Kontext is trained on specific resolutions, using one of them is recommended
                _, image_width, image_height = min(
                    (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
                )
            image_width = image_width // multiple_of * multiple_of
            image_height = image_height // multiple_of * multiple_of
            """
            image_height, image_width = original_height, original_width
            image = self.image_processor.resize(image, image_height, image_width)
            image = self.image_processor.preprocess(image, image_height, image_width)

        # 4. Prepare latent variables
        num_channels_latents = self.transformer.config.in_channels // 4
        latents, image_latents, latent_ids, image_ids = self.prepare_latents(
            image,
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )
        if image_ids is not None:
            latent_ids = torch.cat([latent_ids, image_ids], dim=0)  # dim 0 is sequence dimension

        # 5. Prepare timesteps
        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
        image_seq_len = latents.shape[1]
        mu = calculate_shift(
            image_seq_len,
            self.scheduler.config.get("base_image_seq_len", 256),
            self.scheduler.config.get("max_image_seq_len", 4096),
            self.scheduler.config.get("base_shift", 0.5),
            self.scheduler.config.get("max_shift", 1.15),
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            num_inference_steps,
            device,
            sigmas=sigmas,
            mu=mu,
        )
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        self._num_timesteps = len(timesteps)

        # handle guidance
        if self.transformer.config.guidance_embeds:
            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
            guidance = guidance.expand(latents.shape[0])
        else:
            guidance = None

        if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
            negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
        ):
            negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
            negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters

        elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
            negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
        ):
            ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
            ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters

        if self.joint_attention_kwargs is None:
            self._joint_attention_kwargs = {}

        image_embeds = None
        negative_image_embeds = None
        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
                ip_adapter_image,
                ip_adapter_image_embeds,
                device,
                batch_size * num_images_per_prompt,
            )
        if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
            negative_image_embeds = self.prepare_ip_adapter_image_embeds(
                negative_ip_adapter_image,
                negative_ip_adapter_image_embeds,
                device,
                batch_size * num_images_per_prompt,
            )

        # 6. Denoising loop
        # We set the index here to remove DtoH sync, helpful especially during compilation.
        # Check out more details here: https://github.com/huggingface/diffusers/pull/11696
        self.scheduler.set_begin_index(0)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t
                if image_embeds is not None:
                    self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds

                latent_model_input = latents
                if image_latents is not None:
                    latent_model_input = torch.cat([latents, image_latents], dim=1)
                timestep = t.expand(latents.shape[0]).to(latents.dtype)

                noise_pred = self.transformer(
                    hidden_states=latent_model_input,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_ids,
                    joint_attention_kwargs={'attention_mask': prompt_attention_mask},
                    return_dict=False,
                )[0]
                noise_pred = noise_pred[:, : latents.size(1)]

                if do_true_cfg:
                    if negative_image_embeds is not None:
                        self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
                    neg_noise_pred = self.transformer(
                        hidden_states=latent_model_input,
                        timestep=timestep / 1000,
                        guidance=guidance,
                        pooled_projections=negative_pooled_prompt_embeds,
                        encoder_hidden_states=negative_prompt_embeds,
                        txt_ids=negative_text_ids,
                        img_ids=latent_ids,
                        joint_attention_kwargs={'attention_mask': negative_prompt_attention_mask},
                        return_dict=False,
                    )[0]
                    neg_noise_pred = neg_noise_pred[:, : latents.size(1)]
                    noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)

                # compute the previous noisy sample x_t -> x_t-1
                latents_dtype = latents.dtype
                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

                if latents.dtype != latents_dtype:
                    if torch.backends.mps.is_available():
                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                        latents = latents.to(latents_dtype)

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

        self._current_timestep = None

        if output_type == "latent":
            image = latents
        else:
            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor

            dtype = torch.bfloat16
            image = self.vae.decode(latents.to(dtype), return_dict=False)[0]
            image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return FluxPipelineOutput(images=image)
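Worth noting for users of this pipeline: guidance_scale only feeds the distilled (embedded) guidance input of the FLUX transformer, while true classifier-free guidance is the explicit two-pass combination neg + true_cfg_scale * (pos - neg) above, which activates only when true_cfg_scale > 1 and a negative prompt (or negative embeddings) is supplied. An illustrative call, with example prompt strings:

    # Illustrative usage only; `pipe` is the CustomFluxKontextPipeline built in app.py.
    image = pipe(
        prompt="a portrait photo of an astronaut",
        negative_prompt="blurry, low quality",
        guidance_scale=3.5,      # embedded (distilled) guidance
        true_cfg_scale=2.0,      # enables the explicit two-pass CFG branch above
        num_inference_steps=28,
        height=1024, width=1024,
    ).images[0]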
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch>=2.0.0
gradio>=4.0.0
diffusers>=0.27.0
transformers>=4.38.0
safetensors>=0.4.0
accelerate>=0.26.0
peft>=0.8.0
Pillow>=10.0.0
requirements.txt.py
ADDED
@@ -0,0 +1,8 @@
torch>=2.0.0
gradio>=4.0.0
diffusers>=0.27.0
transformers>=4.38.0
safetensors>=0.4.0
accelerate>=0.26.0
peft>=0.8.0
Pillow>=10.0.0
text_encoder.py
ADDED
@@ -0,0 +1,1188 @@
import torch
import torch.nn.functional as F
import numpy as np
import torch.nn as nn


def tokenize_prompt(tokenizer, prompt, max_sequence_length):
    text_inputs = tokenizer(
        prompt,
        padding="max_length",
        max_length=max_sequence_length,
        truncation=True,
        return_length=False,
        return_overflowing_tokens=False,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    return text_input_ids


def _encode_prompt_with_t5(
    text_encoder,
    tokenizer,
    max_sequence_length=512,
    prompt=None,
    num_images_per_prompt=1,
    device=None,
    text_input_ids=None,
):
    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

    if tokenizer is not None:
        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            return_length=False,
            return_overflowing_tokens=False,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
    else:
        if text_input_ids is None:
            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")

    prompt_embeds = text_encoder(text_input_ids.to(device))[0]

    if hasattr(text_encoder, "module"):
        dtype = text_encoder.module.dtype
    else:
        dtype = text_encoder.dtype
    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

    _, seq_len, _ = prompt_embeds.shape

    # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    return prompt_embeds

def _encode_prompt_with_clip(
    text_encoder,
    tokenizer,
    prompt: str,
    device=None,
    text_input_ids=None,
    num_images_per_prompt: int = 1,
):
    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

    if tokenizer is not None:
        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=77,
            truncation=True,
            return_overflowing_tokens=False,
            return_length=False,
            return_tensors="pt",
        )

        text_input_ids = text_inputs.input_ids
    else:
        if text_input_ids is None:
            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")

    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)

    if hasattr(text_encoder, "module"):
        dtype = text_encoder.module.dtype
    else:
        dtype = text_encoder.dtype
    # Use pooled output of CLIPTextModel
    prompt_embeds = prompt_embeds.pooler_output
    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

    # duplicate text embeddings for each generation per prompt, using mps friendly method
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)

    return prompt_embeds


def encode_prompt(
    text_encoders,
    tokenizers,
    prompt: str,
    max_sequence_length,
    device=None,
    num_images_per_prompt: int = 1,
    text_input_ids_list=None,
):
    prompt = [prompt] if isinstance(prompt, str) else prompt

    if hasattr(text_encoders[0], "module"):
        dtype = text_encoders[0].module.dtype
    else:
        dtype = text_encoders[0].dtype

    pooled_prompt_embeds = _encode_prompt_with_clip(
        text_encoder=text_encoders[0],
        tokenizer=tokenizers[0],
        prompt=prompt,
        device=device if device is not None else text_encoders[0].device,
        num_images_per_prompt=num_images_per_prompt,
        text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
    )

    prompt_embeds = _encode_prompt_with_t5(
        text_encoder=text_encoders[1],
        tokenizer=tokenizers[1],
        max_sequence_length=max_sequence_length,
        prompt=prompt,
        num_images_per_prompt=num_images_per_prompt,
        device=device if device is not None else text_encoders[1].device,
        text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
    )

    text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)

    return prompt_embeds, pooled_prompt_embeds, text_ids

from transformers import T5EncoderModel, T5Tokenizer, CLIPTokenizer, CLIPTextModel


class T5Embedder(torch.nn.Module):
    def __init__(self, device, max_length=300):
        super().__init__()
        self.device = device
        self.max_length = max_length
        dtype = torch.bfloat16
        self.dtype = dtype
        t5_version = './t5-v1_1-xxl'
        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_version, max_length=max_length)
        self.t5_encoder = T5EncoderModel.from_pretrained(t5_version, torch_dtype=dtype).to(device=device)
        self.t5_encoder = self.t5_encoder.eval().requires_grad_(False)
        self.num_shared = max_length

    @torch.no_grad()
    def forward(self, text):
        if isinstance(text, str):
            text = [text]
        batch_encoding = self.t5_tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )

        prompt_embeds = self.t5_encoder(
            input_ids=batch_encoding["input_ids"].to(self.device),
            attention_mask=None,
            output_hidden_states=False,
        )['last_hidden_state']
        prompt_attention_mask = batch_encoding['attention_mask'].to(self.device)

        new_text = [x.split('.')[0] for x in text]
        batch_encoding = self.t5_tokenizer(
            new_text,
            truncation=True,
            max_length=self.num_shared,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        shared_prompt_embeds = self.t5_encoder(
            input_ids=batch_encoding["input_ids"].to(self.device),
            attention_mask=None,
            output_hidden_states=False,
        )['last_hidden_state']

        return prompt_embeds, shared_prompt_embeds, prompt_attention_mask

import random

from torch.utils.checkpoint import checkpoint
from peft import LoraConfig, set_peft_model_state_dict
from transformers import CLIPVisionModelWithProjection  # used below for the image encoder


class LoraT5EmbedderNoGradientCheck(torch.nn.Module):
    def __init__(self, device, rank=64, max_length=300):
        super().__init__()
        self.device = device
        self.max_length = max_length
        dtype = torch.bfloat16
        self.dtype = dtype
        t5_version = './t5-v1_1-xxl'
        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_version, max_length=max_length)
        self.t5_encoder = T5EncoderModel.from_pretrained(t5_version, torch_dtype=dtype).to(device=device).to(dtype)
        self.t5_encoder.gradient_checkpointing_enable()
        self.t5_encoder.config.gradient_checkpointing = True
        self.t5_encoder.requires_grad_(False)
        self.t5_encoder.eval()
        # Add LoRA adapters to the T5 model
        text_lora_config = LoraConfig(
            r=rank,
            lora_alpha=rank,
            lora_dropout=0.0,
            init_lora_weights="gaussian",
            target_modules=["SelfAttention.q", "SelfAttention.k", "SelfAttention.v", "SelfAttention.o", "DenseReluDense.wi", "DenseReluDense.wo"],
        )
        self.t5_encoder.add_adapter(text_lora_config)
        #self.t5_encoder.encoder.embed_tokens.weight.requires_grad = True
        print(f"Gradient checkpointing enabled: {self.t5_encoder.is_gradient_checkpointing}")

        image_encoder_path = 'openai/clip-vit-large-patch14'
        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(image_encoder_path).to(device=device).to(torch.bfloat16)
        self.image_encoder = self.image_encoder.eval().requires_grad_(False)
    def compute_perturbation_loss(self, prompt_embeds, perturbed_prompt_embeds, replaced_ids, batch_encoding):
        """
        Compute group lasso for non-pad non-change tokens, L1 for change tokens,
        and group sparsity for pad non-change tokens.

        Args:
            prompt_embeds: Original embeddings [batch_size, seq_len, hidden_dim]
            perturbed_prompt_embeds: Perturbed embeddings [batch_size, seq_len, hidden_dim]
            replaced_ids: List of replaced token indices for each sample in batch
            batch_encoding: The tokenizer output containing input_ids

        Returns:
            l2_loss: Group lasso loss for non-pad non-change tokens (scalar tensor)
            l1_loss: L1 loss for change tokens (scalar tensor)
            pad_group_loss: Group sparsity loss for pad non-change tokens (scalar tensor)
        """
        batch_size = prompt_embeds.size(0)
        pad_token_id = self.t5_tokenizer.pad_token_id
        input_ids = batch_encoding["input_ids"]

        l2_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
        l1_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
        pad_group_loss_total = torch.tensor(0.0, device=prompt_embeds.device)

        # Track valid samples for each loss type separately
        l1_valid_samples = 0
        l2_valid_samples = 0
        pad_valid_samples = 0

        for i in range(batch_size):
            # Get the replaced index for this sample
            replaced_idx = replaced_ids[i]

            if replaced_idx is None:
                # No replacement happened (all padding), skip
                continue

            # Find padding and non-padding token indices
            pad_mask = input_ids[i] == pad_token_id
            non_pad_mask = ~pad_mask

            pad_indices = torch.where(pad_mask)[0]
            non_pad_indices = torch.where(non_pad_mask)[0]

            # Filter out the replaced index from non-padding indices (non-pad non-change)
            non_selected_non_pad_indices = non_pad_indices[non_pad_indices != replaced_idx]

            # Compute L1 loss on selected (replaced) index - CHANGE TOKEN
            selected_diff = prompt_embeds[i, replaced_idx] - perturbed_prompt_embeds[i, replaced_idx]
            l1_loss_total = l1_loss_total + torch.abs(selected_diff).mean()
            l1_valid_samples += 1

            # Compute group lasso (L2) loss on NON-PAD NON-CHANGE tokens
            if len(non_selected_non_pad_indices) > 0:
                non_selected_diff = prompt_embeds[i, non_selected_non_pad_indices] - perturbed_prompt_embeds[
                    i, non_selected_non_pad_indices]
                l2_per_token = torch.sqrt((non_selected_diff ** 2).sum(dim=1))
                l2_loss_total = l2_loss_total + l2_per_token.mean()
                l2_valid_samples += 1

            # Compute group sparsity loss on PAD NON-CHANGE tokens
            if len(pad_indices) > 0:
                pad_diff = prompt_embeds[i, pad_indices] - perturbed_prompt_embeds[i, pad_indices]
                # Group sparsity: L2 norm per token (encourages entire token embeddings to be zero)
                pad_group_per_token = torch.sqrt((pad_diff ** 2).sum(dim=1))
                pad_group_loss_total = pad_group_loss_total + pad_group_per_token.mean()
                pad_valid_samples += 1

        # Average over valid samples for each loss type
        l2_loss = l2_loss_total / l2_valid_samples if l2_valid_samples > 0 else torch.tensor(0.0, device=prompt_embeds.device)
        l1_loss = l1_loss_total / l1_valid_samples if l1_valid_samples > 0 else torch.tensor(0.0, device=prompt_embeds.device)
        pad_group_loss = pad_group_loss_total / pad_valid_samples if pad_valid_samples > 0 else torch.tensor(0.0, device=prompt_embeds.device)

        return l2_loss, l1_loss, pad_group_loss
    def forward(self, text, image=None):
        if isinstance(text, str):
            text = [text]
        batch_encoding = self.t5_tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        prompt_embeds = self.t5_encoder(
            input_ids=batch_encoding["input_ids"].to(self.device),
            attention_mask=None,
            output_hidden_states=False,
        )['last_hidden_state']

        # Get input_ids and create a copy to modify
        input_ids = batch_encoding["input_ids"].clone()
        batch_size = input_ids.size(0)

        # Get the padding token id
        pad_token_id = self.t5_tokenizer.pad_token_id

        replaced_ids = []
        # For each sample in the batch
        for i in range(batch_size):
            # Find indices of non-padding tokens
            non_pad_mask = input_ids[i] != pad_token_id
            non_pad_indices = torch.where(non_pad_mask)[0]

            # If there are meaningful tokens, randomly select one to replace
            if len(non_pad_indices) > 0:
                # Randomly select an index from non-padding tokens
                random_idx = non_pad_indices[random.randint(0, len(non_pad_indices) - 1)]
                # Replace with padding token
                input_ids[i, random_idx] = pad_token_id
                replaced_ids.append(random_idx.item())
            else:
                replaced_ids.append(None)  # No replacement if all tokens are padding

        perturbed_prompt_embeds = self.t5_encoder(
            input_ids=input_ids.to(self.device),
            attention_mask=None,
            output_hidden_states=False,
        )['last_hidden_state']

        l2_loss, l1_loss, pad_loss = self.compute_perturbation_loss(
            prompt_embeds, perturbed_prompt_embeds, replaced_ids, batch_encoding
        )

        with torch.no_grad():
            if image is not None:
                clip_image_embeds = self.image_encoder(image.to(self.device)).image_embeds
            else:
                clip_image_embeds = None

        return prompt_embeds, l2_loss, l1_loss, pad_loss, clip_image_embeds

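# Illustrative sketch only (not used elsewhere in this file): one way the three
# regularizers returned by LoraT5EmbedderNoGradientCheck.forward could be folded
# into a training objective. The weights `w_l2`, `w_l1`, `w_pad` and the
# `base_loss` argument are hypothetical placeholders, not values defined by this repo.
def example_combine_perturbation_losses(base_loss, l2_loss, l1_loss, pad_loss,
                                        w_l2=1.0, w_l1=1.0, w_pad=0.1):
    # Sum the main task loss (e.g. a diffusion loss) with the embedding-perturbation terms.
    return base_loss + w_l2 * l2_loss + w_l1 * l1_loss + w_pad * pad_loss
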
from peft import LoraConfig, set_peft_model_state_dict
import torch.utils.checkpoint as checkpoint
from transformers import CLIPVisionModelWithProjection


class LoraT5Embedder(torch.nn.Module):
    def __init__(self, device, rank=128, max_length=300, use_gradient_checkpointing=True):
        super().__init__()
        self.device = device
        self.max_length = max_length
        self.use_gradient_checkpointing = use_gradient_checkpointing
        dtype = torch.bfloat16
        self.dtype = dtype
        t5_version = './t5-v1_1-xxl'
        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_version, max_length=max_length)

        self.t5_encoder = T5EncoderModel.from_pretrained(
            t5_version,
            torch_dtype=dtype
        ).to(device=device).to(dtype)

        self.t5_encoder.requires_grad_(False)

        # Add LoRA adapters to the T5 model
        text_lora_config = LoraConfig(
            r=rank,
            lora_alpha=rank,
            lora_dropout=0.0,
            init_lora_weights="gaussian",
            target_modules=["q", "k", "v", "o", "wi", "wo"],
        )
        self.t5_encoder.add_adapter(text_lora_config)
        self.t5_encoder.encoder.embed_tokens.weight.requires_grad_(True)

        # Manually implement gradient checkpointing for T5 encoder blocks
        if self.use_gradient_checkpointing:
            self._enable_gradient_checkpointing()

        print(f"Gradient checkpointing enabled: {self.use_gradient_checkpointing}")

        image_encoder_path = './clip-vit-large-patch14'
        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            image_encoder_path
        ).to(device=device).to(torch.bfloat16)
        self.image_encoder = self.image_encoder.eval().requires_grad_(False)

    def _enable_gradient_checkpointing(self):
        """
        Manually wrap T5 encoder blocks with gradient checkpointing.
        """

        def create_custom_forward(module):
            def custom_forward(*inputs):
                return module(*inputs)

            return custom_forward

        # Wrap each T5 block with checkpointing
        for block in self.t5_encoder.encoder.block:
            # Store original forward
            block._original_forward = block.forward

            # Create checkpointed forward
            def make_checkpointed_forward(blk):
                def checkpointed_forward(*args, **kwargs):
                    # Checkpoint requires a function that takes tensors as input
                    def forward_wrapper(*inputs):
                        # Reconstruct kwargs from inputs
                        hidden_states = inputs[0]
                        attention_mask = inputs[1] if len(inputs) > 1 else None
                        position_bias = inputs[2] if len(inputs) > 2 else None

                        return blk._original_forward(
                            hidden_states=hidden_states,
                            attention_mask=attention_mask,
                            position_bias=position_bias,
                            **{k: v for k, v in kwargs.items() if
                               k not in ['hidden_states', 'attention_mask', 'position_bias']}
                        )

                    # Prepare inputs for checkpointing
                    hidden_states = kwargs.get('hidden_states', args[0] if args else None)
                    attention_mask = kwargs.get('attention_mask', args[1] if len(args) > 1 else None)
                    position_bias = kwargs.get('position_bias', args[2] if len(args) > 2 else None)

                    # Use checkpoint
                    checkpoint_inputs = [hidden_states]
                    if attention_mask is not None:
                        checkpoint_inputs.append(attention_mask)
                    if position_bias is not None:
                        checkpoint_inputs.append(position_bias)

                    return checkpoint.checkpoint(
                        forward_wrapper,
                        *checkpoint_inputs,
                        use_reentrant=False
                    )

                return checkpointed_forward

            block.forward = make_checkpointed_forward(block)

    def _encode_text(self, input_ids):
        """Helper function to encode text through T5."""
        return self.t5_encoder(
            input_ids=input_ids.to(self.device),
            attention_mask=None,
            output_hidden_states=False,
        )['last_hidden_state']

    def compute_perturbation_loss(self, prompt_embeds, perturbed_prompt_embeds, replaced_ids, batch_encoding):
        """
        Compute group lasso for non-pad non-change tokens, L1 for change tokens,
        and group sparsity for pad non-change tokens.

        Args:
            prompt_embeds: Original embeddings [batch_size, seq_len, hidden_dim]
            perturbed_prompt_embeds: Perturbed embeddings [batch_size, seq_len, hidden_dim]
            replaced_ids: List of replaced token indices for each sample in batch
            batch_encoding: The tokenizer output containing input_ids

        Returns:
            l2_loss: Group lasso loss for non-pad non-change tokens (scalar tensor)
            l1_loss: L1 loss for change tokens (scalar tensor)
            pad_group_loss: Group sparsity loss for pad non-change tokens (scalar tensor)
        """
        batch_size = prompt_embeds.size(0)
        pad_token_id = self.t5_tokenizer.pad_token_id
        input_ids = batch_encoding["input_ids"]

        l2_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
        l1_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
        pad_group_loss_total = torch.tensor(0.0, device=prompt_embeds.device)

        # Track valid samples for each loss type separately
        l1_valid_samples = 0
        l2_valid_samples = 0
        pad_valid_samples = 0

        for i in range(batch_size):
            # Get the replaced index for this sample
            replaced_idx = replaced_ids[i]

            if replaced_idx is None:
                # No replacement happened (all padding), skip
                continue

            # Find padding and non-padding token indices
            pad_mask = input_ids[i] == pad_token_id
            non_pad_mask = ~pad_mask

            pad_indices = torch.where(pad_mask)[0]
            non_pad_indices = torch.where(non_pad_mask)[0]

            # Filter out the replaced index from non-padding indices (non-pad non-change)
            non_selected_non_pad_indices = non_pad_indices[non_pad_indices != replaced_idx]

            # Compute L1 loss on selected (replaced) index - CHANGE TOKEN
            selected_diff = prompt_embeds[i, replaced_idx] - perturbed_prompt_embeds[i, replaced_idx]
            l1_loss_total = l1_loss_total + torch.abs(selected_diff).mean()
            l1_valid_samples += 1

            # Compute group lasso (L2) loss on NON-PAD NON-CHANGE tokens
            if len(non_selected_non_pad_indices) > 0:
                non_selected_diff = prompt_embeds[i, non_selected_non_pad_indices] - perturbed_prompt_embeds[i, non_selected_non_pad_indices]
                l2_per_token = torch.sqrt((non_selected_diff ** 2).sum(dim=1))
                l2_loss_total = l2_loss_total + l2_per_token.mean()
                l2_valid_samples += 1

            # Compute group sparsity loss on PAD NON-CHANGE tokens
            if len(pad_indices) > 0:
                pad_diff = prompt_embeds[i, pad_indices] - perturbed_prompt_embeds[i, pad_indices]
                # Group sparsity: L2 norm per token (encourages entire token embeddings to be zero)
                pad_group_per_token = torch.sqrt((pad_diff ** 2).sum(dim=1))
                pad_group_loss_total = pad_group_loss_total + pad_group_per_token.mean()
                pad_valid_samples += 1

        # Average over valid samples for each loss type
        l2_loss = l2_loss_total / l2_valid_samples if l2_valid_samples > 0 else torch.tensor(0.0, device=prompt_embeds.device)
        l1_loss = l1_loss_total / l1_valid_samples if l1_valid_samples > 0 else torch.tensor(0.0, device=prompt_embeds.device)
        pad_group_loss = pad_group_loss_total / pad_valid_samples if pad_valid_samples > 0 else torch.tensor(0.0, device=prompt_embeds.device)

        return l2_loss, l1_loss, pad_group_loss

    def forward(self, text, image=None):
        if isinstance(text, str):
            text = [text]
        batch_encoding = self.t5_tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=False,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        attn_mask = batch_encoding["attention_mask"].to(self.device)

        # First encoding
        prompt_embeds = self._encode_text(batch_encoding["input_ids"])

        # Get input_ids and create a copy to modify
        input_ids = batch_encoding["input_ids"].clone()
        batch_size = input_ids.size(0)

        # Get the id of the first sentinel (mask) token and the padding token
        mask_token = "<extra_id_0>"
        mask_token_id = self.t5_tokenizer.convert_tokens_to_ids(mask_token)
        pad_token_id = self.t5_tokenizer.pad_token_id

        replaced_ids = []
        # For each sample in the batch
        for i in range(batch_size):
            # Find indices of non-padding tokens
            non_pad_mask = input_ids[i] != pad_token_id
            non_pad_indices = torch.where(non_pad_mask)[0]

            # If there are meaningful tokens, randomly select one to replace
            if len(non_pad_indices) > 0:
                # Randomly select an index from non-padding tokens
                random_idx = non_pad_indices[random.randint(0, len(non_pad_indices) - 1)]
                random_idx2 = non_pad_indices[random.randint(0, len(non_pad_indices) - 1)]  # currently unused
                # Replace it with the sentinel (mask) token
                input_ids[i, random_idx] = mask_token_id
                replaced_ids.append(random_idx.item())
            else:
                replaced_ids.append(None)  # No replacement if all tokens are padding

        # Second encoding with perturbed input
        perturbed_prompt_embeds = self._encode_text(input_ids)

        """
        l2_loss, l1_loss, pad_loss = self.compute_perturbation_loss(
            prompt_embeds, perturbed_prompt_embeds, replaced_ids, batch_encoding
        )
        """

        with torch.no_grad():
            if image is not None:
                clip_image_embeds = self.image_encoder(image.to(self.device)).image_embeds
            else:
                clip_image_embeds = None

        #return prompt_embeds, l2_loss, l1_loss, pad_loss, clip_image_embeds, attn_mask
        return prompt_embeds, clip_image_embeds, perturbed_prompt_embeds, replaced_ids, self.t5_tokenizer, batch_encoding

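# Minimal usage sketch for LoraT5Embedder, assuming the local './t5-v1_1-xxl' and
# './clip-vit-large-patch14' checkpoints referenced above are available; the prompt
# and variable names below are illustrative only.
def example_lora_t5_embedder_usage():
    embedder = LoraT5Embedder(device="cuda:0")
    prompt_embeds, clip_image_embeds, perturbed_embeds, replaced_ids, tokenizer, enc = embedder(
        ["a red bicycle leaning against a brick wall"]
    )
    # prompt_embeds / perturbed_embeds: [batch, max_length, hidden]; replaced_ids holds,
    # per sample, the position whose token was swapped for the <extra_id_0> sentinel.
    print(prompt_embeds.shape, perturbed_embeds.shape, replaced_ids)
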
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer


class QwenEmbedder(nn.Module):
    def __init__(self, device, max_length=512):
        super().__init__()
        self.device = device
        self.max_length = max_length
        dtype = torch.bfloat16
        self.dtype = dtype
        self.tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
        self.text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype=dtype,
        ).to(device=device)
        self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
        self.prompt_template_encode_start_idx = 34
        self.tokenizer_max_length = max_length

    def _extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor):
        bool_mask = mask.bool()
        valid_lengths = bool_mask.sum(dim=1)
        selected = hidden_states[bool_mask]
        split_result = torch.split(selected, valid_lengths.tolist(), dim=0)

        return split_result

    def _get_qwen_prompt_embeds(
        self,
        prompt=None,
        device=None,
        dtype=None,
    ):
        device = device or self._execution_device
        dtype = dtype or self.text_encoder.dtype

        prompt = [prompt] if isinstance(prompt, str) else prompt

        template = self.prompt_template_encode
        drop_idx = self.prompt_template_encode_start_idx
        txt = [template.format(e) for e in prompt]
        txt_tokens = self.tokenizer(
            txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt"
        ).to(device)
        encoder_hidden_states = self.text_encoder(
            input_ids=txt_tokens.input_ids,
            attention_mask=txt_tokens.attention_mask,
            output_hidden_states=True,
        )
        hidden_states = encoder_hidden_states.hidden_states[-1]
        split_hidden_states = self._extract_masked_hidden(hidden_states, txt_tokens.attention_mask)
        split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
        attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
        #max_seq_len = max([e.size(0) for e in split_hidden_states])
        max_seq_len = self.max_length
        prompt_embeds = torch.stack(
            [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
        )
        encoder_attention_mask = torch.stack(
            [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
        )

        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)

        return prompt_embeds, encoder_attention_mask

    @torch.no_grad()
    def forward(self, text):
        prompt_embeds, attention_mask = self._get_qwen_prompt_embeds(
            prompt=text,
            device=self.device,
            dtype=self.dtype,
        )
        return prompt_embeds, attention_mask

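# Minimal usage sketch for QwenEmbedder (assumes the Qwen/Qwen2.5-VL-7B-Instruct
# weights can be loaded); it shows that the per-token hidden states are zero-padded
# to max_length and returned together with a matching attention mask.
def example_qwen_embedder_usage():
    embedder = QwenEmbedder(device="cuda:0", max_length=512)
    prompt_embeds, attention_mask = embedder(["a watercolor painting of a lighthouse at dawn"])
    print(prompt_embeds.shape, attention_mask.shape)  # [1, 512, hidden], [1, 512]
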
# pip install accelerate

from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from PIL import Image
import requests
import torch
import torch.nn as nn

Qwen25VL_7b_PREFIX_edit = '''Given a user editing prompt and a source image, only describe the editing area and how it should change in a detailed way.
Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:
'''

Qwen25VL_7b_PREFIX_t2i = '''Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:
- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.
- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.\n
Here are examples of how to transform or refine prompts:
- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.
- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.\n
Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:
User Prompt:'''

Qwen25VL_7b_PREFIX_image = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."

model_id = "google/gemma-3-4b-it"

from transformers import AutoTokenizer, TrainingArguments, Gemma3ForCausalLM, AutoModel, Gemma3Model
from transformers import Dinov2Model, AutoImageProcessor

import torch
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
import numpy as np

class GemmaEmbedder(nn.Module):
    def __init__(self, max_sequence_length=300, model_id='google/gemma-3-4b-it'):
        super().__init__()
        device = torch.cuda.current_device()
        self.model = Gemma3Model.from_pretrained(model_id).to(device).to(torch.bfloat16)
        #self.model = Gemma3ForConditionalGeneration.from_pretrained(model_id).to(device).to(torch.bfloat16)
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.device = device
        self.max_sequence_length = max_sequence_length
        #self.processor.tokenizer.pad_token = self.processor.tokenizer.eos_token  # Use eos token as pad token
        self.processor.tokenizer.padding_side = "right"

    def get_features(self, hidden_states, input_ids):
        hidden_states = hidden_states[0]
        input_ids = input_ids[0].tolist()

        pad_text_embeds = torch.zeros([self.max_sequence_length, 2560], dtype=torch.bfloat16, device=self.device)
        pad_text_mask = torch.zeros([self.max_sequence_length], device=self.device)

        def find_last(lst, value):
            indices = [i for i, x in enumerate(lst) if x == value]
            return indices[-1]

        if 256000 in input_ids:
            text_start = input_ids.index(256000) + 2
        else:
            text_start = find_last(input_ids, 108) + 1

        text_end = len(input_ids) - 6
        bos_embed = hidden_states[:2]
        text_embeds = hidden_states[text_start:text_end + 1]
        text_embeds = torch.cat([bos_embed, text_embeds], dim=0)

        pad_text_embeds[:len(text_embeds), :] = text_embeds[:self.max_sequence_length]
        pad_text_mask[:len(text_embeds)] = 1.0

        image_embeds = hidden_states[np.array(input_ids) == self.processor.tokenizer.image_token_id]

        """
        print(input_ids)
        print(input_ids[text_start:text_end + 1])
        decoded = self.processor.decode(input_ids[text_start:text_end+1], skip_special_tokens=False)
        print("Decoded text:", decoded, text_start, text_end, input_ids[text_start:text_end + 1], input_ids[1:2])
        print("Text embeddings shape:", text_embeds.shape)
        norm = RMSNorm(2560, eps=1e-6).to(self.device).to(torch.bfloat16)
        print(text_embeds, ' >>> ext embeds')
        print(norm(text_embeds), ' >>> normed embeds')
        """

        return image_embeds, pad_text_embeds, pad_text_mask

    @torch.no_grad()
    def forward(self, caps, images=None):
        text_embeds = []
        text_masks = []
        full_image_embeds = []
        device = self.model.device
        if images is None:
            images = [None] * len(caps)
        for cap, img in zip(caps, images):
            if img is not None:
                messages = [
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": Qwen25VL_7b_PREFIX_edit}]
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": img},
                            {"type": "text", "text": cap},
                        ]
                    }
                ]
            else:
                messages = [
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": Qwen25VL_7b_PREFIX_t2i}]
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": cap},
                        ]
                    }
                ]

            inputs = self.processor.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=True,
                return_dict=True, return_tensors="pt",
                max_length=640,
                truncation=True,
            ).to(self.model.device, dtype=torch.bfloat16)
            outputs = self.model(**inputs, output_hidden_states=True)
            #sample_image_embeds = outputs.image_hidden_states
            sample_text_embeds, sample_text_mask, sample_image_embeds = [], [], []
            for hidden in [outputs.hidden_states[-1]]:
                cur_image_embeds, cur_text_embeds, cur_text_mask = self.get_features(hidden, inputs["input_ids"])
                sample_text_embeds.append(cur_text_embeds)
                sample_text_mask.append(cur_text_mask)
                sample_image_embeds.append(cur_image_embeds)
            text_embeds.append(torch.cat(sample_text_embeds, dim=0))
            text_masks.append(torch.cat(sample_text_mask, dim=0))
            #full_image_embeds.append(sample_image_embeds)
            full_image_embeds.append(torch.cat(sample_image_embeds, dim=0))

            """
            input_len = inputs["input_ids"].shape[-1]
            with torch.inference_mode():
                generation = self.model.generate(**inputs, max_new_tokens=100, do_sample=False)
                generation = generation[0][input_len:]

            decoded = self.processor.decode(generation, skip_special_tokens=True)
            print(cap, ' <>>> gemma ', decoded)
            """

        text_embeds = torch.stack(text_embeds, dim=0)
        text_masks = torch.stack(text_masks, dim=0)
        full_image_embeds = torch.stack(full_image_embeds, dim=0)
        return {
            'text_embeds': text_embeds,
            'text_masks': text_masks,
            'image_embeds': full_image_embeds,
        }

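# Minimal usage sketch for GemmaEmbedder; the caption and PIL image below are
# illustrative only. When an image is supplied the editing prefix is used,
# otherwise the text-to-image prefix is used.
def example_gemma_embedder_usage():
    from PIL import Image as _Image
    embedder = GemmaEmbedder(max_sequence_length=300)
    img = _Image.new("RGB", (512, 512))
    out = embedder(["make the sky stormy"], images=[img])
    print(out['text_embeds'].shape, out['text_masks'].shape, out['image_embeds'].shape)
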
class GemmaTextEmbedder(nn.Module):
    def __init__(self, device, max_sequence_length=300, model_id='./gemma-3-4b-it'):
        super().__init__()
        self.model = Gemma3Model.from_pretrained(model_id).to(device).to(torch.bfloat16)
        #self.model = Gemma3ForConditionalGeneration.from_pretrained(model_id).to(device).to(torch.bfloat16)
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.real_device = device
        self.max_sequence_length = max_sequence_length
        #self.processor.tokenizer.pad_token = self.processor.tokenizer.eos_token  # Use eos token as pad token
        self.processor.tokenizer.padding_side = "right"

    @property
    def dtype(self):
        """Return the dtype of the model parameters."""
        return next(self.parameters()).dtype

    @property
    def device(self):
        """Return the device of the model parameters."""
        return next(self.parameters()).device

    def get_features(self, hidden_states, input_ids):
        hidden_states = hidden_states[0]
        input_ids = input_ids[0].tolist()

        pad_text_embeds = torch.zeros([self.max_sequence_length, 2560], dtype=torch.bfloat16, device=self.device)
        pad_text_mask = torch.zeros([self.max_sequence_length], device=self.device)

        def find_last(lst, value):
            indices = [i for i, x in enumerate(lst) if x == value]
            return indices[-1]

        if 256000 in input_ids:
            text_start = input_ids.index(256000) + 2
        else:
            text_start = find_last(input_ids, 108) + 1

        text_end = len(input_ids) - 6
        bos_embed = hidden_states[:2]
        text_embeds = hidden_states[text_start:text_end + 1]
        text_embeds = torch.cat([bos_embed, text_embeds], dim=0)

        pad_text_embeds[:len(text_embeds), :] = text_embeds[:self.max_sequence_length]
        pad_text_mask[:len(text_embeds)] = 1.0

        pad_text_embeds[len(text_embeds):, :] = 0.0
        pad_text_mask[len(text_embeds):] = 0.0

        """
        print(input_ids)
        print(input_ids[text_start:text_end + 1])
        decoded = self.processor.decode(input_ids[text_start:text_end+1], skip_special_tokens=False)
        print("Decoded text:", decoded, text_start, text_end, input_ids[text_start:text_end + 1], input_ids[1:2])
        print("Text embeddings shape:", text_embeds.shape)
        print(text_embeds, ' >>> ext embeds')
        """

        return pad_text_embeds, pad_text_mask

    @torch.no_grad()
    def forward(self, caps, images=None):
        text_embeds = []
        text_masks = []
        full_image_embeds = []
        device = self.model.device
        if isinstance(caps, str):
            caps = [caps]
        if images is None:
            images = [None] * len(caps)
        for cap, img in zip(caps, images):
            if img is not None:
                messages = [
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": Qwen25VL_7b_PREFIX_edit}]
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": img},
                            {"type": "text", "text": cap},
                        ]
                    }
                ]
            else:
                messages = [
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": Qwen25VL_7b_PREFIX_t2i}]
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": cap},
                        ]
                    }
                ]

            inputs = self.processor.apply_chat_template(
                messages, add_generation_prompt=True, tokenize=True,
                return_dict=True, return_tensors="pt",
                max_length=640,
                truncation=True,
            ).to(self.model.device, dtype=torch.bfloat16)
            outputs = self.model(**inputs, output_hidden_states=True)
            #sample_image_embeds = outputs.image_hidden_states
            sample_text_embeds, sample_text_mask, sample_image_embeds = [], [], []
            for hidden in [outputs.hidden_states[-1]]:
                cur_text_embeds, cur_text_mask = self.get_features(hidden, inputs["input_ids"])
                sample_text_embeds.append(cur_text_embeds)
                sample_text_mask.append(cur_text_mask)
            text_embeds.append(torch.cat(sample_text_embeds, dim=0))
            text_masks.append(torch.cat(sample_text_mask, dim=0))

            """
            input_len = inputs["input_ids"].shape[-1]
            with torch.inference_mode():
                generation = self.model.generate(**inputs, max_new_tokens=100, do_sample=False)
                generation = generation[0][input_len:]

            decoded = self.processor.decode(generation, skip_special_tokens=True)
            print(cap, ' <>>> gemma ', decoded)
            """

        text_embeds = torch.stack(text_embeds, dim=0)
        text_masks = torch.stack(text_masks, dim=0)
        return text_embeds, text_masks.to(text_embeds.dtype)

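# Text-only usage sketch for GemmaTextEmbedder (assumes a local './gemma-3-4b-it'
# checkpoint). Unlike GemmaEmbedder it returns an (embeddings, mask) tuple instead
# of a dict, padded/truncated to max_sequence_length.
def example_gemma_text_embedder_usage():
    embedder = GemmaTextEmbedder(device="cuda:0", max_sequence_length=300)
    text_embeds, text_masks = embedder("a cottage garden in late summer")
    print(text_embeds.shape, text_masks.shape)  # [1, 300, 2560], [1, 300]
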
from transformers import AutoModel, AutoTokenizer
from transformers import SiglipVisionModel, AutoProcessor


class Gemma2Embedder(nn.Module):
    def __init__(self, max_length=300):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(
            "google/gemma-2-2b",
            torch_dtype=torch.bfloat16,
        ).to(torch.cuda.current_device()).to(torch.bfloat16).eval()
        self.tokenizer = AutoTokenizer.from_pretrained(
            "google/gemma-2-2b",
        )
        self.tokenizer.padding_side = "right"
        self.max_length = max_length
        self.system_prompt = "You are an assistant designed to edit images faithfully based on user prompts. <Prompt Start> "
        system_ids = self.tokenizer(
            self.system_prompt,
            return_tensors="pt",
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        ).input_ids.flatten().view(-1).numpy().tolist()
        self.len_system_prompt = system_ids.index(self.tokenizer.pad_token_id) - 1
        self.weight_dtype = torch.bfloat16

    @torch.no_grad()
    def forward(self, caption):
        if isinstance(caption, str):
            caption = [caption]
        caption = [self.system_prompt + c for c in caption]
        text_inputs = self.tokenizer(
            caption,
            return_tensors="pt",
            add_special_tokens=True,
            max_length=self.max_length + self.len_system_prompt,
            padding="max_length",
            truncation=True,
        )
        text_input_ids = text_inputs.input_ids
        attention_mask = text_inputs.attention_mask
        text_input_ids = text_input_ids.to(self.text_encoder.device)
        attention_mask = attention_mask.to(self.text_encoder.device)
        embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask,
                                   output_hidden_states=True
                                   ).hidden_states[-2]
        embeds = embeds[:, self.len_system_prompt:, :]
        attention_mask = attention_mask[:, self.len_system_prompt:]

        return {
            'text_embeds': embeds,
            'text_masks': attention_mask,
        }

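# Usage sketch for Gemma2Embedder: the fixed system prompt is prepended to every
# caption and its token positions are sliced off again, so the returned embeddings
# cover only the user caption (up to max_length positions). Caption text is illustrative.
def example_gemma2_embedder_usage():
    embedder = Gemma2Embedder(max_length=300)
    out = embedder(["replace the wooden door with a glass one"])
    print(out['text_embeds'].shape, out['text_masks'].shape)
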
class T5TextEmbedder(nn.Module):
    def __init__(self, device, pretrained_path="google/flan-t5-xxl", max_length=300):
        super().__init__()
        self.model = T5EncoderModel.from_pretrained(pretrained_path).to(device=device).to(torch.bfloat16)
        self.tokenizer = T5Tokenizer.from_pretrained(pretrained_path)
        self.max_length = max_length
        self.model.eval()
        self.model.requires_grad_(False)

    @property
    def dtype(self):
        """Return the dtype of the model parameters."""
        return next(self.parameters()).dtype

    @property
    def device(self):
        """Return the device of the model parameters."""
        return next(self.parameters()).device

    def forward(self, caption):
        max_length = self.max_length

        text_inputs = self.tokenizer(
            caption,
            return_tensors="pt",
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        text_input_ids = text_inputs.input_ids
        attention_mask = text_inputs.attention_mask
        text_input_ids = text_input_ids.to(self.model.device)
        attention_mask = attention_mask.to(self.model.device)
        outputs = self.model(text_input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state
        return embeddings, attention_mask.to(embeddings.dtype)

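# Usage sketch for T5TextEmbedder as a plain frozen text encoder (no LoRA, no
# perturbation); the flan-t5-xxl path is the default argument above, and the
# caption is illustrative.
def example_t5_text_embedder_usage():
    embedder = T5TextEmbedder(device="cuda:0")
    embeddings, mask = embedder(["an isometric illustration of a tiny island"])
    print(embeddings.shape, mask.shape)  # [1, 300, hidden], [1, 300]
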
if __name__ == '__main__':

    from datasets import load_dataset
    dataset = load_dataset("facebook/emu_edit_test_set", split='validation[:200]')
    item = dataset[0:4]
    another_item = dataset[0:4]
    from diffusers.models.normalization import RMSNorm
    image_encoder = CLIPImageEncoder(device="cuda:0")
    clip_processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
    image_embeds = image_encoder(clip_processor(images=item['image'], return_tensors="pt").pixel_values.to("cuda:0").to(torch.bfloat16))
    print(image_embeds.shape, ' >>>> image embeds')

    #model = GemmaTextEmbedder(device="cuda:0")
    model = LoraT5Embedder(device="cuda:0")
    # LoraT5Embedder.forward returns (prompt_embeds, clip_image_embeds,
    # perturbed_prompt_embeds, replaced_ids, tokenizer, batch_encoding).
    prompt_embeds, clip_image_embeds, perturbed_prompt_embeds, replaced_ids, tokenizer, batch_encoding = model(
        [
            """A heartwarming 3D rendered scene of
an elderly farmer and a tiny orange
kitten. The farmer, with a gentle smile,
walks alongside the kitten in a lush,
green garden filled with thriving plants,
showcasing a fruitful harvest. The
intricate details of the overalls and the
farmer's worn, weathered face tell a
story of years spent tending to the land, the farmer is wearing a blue shirt""",
        ],
        image=clip_processor(images=item['image'], return_tensors="pt").pixel_values.to("cuda:0").to(torch.bfloat16),
    )
    print(prompt_embeds.shape, ' >>> prompt embeds ', perturbed_prompt_embeds.shape, ' >>> perturbed prompt embeds ', replaced_ids, ' >>> replaced ids ')
    print(clip_image_embeds.shape, ' >>> clip image embeds ')

    #print(gemma_dict['text_embeds'],)
    #print(gemma_dict['image_embeds'], ' >>> image embeds')

    """
    from dataset import create_loader
    from PIL import Image as PILImage
    from PIL import Image as PILImage
    import PIL
    import numpy as np
    import torch.nn.functional as F

    loader = create_loader('edit', batch_size=16, shuffle=False)
    batch = next(iter(loader))
    source = batch['source_images']
    source_pils = [PIL.Image.fromarray(((x.permute(1, 2, 0).cpu().numpy() + 1) * 127.5).astype(np.uint8)) for x in source]
    target = batch['target_images']
    target_pils = [PIL.Image.fromarray(((x.permute(1, 2, 0).cpu().numpy() + 1) * 127.5).astype(np.uint8)) for x in target]
    from torchvision.utils import save_image

    print(batch['captions'])

    images = []
    for (x, y) in zip(batch['source_images'], batch['target_images']):
        images.append(x)
        images.append(y)
    save_image((torch.stack(images) + 1) / 2, 'example_pairs.jpg', nrow=8)

    gemma_dict = model(batch['captions'], source_pils, target_pils)
    image_embeds = gemma_dict['image_embeds']
    target_image_embeds = gemma_dict['target_image_embeds']

    print("Image embeds shape:", image_embeds.shape)
    print("Target image embeds shape:", target_image_embeds.shape)
    from qwen import compute_and_save_similarity_grid
    compute_and_save_similarity_grid(image_embeds, target_image_embeds, "gemma_similarity_grid.jpg")
    """
