Upload folder using huggingface_hub

text_encoder.py  CHANGED  (+5 / -1018)
@@ -207,694 +207,6 @@ import random
 
 from torch.utils.checkpoint import checkpoint
 from peft import LoraConfig, set_peft_model_state_dict
-class LoraT5EmbedderNoGradientCheck(torch.nn.Module):
-    def __init__(self, device, rank=64, max_length=300):
-        super().__init__()
-        self.device = device
-        self.max_length = max_length
-        dtype = torch.bfloat16
-        self.dtype = dtype
-        t5_version = './t5-v1_1-xxl'
-        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_version, max_length=max_length)
-        self.t5_encoder = T5EncoderModel.from_pretrained(t5_version, torch_dtype=dtype).to(device=device).to(dtype)
-        self.t5_encoder.gradient_checkpointing_enable()
-        self.t5_encoder.config.gradient_checkpointing = True
-        self.t5_encoder.requires_grad_(False)
-        self.t5_encoder.eval()
-        # Add LoRA adapters to the T5 model
-        text_lora_config = LoraConfig(
-            r=rank,
-            lora_alpha=rank,
-            lora_dropout=0.0,
-            init_lora_weights="gaussian",
-            target_modules=["SelfAttention.q", "SelfAttention.k", "SelfAttention.v", "SelfAttention.o", "DenseReluDense.wi", "DenseReluDense.wo"],
-        )
-        self.t5_encoder.add_adapter(text_lora_config)
-        #self.t5_encoder.encoder.embed_tokens.weight.requires_grad = True
-        print(f"Gradient checkpointing enabled: {self.t5_encoder.is_gradient_checkpointing}")
-
-        image_encoder_path = 'openai/clip-vit-large-patch14'
-        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(image_encoder_path).to(device=device).to(torch.bfloat16)
-        self.image_encoder = self.image_encoder.eval().requires_grad_(False)
-
-    def compute_perturbation_loss(self, prompt_embeds, perturbed_prompt_embeds, replaced_ids, batch_encoding):
-        """
-        Compute group lasso for non-pad non-change tokens, L1 for change tokens,
-        and group sparsity for pad non-change tokens.
-
-        Args:
-            prompt_embeds: Original embeddings [batch_size, seq_len, hidden_dim]
-            perturbed_prompt_embeds: Perturbed embeddings [batch_size, seq_len, hidden_dim]
-            replaced_ids: List of replaced token indices for each sample in batch
-            batch_encoding: The tokenizer output containing input_ids
-
-        Returns:
-            l2_loss: Group lasso loss for non-pad non-change tokens (scalar tensor)
-            l1_loss: L1 loss for change tokens (scalar tensor)
-            pad_group_loss: Group sparsity loss for pad non-change tokens (scalar tensor)
-        """
-        batch_size = prompt_embeds.size(0)
-        pad_token_id = self.t5_tokenizer.pad_token_id
-        input_ids = batch_encoding["input_ids"]
-
-        l2_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
-        l1_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
-        pad_group_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
-
-        # Track valid samples for each loss type separately
-        l1_valid_samples = 0
-        l2_valid_samples = 0
-        pad_valid_samples = 0
-
-        for i in range(batch_size):
-            # Get the replaced index for this sample
-            replaced_idx = replaced_ids[i]
-
-            if replaced_idx is None:
-                # No replacement happened (all padding), skip
-                continue
-
-            # Find padding and non-padding token indices
-            pad_mask = input_ids[i] == pad_token_id
-            non_pad_mask = ~pad_mask
-
-            pad_indices = torch.where(pad_mask)[0]
-            non_pad_indices = torch.where(non_pad_mask)[0]
-
-            # Filter out the replaced index from non-padding indices (non-pad non-change)
-            non_selected_non_pad_indices = non_pad_indices[non_pad_indices != replaced_idx]
-
-            # Compute L1 loss on selected (replaced) index - CHANGE TOKEN
-            selected_diff = prompt_embeds[i, replaced_idx] - perturbed_prompt_embeds[i, replaced_idx]
-            l1_loss_total = l1_loss_total + torch.abs(selected_diff).mean()
-            l1_valid_samples += 1
-
-            # Compute group lasso (L2) loss on NON-PAD NON-CHANGE tokens
-            if len(non_selected_non_pad_indices) > 0:
-                non_selected_diff = prompt_embeds[i, non_selected_non_pad_indices] - perturbed_prompt_embeds[
-                    i, non_selected_non_pad_indices]
-                l2_per_token = torch.sqrt((non_selected_diff ** 2).sum(dim=1))
-                l2_loss_total = l2_loss_total + l2_per_token.mean()
-                l2_valid_samples += 1
-
-            # Compute group sparsity loss on PAD NON-CHANGE tokens
-            if len(pad_indices) > 0:
-                pad_diff = prompt_embeds[i, pad_indices] - perturbed_prompt_embeds[i, pad_indices]
-                # Group sparsity: L2 norm per token (encourages entire token embeddings to be zero)
-                pad_group_per_token = torch.sqrt((pad_diff ** 2).sum(dim=1))
-                pad_group_loss_total = pad_group_loss_total + pad_group_per_token.mean()
-                pad_valid_samples += 1
-
-        # Average over valid samples for each loss type
-        l2_loss = l2_loss_total / l2_valid_samples if l2_valid_samples > 0 else torch.tensor(0.0,
-                                                                                             device=prompt_embeds.device)
-        l1_loss = l1_loss_total / l1_valid_samples if l1_valid_samples > 0 else torch.tensor(0.0,
-                                                                                             device=prompt_embeds.device)
-        pad_group_loss = pad_group_loss_total / pad_valid_samples if pad_valid_samples > 0 else torch.tensor(0.0,
-                                                                                                             device=prompt_embeds.device)
-
-        return l2_loss, l1_loss, pad_group_loss
-
-
-
-
-    def forward(self, text, image=None):
-        if isinstance(text, str):
-            text = [text]
-        batch_encoding = self.t5_tokenizer(
-            text,
-            truncation=True,
-            max_length=self.max_length,
-            return_length=False,
-            return_overflowing_tokens=False,
-            padding="max_length",
-            return_tensors="pt",
-        )
-        prompt_embeds = self.t5_encoder(
-            input_ids=batch_encoding["input_ids"].to(self.device),
-            attention_mask=None,
-            output_hidden_states=False,
-        )['last_hidden_state']
-
-        # Get input_ids and create a copy to modify
-        input_ids = batch_encoding["input_ids"].clone()
-        batch_size = input_ids.size(0)
-
-        # Get the padding token id
-        pad_token_id = self.t5_tokenizer.pad_token_id
-
-        replaced_ids = []
-        # For each sample in the batch
-        for i in range(batch_size):
-            # Find indices of non-padding tokens
-            non_pad_mask = input_ids[i] != pad_token_id
-            non_pad_indices = torch.where(non_pad_mask)[0]
-
-            # If there are meaningful tokens, randomly select one to replace
-            if len(non_pad_indices) > 0:
-                # Randomly select an index from non-padding tokens
-                random_idx = non_pad_indices[random.randint(0, len(non_pad_indices) - 1)]
-                # Replace with padding token
-                input_ids[i, random_idx] = pad_token_id
-                replaced_ids.append(random_idx.item())
-            else:
-                replaced_ids.append(None)  # No replacement if all tokens are padding
-
-
-        perturbed_prompt_embeds = self.t5_encoder(
-            input_ids=input_ids.to(self.device),
-            attention_mask=None,
-            output_hidden_states=False,
-        )['last_hidden_state']
-
-        l2_loss, l1_loss, pad_loss = self.compute_perturbation_loss(
-            prompt_embeds, perturbed_prompt_embeds, replaced_ids, batch_encoding
-        )
-
-        with torch.no_grad():
-            if image is not None:
-                clip_image_embeds = self.image_encoder(image.to(self.device)).image_embeds
-            else:
-                clip_image_embeds = None
-
-
-        return prompt_embeds, l2_loss, l1_loss, pad_loss, clip_image_embeds
-
-
-from peft import LoraConfig, set_peft_model_state_dict
-import torch.utils.checkpoint as checkpoint
-from transformers import CLIPVisionModelWithProjection
-
-class LoraT5Embedder(torch.nn.Module):
-    def __init__(self, device, rank=128, max_length=300, use_gradient_checkpointing=True):
-        super().__init__()
-        self.device = device
-        self.max_length = max_length
-        self.use_gradient_checkpointing = use_gradient_checkpointing
-        dtype = torch.bfloat16
-        self.dtype = dtype
-        t5_version = './t5-v1_1-xxl'
-        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_version, max_length=max_length)
-
-        self.t5_encoder = T5EncoderModel.from_pretrained(
-            t5_version,
-            torch_dtype=dtype
-        ).to(device=device).to(dtype)
-
-        self.t5_encoder.requires_grad_(False)
-
-        # Add LoRA adapters to the T5 model
-        text_lora_config = LoraConfig(
-            r=rank,
-            lora_alpha=rank,
-            lora_dropout=0.0,
-            init_lora_weights="gaussian",
-            target_modules=["q", "k", "v", "o", "wi", "wo"],
-        )
-        self.t5_encoder.add_adapter(text_lora_config)
-        self.t5_encoder.encoder.embed_tokens.weight.requires_grad_(True)
-
-        # Manually implement gradient checkpointing for T5 encoder blocks
-        if self.use_gradient_checkpointing:
-            self._enable_gradient_checkpointing()
-
-        print(f"Gradient checkpointing enabled: {self.use_gradient_checkpointing}")
-
-        image_encoder_path = './clip-vit-large-patch14'
-        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-            image_encoder_path
-        ).to(device=device).to(torch.bfloat16)
-        self.image_encoder = self.image_encoder.eval().requires_grad_(False)
-
-    def _enable_gradient_checkpointing(self):
-        """
-        Manually wrap T5 encoder blocks with gradient checkpointing.
-        """
-
-        def create_custom_forward(module):
-            def custom_forward(*inputs):
-                return module(*inputs)
-
-            return custom_forward
-
-        # Wrap each T5 block with checkpointing
-        for block in self.t5_encoder.encoder.block:
-            # Store original forward
-            block._original_forward = block.forward
-
-            # Create checkpointed forward
-            def make_checkpointed_forward(blk):
-                def checkpointed_forward(*args, **kwargs):
-                    # Checkpoint requires a function that takes tensors as input
-                    def forward_wrapper(*inputs):
-                        # Reconstruct kwargs from inputs
-                        hidden_states = inputs[0]
-                        attention_mask = inputs[1] if len(inputs) > 1 else None
-                        position_bias = inputs[2] if len(inputs) > 2 else None
-
-                        return blk._original_forward(
-                            hidden_states=hidden_states,
-                            attention_mask=attention_mask,
-                            position_bias=position_bias,
-                            **{k: v for k, v in kwargs.items() if
-                               k not in ['hidden_states', 'attention_mask', 'position_bias']}
-                        )
-
-                    # Prepare inputs for checkpointing
-                    hidden_states = kwargs.get('hidden_states', args[0] if args else None)
-                    attention_mask = kwargs.get('attention_mask', args[1] if len(args) > 1 else None)
-                    position_bias = kwargs.get('position_bias', args[2] if len(args) > 2 else None)
-
-                    # Use checkpoint
-                    checkpoint_inputs = [hidden_states]
-                    if attention_mask is not None:
-                        checkpoint_inputs.append(attention_mask)
-                    if position_bias is not None:
-                        checkpoint_inputs.append(position_bias)
-
-                    return checkpoint.checkpoint(
-                        forward_wrapper,
-                        *checkpoint_inputs,
-                        use_reentrant=False
-                    )
-
-                return checkpointed_forward
-
-            block.forward = make_checkpointed_forward(block)
-
-    def _encode_text(self, input_ids):
-        """Helper function to encode text through T5."""
-        return self.t5_encoder(
-            input_ids=input_ids.to(self.device),
-            attention_mask=None,
-            output_hidden_states=False,
-        )['last_hidden_state']
-
-    def compute_perturbation_loss(self, prompt_embeds, perturbed_prompt_embeds, replaced_ids, batch_encoding):
-        """
-        Compute group lasso for non-pad non-change tokens, L1 for change tokens,
-        and group sparsity for pad non-change tokens.
-
-        Args:
-            prompt_embeds: Original embeddings [batch_size, seq_len, hidden_dim]
-            perturbed_prompt_embeds: Perturbed embeddings [batch_size, seq_len, hidden_dim]
-            replaced_ids: List of replaced token indices for each sample in batch
-            batch_encoding: The tokenizer output containing input_ids
-
-        Returns:
-            l2_loss: Group lasso loss for non-pad non-change tokens (scalar tensor)
-            l1_loss: L1 loss for change tokens (scalar tensor)
-            pad_group_loss: Group sparsity loss for pad non-change tokens (scalar tensor)
-        """
-        batch_size = prompt_embeds.size(0)
-        pad_token_id = self.t5_tokenizer.pad_token_id
-        input_ids = batch_encoding["input_ids"]
-
-        l2_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
-        l1_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
-        pad_group_loss_total = torch.tensor(0.0, device=prompt_embeds.device)
-
-        # Track valid samples for each loss type separately
-        l1_valid_samples = 0
-        l2_valid_samples = 0
-        pad_valid_samples = 0
-
-        for i in range(batch_size):
-            # Get the replaced index for this sample
-            replaced_idx = replaced_ids[i]
-
-            if replaced_idx is None:
-                # No replacement happened (all padding), skip
-                continue
-
-            # Find padding and non-padding token indices
-            pad_mask = input_ids[i] == pad_token_id
-            non_pad_mask = ~pad_mask
-
-            pad_indices = torch.where(pad_mask)[0]
-            non_pad_indices = torch.where(non_pad_mask)[0]
-
-            # Filter out the replaced index from non-padding indices (non-pad non-change)
-            non_selected_non_pad_indices = non_pad_indices[non_pad_indices != replaced_idx]
-
-            # Compute L1 loss on selected (replaced) index - CHANGE TOKEN
-            selected_diff = prompt_embeds[i, replaced_idx] - perturbed_prompt_embeds[i, replaced_idx]
-            l1_loss_total = l1_loss_total + torch.abs(selected_diff).mean()
-            l1_valid_samples += 1
-
-            # Compute group lasso (L2) loss on NON-PAD NON-CHANGE tokens
-            if len(non_selected_non_pad_indices) > 0:
-                non_selected_diff = prompt_embeds[i, non_selected_non_pad_indices] - perturbed_prompt_embeds[
-                    i, non_selected_non_pad_indices]
-                l2_per_token = torch.sqrt((non_selected_diff ** 2).sum(dim=1))
-                l2_loss_total = l2_loss_total + l2_per_token.mean()
-                l2_valid_samples += 1
-
-            # Compute group sparsity loss on PAD NON-CHANGE tokens
-            if len(pad_indices) > 0:
-                pad_diff = prompt_embeds[i, pad_indices] - perturbed_prompt_embeds[i, pad_indices]
-                # Group sparsity: L2 norm per token (encourages entire token embeddings to be zero)
-                pad_group_per_token = torch.sqrt((pad_diff ** 2).sum(dim=1))
-                pad_group_loss_total = pad_group_loss_total + pad_group_per_token.mean()
-                pad_valid_samples += 1
-
-        # Average over valid samples for each loss type
-        l2_loss = l2_loss_total / l2_valid_samples if l2_valid_samples > 0 else torch.tensor(0.0,
-                                                                                             device=prompt_embeds.device)
-        l1_loss = l1_loss_total / l1_valid_samples if l1_valid_samples > 0 else torch.tensor(0.0,
-                                                                                             device=prompt_embeds.device)
-        pad_group_loss = pad_group_loss_total / pad_valid_samples if pad_valid_samples > 0 else torch.tensor(0.0,
-                                                                                                             device=prompt_embeds.device)
-
-        return l2_loss, l1_loss, pad_group_loss
-
-    def forward(self, text, image=None):
-        if isinstance(text, str):
-            text = [text]
-        batch_encoding = self.t5_tokenizer(
-            text,
-            truncation=True,
-            max_length=self.max_length,
-            return_length=False,
-            return_overflowing_tokens=False,
-            padding="max_length",
-            return_tensors="pt",
-        )
-        attn_mask = batch_encoding["attention_mask"].to(self.device)
-
-        # First encoding
-        prompt_embeds = self._encode_text(batch_encoding["input_ids"])
-
-        # Get input_ids and create a copy to modify
-        input_ids = batch_encoding["input_ids"].clone()
-        batch_size = input_ids.size(0)
-
-        # Get the padding token id
-        # get the id for the first sentinel token
-        mask_token = "<extra_id_0>"
-        mask_token_id = self.t5_tokenizer.convert_tokens_to_ids(mask_token)
-        pad_token_id = self.t5_tokenizer.pad_token_id
-
-        replaced_ids = []
-        # For each sample in the batch
-        for i in range(batch_size):
-            # Find indices of non-padding tokens
-            non_pad_mask = input_ids[i] != pad_token_id
-            non_pad_indices = torch.where(non_pad_mask)[0]
-
-            # If there are meaningful tokens, randomly select one to replace
-            if len(non_pad_indices) > 0:
-                # Randomly select an index from non-padding tokens
-                random_idx = non_pad_indices[random.randint(0, len(non_pad_indices) - 1)]
-                random_idx2 = non_pad_indices[random.randint(0, len(non_pad_indices) - 1)]
-                # Replace with padding token
-                input_ids[i, random_idx] = mask_token_id
-                replaced_ids.append(random_idx.item())
-            else:
-                replaced_ids.append(None)  # No replacement if all tokens are padding
-
-        # Second encoding with perturbed input
-        perturbed_prompt_embeds = self._encode_text(input_ids)
-
-        """
-        l2_loss, l1_loss, pad_loss = self.compute_perturbation_loss(
-            prompt_embeds, perturbed_prompt_embeds, replaced_ids, batch_encoding
-        )
-        """
-
-        with torch.no_grad():
-            if image is not None:
-                clip_image_embeds = self.image_encoder(image.to(self.device)).image_embeds
-            else:
-                clip_image_embeds = None
-
-        #return prompt_embeds, l2_loss, l1_loss, pad_loss, clip_image_embeds, attn_mask
-        return prompt_embeds, clip_image_embeds, perturbed_prompt_embeds, replaced_ids, self.t5_tokenizer, batch_encoding
-
-
-import torch.func as func
-
-class FullJacobianLoraT5Embedder(torch.nn.Module):
-    def __init__(self, device, rank=64, max_length=512, use_gradient_checkpointing=True,
-                 num_jacobian_samples=1):
-        super().__init__()
-        self.device = device
-        self.max_length = max_length
-        self.use_gradient_checkpointing = use_gradient_checkpointing
-        self.num_jacobian_samples = num_jacobian_samples  # Number of random columns to sample
-
-        dtype = torch.bfloat16
-        self.dtype = dtype
-        t5_version = './t5-v1_1-xxl'
-        self.t5_tokenizer = T5Tokenizer.from_pretrained(t5_version, max_length=max_length)
-
-        self.t5_encoder = T5EncoderModel.from_pretrained(
-            t5_version,
-            dtype=dtype
-        ).to(device=device).to(dtype)
-
-        self.t5_encoder.requires_grad_(False)
-
-        # Add LoRA adapters to the T5 model
-        text_lora_config = LoraConfig(
-            r=rank,
-            lora_alpha=rank,
-            lora_dropout=0.0,
-            init_lora_weights="gaussian",
-            target_modules=["q", "k", "v", "o", "wi", "wo"],
-        )
-        self.t5_encoder.add_adapter(text_lora_config)
-        self.t5_encoder.encoder.embed_tokens.weight.requires_grad_(True)
-
-        # Manually implement gradient checkpointing for T5 encoder blocks
-        if self.use_gradient_checkpointing:
-            self._enable_gradient_checkpointing()
-
-        print(f"Gradient checkpointing enabled: {self.use_gradient_checkpointing}")
-        print(f"Jacobian samples per batch: {self.num_jacobian_samples}")
-
-        image_encoder_path = './clip-vit-large-patch14'
-        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-            image_encoder_path
-        ).to(device=device).to(torch.bfloat16)
-        self.image_encoder = self.image_encoder.eval().requires_grad_(False)
-
-    def compute_jacobian_loss(self, input_embeds, attention_mask):
-        """
-        Compute L1 Jacobian sparsity loss using forward-mode AD (JVP).
-
-        Note: Temporarily disables gradient checkpointing as it's incompatible with JVP.
-        """
-        batch_size, seq_len, hidden_dim = input_embeds.shape
-        input_embeds = input_embeds[:1]
-        attention_mask = attention_mask[:1]
-
-        # Temporarily disable gradient checkpointing
-        original_checkpointing = self.use_gradient_checkpointing
-        if original_checkpointing:
-            self._disable_gradient_checkpointing()
-
-        if True:
-            if True:
-                """
-                Compute same-token and cross-token Jacobian sparsity losses.
-                Assumes left-aligned mask: attention_mask[b] = [1...1, 0...0]
-                Probes one (token, dim) per batch element per JVP sample.
-                """
-                B, S, H = input_embeds.shape
-                device = input_embeds.device
-
-                # Count valid tokens per batch element
-                lengths = attention_mask.sum(dim=1)  # [B]
-                valid_batch = lengths > 0
-                if valid_batch.sum() == 0:
-                    z = input_embeds.new_zeros(())
-                    return z, z
-
-                same_token_loss = input_embeds.new_zeros(())
-                cross_token_loss = input_embeds.new_zeros(())
-
-                def model_fn(embeds):
-                    return self.t5_encoder.encoder(
-                        inputs_embeds=embeds,
-                        attention_mask=None,
-                        output_hidden_states=False,
-                    ).last_hidden_state
-
-                batch_idx = torch.arange(B, device=device)
-
-                for _ in range(self.num_jacobian_samples):
-                    # Sample one valid token position per batch element
-                    t = torch.zeros(B, dtype=torch.long, device=device)
-                    u = torch.rand(B, device=device)
-                    # For valid batches: uniform over [0, lengths[b])
-                    # For invalid batches: stays 0 (doesn't matter, will be masked out)
-                    t[valid_batch] = (u[valid_batch] * lengths[valid_batch].float()).long()
-
-                    # Sample one hidden dim per batch element
-                    k = torch.randint(0, H, (B,), device=device)
-
-                    # Tangent: one scalar per batch element at position [b, t[b], k[b]]
-                    tangent = torch.zeros_like(input_embeds)
-                    tangent[batch_idx, t, k] = 1.0
-
-                    # JVP
-                    _, jvp = func.jvp(model_fn, (input_embeds,), (tangent,))
-                    abs_jvp = jvp.abs()  # [B, S, H]
-
-                    # SAME-token: diagonal element for each batch
-                    diag = abs_jvp[batch_idx, t, :]  # [B, H]
-                    same_token_loss = same_token_loss + diag[valid_batch].sum()
-
-                    # CROSS-token: all valid positions except diagonal
-                    # Create position mask: valid positions are [0, lengths[b])
-                    pos = torch.arange(S, device=device).unsqueeze(0)  # [1, S]
-                    valid_pos_mask = pos < lengths.unsqueeze(1)  # [B, S]
-
-                    # Exclude diagonal
-                    cross_mask = valid_pos_mask.clone()
-                    cross_mask[batch_idx, t] = False
-
-                    cross_token_loss = cross_token_loss + abs_jvp[cross_mask].sum()
-
-                # ---- Normalization (keep as tensors for AMP) ----
-                num_valid_batches = valid_batch.sum()  # Keep as tensor
-
-                # Same-token: mean per output element over (num_samples × num_valid_batches × H)
-                same_token_loss = same_token_loss / (self.num_jacobian_samples * num_valid_batches)
-
-                # Cross-token: mean per output element over (num_samples × total_cross_positions × H)
-                # total_cross_positions = sum over valid batches of (lengths[b] - 1)
-                cross_counts = (lengths[valid_batch] - 1).clamp(min=0).sum()  # Keep as tensor
-
-                if cross_counts > 0:
-                    cross_token_loss = cross_token_loss / (self.num_jacobian_samples * cross_counts)
-                else:
-                    cross_token_loss = input_embeds.new_zeros(())
-
-        # Re-enable gradient checkpointing
-        if original_checkpointing:
-            self._enable_gradient_checkpointing()
-
-        return same_token_loss, cross_token_loss
-
-    def _disable_gradient_checkpointing(self):
-        """Restore original forward methods without checkpointing."""
-        for block in self.t5_encoder.encoder.block:
-            if hasattr(block, '_original_forward'):
-                block.forward = block._original_forward
-
-    def _enable_gradient_checkpointing(self):
-        """Manually wrap T5 encoder blocks with gradient checkpointing."""
-        from torch.utils.checkpoint import checkpoint as cp
-
-        # Wrap each T5 block with checkpointing
-        for block in self.t5_encoder.encoder.block:
-            # Store original forward if not already stored
-            if not hasattr(block, '_original_forward'):
-                block._original_forward = block.forward
-
-            # Create checkpointed forward
-            def make_checkpointed_forward(blk):
-                def checkpointed_forward(*args, **kwargs):
-                    def forward_wrapper(*inputs):
-                        hidden_states = inputs[0]
-                        attention_mask = inputs[1] if len(inputs) > 1 else None
-                        position_bias = inputs[2] if len(inputs) > 2 else None
-
-                        return blk._original_forward(
-                            hidden_states=hidden_states,
-                            attention_mask=attention_mask,
-                            position_bias=position_bias,
-                            **{k: v for k, v in kwargs.items() if
-                               k not in ['hidden_states', 'attention_mask', 'position_bias']}
-                        )
-
-                    hidden_states = kwargs.get('hidden_states', args[0] if args else None)
-                    attention_mask = kwargs.get('attention_mask', args[1] if len(args) > 1 else None)
-                    position_bias = kwargs.get('position_bias', args[2] if len(args) > 2 else None)
-
-                    checkpoint_inputs = [hidden_states]
-                    if attention_mask is not None:
-                        checkpoint_inputs.append(attention_mask)
-                    if position_bias is not None:
-                        checkpoint_inputs.append(position_bias)
-
-                    return cp(
-                        forward_wrapper,
-                        *checkpoint_inputs,
-                        use_reentrant=False
-                    )
-
-                return checkpointed_forward
-
-            block.forward = make_checkpointed_forward(block)
-
-    def forward(self, text, image=None, compute_jacobian=False):
-        """
-        Forward pass with optional Jacobian regularization.
-
-        Args:
-            text: Input text (string or list of strings)
-            image: Optional image input
-            compute_jacobian: Whether to compute Jacobian loss (set False during inference)
-
-        Returns:
-            prompt_embeds: T5 encoder output
-            clip_image_embeds: CLIP image embeddings (if image provided)
-            jacobian_loss: Jacobian sparsity loss (if compute_jacobian=True)
-            attn_mask: Attention mask
-        """
-        if isinstance(text, str):
-            text = [text]
-
-        batch_encoding = self.t5_tokenizer(
-            text,
-            truncation=True,
-            max_length=self.max_length,
-            return_length=False,
-            return_overflowing_tokens=False,
-            padding="max_length",
-            return_tensors="pt",
-        )
-        attn_mask = batch_encoding["attention_mask"].to(self.device)
-
-        # Get input embeddings
-        input_ids = batch_encoding["input_ids"].to(self.device)
-        input_embeds = self.t5_encoder.encoder.embed_tokens(input_ids)
-
-        # Forward pass through encoder
-        prompt_embeds = self.t5_encoder.encoder(
-            inputs_embeds=input_embeds,
-            attention_mask=None,
-            output_hidden_states=False,
-        ).last_hidden_state
-
-        # Compute Jacobian loss if requested (during training)
-        jacobian_loss = {}
-        if compute_jacobian:
-            jacobian_same_loss, jacobian_cross_loss = self.compute_jacobian_loss(input_embeds, attn_mask)
-            jacobian_loss["same_token"] = jacobian_same_loss
-            jacobian_loss["cross_token"] = jacobian_cross_loss
-        else:
-            jacobian_loss['same_token'] = torch.tensor(0.0, device=self.device)
-            jacobian_loss['cross_token'] = torch.tensor(0.0, device=self.device)
-
-        # Encode image
-        with torch.no_grad():
-            if image is not None:
-                clip_image_embeds = self.image_encoder(image.to(self.device)).image_embeds
-            else:
-                clip_image_embeds = None
-
-        return prompt_embeds, clip_image_embeds, jacobian_loss, attn_mask
-
-
-import torch
-from torch import nn, func
-from typing import Optional
-from transformers import T5Tokenizer, CLIPVisionModelWithProjection
-from transformers.models.t5.modeling_t5 import T5PreTrainedModel, T5Stack, T5Config
 
 import torch
 from torch import nn, func
@@ -1007,101 +319,6 @@ class JacobianT5Encoder(T5PreTrainedModel):
 
         return hidden_states, position_bias, cache_position
 
-    def compute_jacobian_loss(self, second_last_output, position_bias, cache_position, attention_mask):
-        """
-        Compute L1 Jacobian sparsity loss using forward-mode AD (JVP).
-        Only computes through the last block + final layer norm.
-
-        attention_mask is ONLY used for sampling valid tokens, NOT for masking during forward.
-        """
-        batch_size, seq_len, hidden_dim = second_last_output.shape
-
-        # Use only first sample for Jacobian
-        second_last_output = second_last_output[:8]
-        position_bias_sample = position_bias[:8] if position_bias is not None else None
-        attention_mask = attention_mask[:8]
-
-        last_block = self.encoder.block[-1]
-        final_layer_norm = self.encoder.final_layer_norm
-
-        B, S, H = second_last_output.shape
-        device = second_last_output.device
-
-        # Use attention_mask ONLY to determine valid tokens for sampling
-        lengths = attention_mask.sum(dim=1)
-        valid_batch = lengths > 0
-
-        if valid_batch.sum() == 0:
-            z = second_last_output.new_zeros(())
-            return z, z
-
-        same_token_loss = second_last_output.new_zeros(())
-        cross_token_loss = second_last_output.new_zeros(())
-
-        def model_fn(embeds):
-            """Forward through ONLY the last block + final layer norm (NO MASKING)"""
-            layer_outputs = last_block(
-                embeds,
-                None,  # No attention mask - all tokens attend to all
-                position_bias_sample,
-                None, None, None,
-                past_key_values=None,
-                use_cache=False,
-                output_attentions=False,
-                return_dict=True,
-                cache_position=cache_position,
-            )
-            hidden = layer_outputs[0]
-            hidden = final_layer_norm(hidden)
-            hidden = self.encoder.dropout(hidden)
-            return hidden
-
-        batch_idx = torch.arange(B, device=device)
-
-        for _ in range(self.num_jacobian_samples):
-            # Sample one valid token position per batch element
-            # Use attention_mask to know which tokens are valid (not padding)
-            t = torch.zeros(B, dtype=torch.long, device=device)
-            u = torch.rand(B, device=device)
-            t[valid_batch] = (u[valid_batch] * lengths[valid_batch].float()).long()
-
-            # Sample one hidden dim per batch element
-            k = torch.randint(0, H, (B,), device=device)
-
-            # Tangent: one scalar per batch element at position [b, t[b], k[b]]
-            tangent = torch.zeros_like(second_last_output)
-            tangent[batch_idx, t, k] = 1.0
-
-            # JVP through ONLY the last block
-            _, jvp = func.jvp(model_fn, (second_last_output,), (tangent,))
-            abs_jvp = jvp.abs()
-
-            # SAME-token: diagonal element for each batch
-            diag = abs_jvp[batch_idx, t, :]
-            same_token_loss = same_token_loss + diag[valid_batch].sum()
-
-            # CROSS-token: all valid positions except diagonal
-            # Use attention_mask to know which positions are valid
-            pos = torch.arange(S, device=device).unsqueeze(0)
-            valid_pos_mask = pos < lengths.unsqueeze(1)
-
-            # Exclude diagonal
-            cross_mask = valid_pos_mask.clone()
-            cross_mask[batch_idx, t] = False
-
-            cross_token_loss = cross_token_loss + abs_jvp[cross_mask].sum()
-
-        # Normalization
-        num_valid_batches = valid_batch.sum()
-        same_token_loss = same_token_loss / (self.num_jacobian_samples * num_valid_batches)
-
-        cross_counts = (lengths[valid_batch] - 1).clamp(min=0).sum()
-        if cross_counts > 0:
-            cross_token_loss = cross_token_loss / (self.num_jacobian_samples * cross_counts)
-        else:
-            cross_token_loss = second_last_output.new_zeros(())
-
-        return same_token_loss, cross_token_loss
 
     def forward(
         self,
@@ -1180,20 +397,7 @@ class JacobianT5Encoder(T5PreTrainedModel):
         hidden_states = self.encoder.dropout(hidden_states)
 
         # Compute Jacobian loss if requested
-        jacobian_loss = {
-        if compute_jacobian:
-            jacobian_same_loss, jacobian_cross_loss = self.compute_jacobian_loss(
-                second_last_output,
-                position_bias,
-                cache_position,
-                attention_mask  # Used ONLY for sampling valid tokens
-            )
-            jacobian_loss = {
-                "same_token": jacobian_same_loss,
-                "cross_token": jacobian_cross_loss
-            }
-        else:
-            jacobian_loss = {
+        jacobian_loss = {
             "same_token": torch.tensor(0.0, device=input_ids.device),
             "cross_token": torch.tensor(0.0, device=input_ids.device)
         }
@@ -1212,11 +416,11 @@ class JacobianLoraT5Embedder(nn.Module):
 
         # Load T5 config
         from transformers import T5Config
-        config = T5Config.from_pretrained('
+        config = T5Config.from_pretrained('google/t5-v1_1-xxl')
 
         # Create encoder model
         self.t5_encoder = JacobianT5Encoder.from_pretrained(
-            '
+            'google/t5-v1_1-xxl',
             config=config,
             num_jacobian_samples=num_jacobian_samples,
             max_length=max_length
@@ -1224,18 +428,15 @@ class JacobianLoraT5Embedder(nn.Module):
         self.dtype = torch.bfloat16
 
         # Tokenizer
-        self.t5_tokenizer = T5Tokenizer.from_pretrained('
+        self.t5_tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl', max_length=max_length)
 
         # Image encoder
-        image_encoder_path = '
+        image_encoder_path = 'openai/clip-vit-large-patch14'
         self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             image_encoder_path
         ).to(device=device).to(torch.bfloat16)
         self.image_encoder = self.image_encoder.eval().requires_grad_(False)
 
-        print(f"Gradient checkpointing: {use_gradient_checkpointing} (using T5's built-in)")
-        print(f"Jacobian samples per batch: {num_jacobian_samples}")
-        print(f"NO ATTENTION MASKING during forward pass - all tokens attend to all tokens")
 
     def forward(self, text, image=None, compute_jacobian=False):
         """
@@ -1285,217 +486,3 @@ class JacobianLoraT5Embedder(nn.Module):
         return prompt_embeds, clip_image_embeds, jacobian_loss, attn_mask
 
 
-
-import gc
-from PIL import Image
-from transformers import AutoProcessor
-import numpy as np
-
-
-def get_gpu_memory():
-    """Get current GPU memory usage in MB"""
-    return torch.cuda.memory_allocated() / 1024 ** 2
-
-
-def get_peak_memory():
-    """Get peak GPU memory usage in MB"""
-    return torch.cuda.max_memory_allocated() / 1024 ** 2
-
-
-def reset_peak_memory():
-    """Reset peak memory counter"""
-    torch.cuda.reset_peak_memory_stats()
-
-
-def clear_memory():
-    """Clear GPU cache and run garbage collection"""
-    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.reset_peak_memory_stats()
-
-
-def test_memory_usage():
-    """Test memory usage with and without Jacobian loss"""
-
-
-    # Initialize model
-    print("=" * 80)
-    print("Initializing model...")
-    clear_memory()
-
-    model = JacobianLoraT5Embedder(
-        device="cuda:0",
-        use_gradient_checkpointing=True,
-        num_jacobian_samples=10
-    )
-
-    clip_processor = AutoProcessor.from_pretrained("./clip-vit-large-patch14", use_fast=True)
-
-    init_memory = get_gpu_memory()
-    print(f"Memory after model init: {init_memory:.2f} MB")
-    print("=" * 80)
-
-    # Prepare inputs
-    image = Image.open('example512.jpg').convert('RGB')
-    prompt = """A heartwarming 3D rendered scene of
-an elderly farmer and a tiny orange
-kitten. The farmer, with a gentle smile,
-walks alongside the kitten in a lush,
-green garden filled with thriving plants,
-showcasing a fruitful harvest. The
-intricate details of the overalls and the
-farmer's worn, weathered face tell a
-story of years spent tending to the land, the farmer is wearing a blue shirt"""
-
-    # Test different batch sizes
-    batch_sizes = [1, 2, 5, 10]
-
-    results = []
-
-    for batch_size in batch_sizes:
-        print(f"\n{'=' * 80}")
-        print(f"BATCH SIZE: {batch_size}")
-        print(f"{'=' * 80}")
-
-        text_batch = [prompt] * batch_size
-        pixel_values = clip_processor(
-            images=image,
-            return_tensors="pt"
-        ).pixel_values.to("cuda:0").to(torch.bfloat16)
-
-        # Test WITHOUT Jacobian
-        print(f"\n--- WITHOUT Jacobian Loss ---")
-        clear_memory()
-        reset_peak_memory()
-
-        mem_before = get_gpu_memory()
-        print(f"Memory before forward: {mem_before:.2f} MB")
-
-        with torch.no_grad():
-            prompt_embeds, clip_image_embeds, jacobian_loss, attn_mask = model(
-                text_batch,
-                image=pixel_values,
-                compute_jacobian=False
-            )
-
-        mem_after = get_gpu_memory()
-        peak_mem = get_peak_memory()
-
-        print(f"Memory after forward: {mem_after:.2f} MB")
-        print(f"Peak memory: {peak_mem:.2f} MB")
-        print(f"Memory increase: {mem_after - mem_before:.2f} MB")
-        print(f"Peak increase: {peak_mem - mem_before:.2f} MB")
-
-        no_jac_peak = peak_mem - mem_before
-
-        # Clean up
-        del prompt_embeds, clip_image_embeds, jacobian_loss, attn_mask
-
-        # Test WITH Jacobian (requires grad)
-        print(f"\n--- WITH Jacobian Loss ---")
-        clear_memory()
-        reset_peak_memory()
-
-        mem_before = get_gpu_memory()
-        print(f"Memory before forward: {mem_before:.2f} MB")
-
-        try:
-            prompt_embeds, clip_image_embeds, jacobian_loss, attn_mask = model(
-                text_batch,
-                image=pixel_values,
-                compute_jacobian=True
-            )
-
-            mem_after = get_gpu_memory()
-            peak_mem = get_peak_memory()
-
-            print(f"Memory after forward: {mem_after:.2f} MB")
-            print(f"Peak memory: {peak_mem:.2f} MB")
-            print(f"Memory increase: {mem_after - mem_before:.2f} MB")
-            print(f"Peak increase: {peak_mem - mem_before:.2f} MB")
-
-            if jacobian_loss is not None:
-                print(f"\nJacobian Loss Values:")
-                print(f"  Same-token loss: {jacobian_loss['same_token'].item():.6f}")
-                print(f"  Cross-token loss: {jacobian_loss['cross_token'].item():.6f}")
-
-            with_jac_peak = peak_mem - mem_before
-
-            print(f"\n{'*' * 60}")
-            print(f"JACOBIAN OVERHEAD: {with_jac_peak - no_jac_peak:.2f} MB")
-            print(f"MEMORY MULTIPLIER: {with_jac_peak / no_jac_peak:.2f}x")
-            print(f"{'*' * 60}")
-
-            results.append({
-                'batch_size': batch_size,
-                'no_jacobian_mb': no_jac_peak,
-                'with_jacobian_mb': with_jac_peak,
-                'overhead_mb': with_jac_peak - no_jac_peak,
-                'multiplier': with_jac_peak / no_jac_peak
-            })
-
-        except RuntimeError as e:
-            print(f"❌ CUDA OUT OF MEMORY with Jacobian at batch_size={batch_size}")
-            print(f"Error: {str(e)}")
-            results.append({
-                'batch_size': batch_size,
-                'no_jacobian_mb': no_jac_peak,
-                'with_jacobian_mb': float('inf'),
-                'overhead_mb': float('inf'),
-                'multiplier': float('inf')
-            })
-
-        # Clean up
-        del prompt_embeds, clip_image_embeds, jacobian_loss, attn_mask
-        clear_memory()
-
-    # Print summary table
-    print(f"\n\n{'=' * 80}")
-    print("SUMMARY TABLE")
-    print(f"{'=' * 80}")
-    print(f"{'Batch':>6} | {'No Jacobian':>12} | {'With Jacobian':>14} | {'Overhead':>10} | {'Multiplier':>10}")
-    print(f"{'Size':>6} | {'(MB)':>12} | {'(MB)':>14} | {'(MB)':>10} | {'':>10}")
-    print(f"{'-' * 80}")
-
-    for r in results:
-        batch = r['batch_size']
-        no_jac = r['no_jacobian_mb']
-        with_jac = r['with_jacobian_mb']
-        overhead = r['overhead_mb']
-        mult = r['multiplier']
-
-        if overhead == float('inf'):
-            print(f"{batch:>6} | {no_jac:>11.2f} | {'OOM':>14} | {'OOM':>10} | {'OOM':>10}")
-        else:
-            print(f"{batch:>6} | {no_jac:>11.2f} | {with_jac:>13.2f} | {overhead:>9.2f} | {mult:>9.2f}x")
-
-    print(f"{'=' * 80}")
-
-    # Comparison with original
-    print(f"\n\n{'=' * 80}")
-    print("COMPARISON WITH ORIGINAL IMPLEMENTATION")
-    print(f"{'=' * 80}")
-    print("\nORIGINAL (all 24 blocks in Jacobian):")
-    print("  Batch 1: 30,900 MB overhead, 144x multiplier")
-    print("  Batch 10: 30,328 MB overhead, 15x multiplier")
-    print("\nNEW (only last block in Jacobian):")
-    if len(results) > 0:
-        r1 = results[0]
-        r10 = results[-1] if len(results) >= 4 else results[-1]
-        print(f"  Batch 1: {r1['overhead_mb']:>6.0f} MB overhead, {r1['multiplier']:>4.1f}x multiplier")
-        print(f"  Batch 10: {r10['overhead_mb']:>6.0f} MB overhead, {r10['multiplier']:>4.1f}x multiplier")
-
-        if r1['overhead_mb'] != float('inf'):
-            reduction = 30900 / r1['overhead_mb']
-            print(f"\n🎉 MEMORY REDUCTION: {reduction:.1f}x improvement!")
-
-    print(f"{'=' * 80}")
-
-
-if __name__ == "__main__":
-    # Set random seed for reproducibility
-    torch.manual_seed(42)
-    np.random.seed(42)
-
-    # Run test
-    test_memory_usage()
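For reference, a minimal usage sketch of the `JacobianLoraT5Embedder` kept by this commit, based only on the signatures visible in the diff: `__init__(device, use_gradient_checkpointing, num_jacobian_samples, ...)` and `forward(text, image=None, compute_jacobian=False)` returning `(prompt_embeds, clip_image_embeds, jacobian_loss, attn_mask)`. The optimizer choice and loss weights below are illustrative assumptions, not part of the commit.

# Hypothetical usage sketch (assumptions noted above): drive the retained
# embedder and fold its same-token / cross-token Jacobian penalties into a loss.
import torch
from text_encoder import JacobianLoraT5Embedder

embedder = JacobianLoraT5Embedder(device="cuda:0",
                                  use_gradient_checkpointing=True,
                                  num_jacobian_samples=1)
# Only the LoRA / embedding parameters require grad after construction.
optimizer = torch.optim.AdamW(
    [p for p in embedder.parameters() if p.requires_grad], lr=1e-4)

texts = ["a photo of a cat sitting on a windowsill"]
prompt_embeds, clip_image_embeds, jacobian_loss, attn_mask = embedder(
    texts, image=None, compute_jacobian=True)

# Illustrative weights for the two sparsity terms (assumed values).
loss = 0.1 * jacobian_loss["same_token"] + 1.0 * jacobian_loss["cross_token"]
loss.backward()
optimizer.step()
optimizer.zero_grad()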