Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

modeling_safellava.py +1 -1
safellava/model/language_model/safe_llava_llama.py +281 -89

modeling_safellava.py CHANGED Viewed

@@ -8,7 +8,7 @@ SafeLLaVA adds image safety classification capabilities to LLaVA.
 """
 # Re-export classes from safellava package for HuggingFace auto_map
-from safellava.model.language_model.safe_llava_llama_pool import (
     SafetyConfig,
     SafeLlavaLlamaForCausalLM,
     SafetyCausalLMOutputWithPast,

 """
 # Re-export classes from safellava package for HuggingFace auto_map
+from safellava.model.language_model.safe_llava_llama import (
     SafetyConfig,
     SafeLlavaLlamaForCausalLM,
     SafetyCausalLMOutputWithPast,

safellava/model/language_model/safe_llava_llama.py CHANGED Viewed

@@ -1,10 +1,3 @@
-"""
-Based on LLaVA v1.5: https://github.com/haotian-liu/LLaVA
-Modified for SafeLLaVA
-Original LLaVA License: Apache License 2.0
-"""
 from typing import List, Optional, Tuple, Union, Dict
 import torch
@@ -12,15 +5,15 @@ import torch.nn as nn
 from transformers import AutoConfig, AutoModelForCausalLM
 from transformers.modeling_outputs import CausalLMOutputWithPast
-from safellava.model.language_model.llava_llama import (
     LlavaConfig, LlavaLlamaModel, LlavaLlamaForCausalLM
 )
-from safellava.constants import IMAGE_TOKEN_INDEX
 from dataclasses import dataclass
 import logging
-from safellava.utils import setup_simple_logging
 setup_simple_logging()
@@ -65,7 +58,7 @@ class SafetyMLP(nn.Module):
 class SafetyConfig(LlavaConfig):
-    """Safety-aware configuration for pooling version """
     model_type = "safe_llava_llama"
     def __init__(
@@ -115,18 +108,19 @@ class SafetyConfig(LlavaConfig):
         self.safety_head_hidden_scale = safety_head_hidden_scale
         self.pooling_method = pooling_method
-        # self.use_img_safety_meta_token = False
         self.use_txt_safety_meta_token = False
         self.use_total_safety_meta_token = False
 class SafeLlavaLlamaForCausalLM(LlavaLlamaForCausalLM):
     """
-    SafeLLaVA: A simplified version Uses pooled visual features for safety classification.
     """
     config_class = SafetyConfig
-    _keys_to_ignore_on_load_unexpected = []  # Don't ignore img_safety_head weights
     def __init__(self, config: SafetyConfig):
         super().__init__(config)
@@ -138,7 +132,7 @@ class SafeLlavaLlamaForCausalLM(LlavaLlamaForCausalLM):
             output_size=len(config.safety_categories),
             safety_num_hidden_layers=config.safety_num_hidden_layers
         )
-        logging.info("Created img_safety_head for SafeLLaVA")
         # Store pooling method
         self.pooling_method = config.pooling_method
@@ -153,79 +147,6 @@ class SafeLlavaLlamaForCausalLM(LlavaLlamaForCausalLM):
     def get_model(self):
         return self.model
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        """
-        Custom from_pretrained to properly load img_safety_head weights.
-        """
-        import os
-        import torch
-        from pathlib import Path
-        # Load the model normally first
-        model = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        # List of original LLaVA model names
-        original_llava_models = [
-            "liuhaotian/llava-v1.5-7b",
-            "liuhaotian/llava-v1.5-13b",
-        ]
-        is_original_llava = any(str(pretrained_model_name_or_path).startswith(name) for name in original_llava_models)
-        # Load safety head weights for SafeLLaVA models
-        if not is_original_llava:
-            logging.info(f"Detected SafeLLaVA model: {pretrained_model_name_or_path}")
-            model_path = Path(pretrained_model_name_or_path)
-            # Handle both local paths and HuggingFace Hub
-            if not model_path.exists():
-                # Try HuggingFace cache
-                from huggingface_hub import snapshot_download
-                try:
-                    model_path = Path(snapshot_download(repo_id=str(pretrained_model_name_or_path)))
-                    logging.info(f"Downloaded from HuggingFace Hub to: {model_path}")
-                except Exception as e:
-                    logging.warning(f"Could not download from Hub: {e}")
-                    return model
-            if model_path.exists():
-                # Load safety head weights from safetensors
-                safetensors_index_path = model_path / "model.safetensors.index.json"
-                if safetensors_index_path.exists():
-                    logging.info("Loading safety head weights from safetensors...")
-                    from safetensors.torch import load_file
-                    import json
-                    # Load the index file
-                    with open(safetensors_index_path, 'r') as f:
-                        index_data = json.load(f)
-                    # Load all safetensors files and collect safety head weights
-                    safety_weights = {}
-                    for weight_map in set(index_data.get('weight_map', {}).values()):
-                        safetensors_file = model_path / weight_map
-                        if safetensors_file.exists():
-                            file_weights = load_file(str(safetensors_file))
-                            # Extract only img_safety_head weights
-                            for key, value in file_weights.items():
-                                if key.startswith('img_safety_head.'):
-                                    safety_weights[key] = value
-                    if safety_weights:
-                        logging.info(f"Found {len(safety_weights)} img_safety_head weights")
-                        # Load the weights
-                        missing_keys, unexpected_keys = model.load_state_dict(safety_weights, strict=False)
-                        logging.info("✅ Safety head weights loaded successfully")
-                    else:
-                        logging.warning("⚠️  No img_safety_head weights found in checkpoint")
-                else:
-                    logging.warning(f"No safetensors index found at {safetensors_index_path}")
-            else:
-                logging.warning(f"Model path does not exist: {model_path}")
-        return model
     def get_safety_warning(self, unsafe_categories):
         if len(unsafe_categories) == 1:
             category_str = f"related to {unsafe_categories[0]}"
@@ -314,6 +235,277 @@ class SafeLlavaLlamaForCausalLM(LlavaLlamaForCausalLM):
         pooled_features = torch.stack(pooled_features, dim=0)
         return pooled_features
     def forward(
         self,
         input_ids=None,
@@ -332,7 +524,7 @@ class SafeLlavaLlamaForCausalLM(LlavaLlamaForCausalLM):
         **kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast, SafetyCausalLMOutputWithPast]:
         """
-        Forward method for SafeLLaVA.
         When do_safety=True, extracts and pools visual tokens for safety classification.
         """

 from typing import List, Optional, Tuple, Union, Dict
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM
 from transformers.modeling_outputs import CausalLMOutputWithPast
+from llava.model.language_model.llava_llama import (
     LlavaConfig, LlavaLlamaModel, LlavaLlamaForCausalLM
 )
+from llava.constants import IMAGE_TOKEN_INDEX
 from dataclasses import dataclass
 import logging
+from llava.utils import setup_simple_logging
 setup_simple_logging()
 class SafetyConfig(LlavaConfig):
+    """Safety-aware configuration for pooling version without meta tokens"""
     model_type = "safe_llava_llama"
     def __init__(
         self.safety_head_hidden_scale = safety_head_hidden_scale
         self.pooling_method = pooling_method
+        # Pool version doesn't use meta tokens
+        self.use_img_safety_meta_token = False
         self.use_txt_safety_meta_token = False
         self.use_total_safety_meta_token = False
 class SafeLlavaLlamaForCausalLM(LlavaLlamaForCausalLM):
     """
+    SafeLLaVA-Pool: A simplified version without meta tokens.
+    Pools visual tokens directly for safety classification.
     """
     config_class = SafetyConfig
     def __init__(self, config: SafetyConfig):
         super().__init__(config)
             output_size=len(config.safety_categories),
             safety_num_hidden_layers=config.safety_num_hidden_layers
         )
+        logging.info("Created img_safety_head for SafeLLaVA-Pool")
         # Store pooling method
         self.pooling_method = config.pooling_method
     def get_model(self):
         return self.model
     def get_safety_warning(self, unsafe_categories):
         if len(unsafe_categories) == 1:
             category_str = f"related to {unsafe_categories[0]}"
         pooled_features = torch.stack(pooled_features, dim=0)
         return pooled_features
+    def compute_gradcam(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        images=None,
+        image_sizes=None,
+        target_class=None,
+        use_pre_pooling=False,
+        **kwargs,
+    ):
+        """
+        Compute Grad-CAM for the image safety classification.
+        Runs one forward pass with ``do_safety=True``, back-propagates the target
+        class logit and combines the captured activations with their gradients.
+        The vision tower's ``forward`` (and, with ``use_pre_pooling=True``,
+        ``self.pool_visual_tokens``) is patched only for the duration of the call.
+        The patches, the vision tower's ``requires_grad`` flags and its train/eval
+        mode are restored on exit, including when an exception is raised.
+        Args:
+            input_ids: Input token IDs
+            attention_mask: Attention mask
+            images: Input images tensor [batch_size, 3, H, W]
+            image_sizes: Image sizes
+            target_class: Target class index for Grad-CAM. If None, uses the predicted class.
+            use_pre_pooling: If True, use the LLM hidden states handed to
+                ``pool_visual_tokens`` instead of the vision tower output.
+            **kwargs: Forwarded unchanged to ``self.forward``.
+        Returns:
+            dict with keys:
+                - 'heatmap': Grad-CAM heatmap as a numpy array [batch_size, H_feat, W_feat]
+                - 'predicted_class': Target class index (numpy array when a tensor was used)
+                - 'predicted_prob': Probability of the target class per sample
+                - 'class_name': Name of the target class of the first sample
+                - 'all_probs': Probabilities of all safety categories
+        Raises:
+            ValueError: If ``images`` is None.
+            AttributeError: If the model has no vision tower.
+            RuntimeError: If no activations or gradients could be captured.
+        """
+        if images is None:
+            raise ValueError("Images are required for Grad-CAM computation")
+        # Resolve and validate the vision tower BEFORE touching any of its attributes.
+        vision_tower = self.get_vision_tower()
+        if vision_tower is None:
+            raise AttributeError("Vision tower not found")
+        # Snapshot every piece of state this method mutates so it can be put back.
+        was_training = self.training
+        was_vision_training = vision_tower.training
+        saved_requires_grad = [(param, param.requires_grad) for param in vision_tower.parameters()]
+        original_forward = vision_tower.forward
+        original_pool_method = None  # only patched when use_pre_pooling is set
+        # Store activations and gradients for Grad-CAM
+        activations = []
+        gradients = []
+        pre_pool_activations = []
+        pre_pool_gradients = []
+        def save_gradient(grad):
+            """Tensor hook: record the gradient flowing into a captured activation."""
+            logging.info(f"Gradient hook called! Grad shape: {grad.shape}")
+            gradients.append(grad.detach())
+        def forward_hook(module, inputs, output):
+            """Forward hook to save activations and register backward hook"""
+            activation = output[0] if isinstance(output, tuple) else output
+            logging.info(f"Forward hook: activation shape={activation.shape}, requires_grad={activation.requires_grad}")
+            # Register backward hook on the activation tensor itself BEFORE saving
+            if activation.requires_grad:
+                activation.register_hook(save_gradient)
+                logging.info("Registered backward hook on activation")
+            else:
+                logging.warning("Activation does not require grad, cannot register backward hook!")
+            activations.append(activation)
+        def forward_with_grad(*args, **kwargs):
+            """Call the real vision tower forward and force its output into the autograd graph."""
+            output = original_forward(*args, **kwargs)
+            if not output.requires_grad:
+                # The tower may run without grad tracking; re-enter autograd here so the
+                # safety logit can be differentiated with respect to the features.
+                output = output.clone().requires_grad_(True)
+                output.register_hook(save_gradient)
+                activations.append(output)
+            return output
+        def pool_with_capture(hidden_states, input_ids, images):
+            """Record pre-pooling features and their gradient, then pool as usual."""
+            # NOTE(review): positions are looked up in `input_ids`; if the image placeholder
+            # is expanded into many patch embeddings before this point, this selects only
+            # the placeholder position(s), not the full patch span - confirm against forward().
+            sample_idx = None
+            positions = None
+            for idx in range(hidden_states.shape[0]):
+                candidate = (input_ids[idx] == IMAGE_TOKEN_INDEX).nonzero(as_tuple=True)[0]
+                if len(candidate) > 0:
+                    sample_idx, positions = idx, candidate
+                    break
+            if positions is not None and hidden_states.requires_grad:
+                pre_pool_activations.append(hidden_states[sample_idx, positions])
+                def save_pre_pool_gradient(grad):
+                    pre_pool_gradients.append(grad[sample_idx, positions].detach())
+                # Hook the tensor that actually feeds the pooling; a hook on a detached
+                # copy would never fire because nothing downstream consumes the copy.
+                hidden_states.register_hook(save_pre_pool_gradient)
+            return original_pool_method(hidden_states, input_ids, images)
+        hook_handle = vision_tower.register_forward_hook(forward_hook)
+        try:
+            # Vision tower goes to train mode with trainable parameters; merged LoRA
+            # checkpoints might have frozen parameters. Other modules keep their mode.
+            vision_tower.train()
+            for param in vision_tower.parameters():
+                param.requires_grad = True
+            # Ensure images require grad
+            if not images.requires_grad:
+                images = images.clone().detach().requires_grad_(True)
+            logging.info(f"Images requires_grad: {images.requires_grad}")
+            # Temporarily replace forward (and the pooling method when requested)
+            vision_tower.forward = forward_with_grad
+            if use_pre_pooling:
+                original_pool_method = self.pool_visual_tokens
+                self.pool_visual_tokens = pool_with_capture
+            # torch.enable_grad() forces gradient tracking even under an outer no_grad()
+            with torch.enable_grad():
+                outputs = self.forward(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    images=images,
+                    image_sizes=image_sizes,
+                    do_safety=True,
+                    return_dict=True,
+                    **kwargs
+                )
+                img_safety_logits = outputs.img_safety_logits
+                img_safety_probs = outputs.img_safety_probs
+                # Get predicted class if not specified
+                if target_class is None:
+                    target_class = img_safety_probs.argmax(dim=-1)
+                elif isinstance(target_class, int):
+                    # Ensure target_class is a tensor
+                    target_class = torch.tensor([target_class], device=img_safety_probs.device)
+                # Get the logit for the target class
+                batch_size = img_safety_probs.shape[0]
+                batch_index = torch.arange(batch_size, device=img_safety_logits.device)
+                target_logits = img_safety_logits[batch_index, target_class]
+                # Backward pass to compute gradients
+                self.zero_grad()
+                target_logits.sum().backward()
+            # Parameter gradients were only a by-product of the hooks; drop them again.
+            self.zero_grad()
+            # Choose which activations and gradients to use
+            if use_pre_pooling:
+                if not pre_pool_activations:
+                    raise RuntimeError("Failed to capture pre-pooling activations")
+                if not pre_pool_gradients:
+                    raise RuntimeError("Failed to capture pre-pooling gradients")
+                activation = pre_pool_activations[0].detach()
+                gradient = pre_pool_gradients[0]
+                # Add batch dimension if needed for consistency
+                if activation.dim() == 2:
+                    activation = activation.unsqueeze(0)
+                    gradient = gradient.unsqueeze(0)
+            else:
+                # Use the vision tower output captured by the forward hook / wrapper
+                if not activations:
+                    raise RuntimeError("Failed to capture activations")
+                if not gradients:
+                    raise RuntimeError("Failed to capture gradients")
+                activation = activations[0].detach()
+                gradient = gradients[0]
+            # Patch-wise gradient x activation, summed over the hidden dimension.
+            # (Classic Grad-CAM would instead average gradients over patches to get
+            # per-channel weights; this keeps the per-patch formulation.)
+            cam = (gradient * activation).sum(dim=-1)
+            # Apply ReLU (only positive contributions)
+            cam = torch.nn.functional.relu(cam)
+            # Reshape to a square 2D grid; assumes the token count is a perfect square
+            # (e.g. 24x24 for CLIP ViT-L/14-336px) - TODO confirm for other towers.
+            num_maps = cam.shape[0]
+            num_patches_per_side = int(cam.shape[1] ** 0.5)
+            cam = cam.reshape(num_maps, num_patches_per_side, num_patches_per_side)
+            # Normalize each map to [0, 1]; constant maps are left untouched
+            for i in range(num_maps):
+                cam_min = cam[i].min()
+                cam_max = cam[i].max()
+                if cam_max > cam_min:
+                    cam[i] = (cam[i] - cam_min) / (cam_max - cam_min)
+            # Get class names (reported for the first sample only)
+            if isinstance(target_class, torch.Tensor):
+                target_class_idx = target_class[0].item()
+            else:
+                target_class_idx = target_class
+            class_name = self.config.safety_categories[target_class_idx]
+            return {
+                'heatmap': cam.detach().cpu().numpy(),
+                'predicted_class': target_class.cpu().numpy() if isinstance(target_class, torch.Tensor) else target_class,
+                'predicted_prob': img_safety_probs[batch_index, target_class].detach().cpu().numpy(),
+                'class_name': class_name,
+                'all_probs': img_safety_probs.detach().cpu().numpy()
+            }
+        finally:
+            # Cleanup only - no computation or return here, so the original
+            # exception (if any) propagates unchanged.
+            hook_handle.remove()
+            vision_tower.forward = original_forward
+            if original_pool_method is not None:
+                self.pool_visual_tokens = original_pool_method
+            for param, flag in saved_requires_grad:
+                param.requires_grad = flag
+            # Restore training state; the vision tower goes last so that
+            # self.train() cannot override its original mode.
+            if was_training:
+                self.train()
+            vision_tower.train(was_vision_training)
     def forward(
         self,
         input_ids=None,
         **kwargs,
     ) -> Union[Tuple, CausalLMOutputWithPast, SafetyCausalLMOutputWithPast]:
         """
+        Forward method for SafeLLaVA-Pool.
         When do_safety=True, extracts and pools visual tokens for safety classification.
         """