Update app.py
app.py CHANGED
@@ -3,7 +3,7 @@ import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
 from torchvision import transforms
-from transformers import CLIPModel
+from transformers.models.clip import CLIPModel  # Updated import path
 from PIL import Image
 import numpy as np
 import io
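Both import paths expose the same class in current transformers releases, so this reads as a path cleanup rather than a behavior change; a quick check (assumes a reasonably recent transformers install):

    from transformers import CLIPModel as top_level
    from transformers.models.clip import CLIPModel as submodule

    assert top_level is submodule  # same class object either way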
@@ -64,7 +64,7 @@ max_tokens = st.sidebar.slider(
 # Custom instruction text area in sidebar
 custom_instruction = st.sidebar.text_area(
     "Custom Instructions (Advanced)",
-    value="Focus on analyzing the highlighted regions from the visualization. Examine facial inconsistencies, lighting irregularities, and other artifacts visible in the heat map.",
+    value="Focus on analyzing the highlighted regions from the GradCAM visualization. Examine facial inconsistencies, lighting irregularities, and other artifacts visible in the heat map.",
     help="Add specific instructions for the LLM analysis"
 )
 
@@ -73,8 +73,8 @@ st.sidebar.markdown("---")
 st.sidebar.subheader("About")
 st.sidebar.markdown("""
 This analyzer performs multi-stage detection:
-1. **Initial Detection**:
-2. **Visualization**: Highlights suspicious regions
+1. **Initial Detection**: CLIP-based classifier
+2. **GradCAM Visualization**: Highlights suspicious regions
 3. **LLM Analysis**: Fine-tuned Llama 3.2 Vision provides detailed explanations
 
 The system looks for:
@@ -86,7 +86,7 @@ The system looks for:
 - Blending problems
 """)
 
-# -----
+# ----- GradCAM Implementation -----
 
 class ImageDataset(torch.utils.data.Dataset):
     def __init__(self, image, transform=None, face_only=True, dataset_name=None):
@@ -163,8 +163,6 @@ class ImageDataset(torch.utils.data.Dataset):
 
         return image_tensor, label, "uploaded_image", original_image, None, self.dataset_name
 
-# ----- GradCAM Implementation with ResNet -----
-
 class GradCAM:
     def __init__(self, model, target_layer):
         self.model = model
@@ -175,42 +173,75 @@ class GradCAM:
 
     def _register_hooks(self):
         def forward_hook(module, input, output):
+            if isinstance(output, tuple):
+                self.activations = output[0]
+            else:
+                self.activations = output
 
         def backward_hook(module, grad_in, grad_out):
+            if isinstance(grad_out, tuple):
+                self.gradients = grad_out[0]
+            else:
+                self.gradients = grad_out
 
+        layer = dict([*self.model.named_modules()])[self.target_layer]
+        layer.register_forward_hook(forward_hook)
+        layer.register_backward_hook(backward_hook)
 
-    def generate(self, input_tensor, class_idx
+    def generate(self, input_tensor, class_idx):
         self.model.zero_grad()
 
         try:
-            # Create a one-hot tensor for the desired class
-            one_hot = torch.zeros(output.size(), device=input_tensor.device)
-            one_hot[0, class_idx] = 1
-            # Calculate weights (global average pooling)
-            weights = np.mean(gradients, axis=(1, 2))
+            # Use only the vision part of the model for gradient calculation
+            vision_outputs = self.model.vision_model(pixel_values=input_tensor)
+
+            # Get the pooler output
+            features = vision_outputs.pooler_output
+
+            # Create a dummy gradient for the feature based on the class idx
+            one_hot = torch.zeros_like(features)
+            one_hot[0, class_idx] = 1
+
+            # Manually backpropagate
+            features.backward(gradient=one_hot)
+
+            # Check for None values
+            if self.gradients is None or self.activations is None:
+                st.warning("Warning: Gradients or activations are None. Using fallback CAM.")
+                return np.ones((14, 14), dtype=np.float32) * 0.5
+
+            # Process gradients and activations for transformer-based model
+            gradients = self.gradients.cpu().detach().numpy()
+            activations = self.activations.cpu().detach().numpy()
+
+            if len(activations.shape) == 3:  # [batch, sequence_length, hidden_dim]
+                seq_len = activations.shape[1]
+
+                # CLIP ViT typically has 196 patch tokens (14×14) + 1 class token = 197
+                if seq_len >= 197:
+                    # Skip the class token (first token) and reshape the patch tokens into a square
+                    patch_tokens = activations[0, 1:197, :]  # Remove the class token
+                    # Take the mean across the hidden dimension
+                    token_importance = np.mean(np.abs(patch_tokens), axis=1)
+                    # Reshape to the expected grid size (14×14 for CLIP ViT)
+                    cam = token_importance.reshape(14, 14)
+                else:
+                    # Try to find factors close to a square
+                    side_len = int(np.sqrt(seq_len))
+                    # Use the mean across features as importance
+                    token_importance = np.mean(np.abs(activations[0]), axis=1)
+                    # Create as square-like shape as possible
+                    cam = np.zeros((side_len, side_len))
+                    # Fill the cam with available values
+                    flat_cam = cam.flatten()
+                    flat_cam[:min(len(token_importance), len(flat_cam))] = token_importance[:min(len(token_importance), len(flat_cam))]
+                    cam = flat_cam.reshape(side_len, side_len)
+            else:
+                # Fallback
+                st.info("Using fallback CAM shape (14x14)")
+                cam = np.ones((14, 14), dtype=np.float32) * 0.5  # Default fallback
+
+            # Ensure we have valid values
             cam = np.maximum(cam, 0)
             if np.max(cam) > 0:
                 cam = cam / np.max(cam)
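One caveat on the new CAM construction: openai/clip-vit-large-patch14 uses a patch size of 14, so a 224×224 input yields a 16×16 grid of 256 patch tokens (257 with the class token), not the 196-token (14×14) layout the comment above assumes. activations[0, 1:197, :] therefore keeps only the first 196 patch tokens, and the hard-coded 14×14 reshape no longer lines up with the true grid, so the heat map is an approximation. A shape-agnostic variant (a hypothetical helper, not part of this commit) would infer the grid from the token count:

    import numpy as np

    def tokens_to_cam(activations):
        """Hypothetical helper: collapse ViT activations of shape
        [1, seq_len, hidden] into a square CAM, inferring the patch grid
        from the token count instead of hard-coding 14x14."""
        patch_tokens = activations[0, 1:, :]        # drop the class token
        side = int(np.sqrt(patch_tokens.shape[0]))  # 256 patches -> 16 for ViT-L/14 at 224px
        importance = np.mean(np.abs(patch_tokens[: side * side]), axis=1)
        return importance.reshape(side, side)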
@@ -219,8 +250,7 @@ class GradCAM:
 
         except Exception as e:
             st.error(f"Error in GradCAM.generate: {str(e)}")
-            return np.ones((7, 7), dtype=np.float32) * 0.5
+            return np.ones((14, 14), dtype=np.float32) * 0.5
 
 def overlay_cam_on_image(image, cam, face_box=None, alpha=0.5):
     """Overlay the CAM on the image"""
@@ -289,7 +319,7 @@ def save_comparison(image, cam, overlay, face_box=None):
     else:
         cam_resized = cv2.resize(cam, (image.width, image.height))
         axes[1].imshow(cam_resized, cmap="jet")
-    axes[1].set_title("
+    axes[1].set_title("CAM")
     axes[1].axis("off")
 
     # Overlay
@@ -306,27 +336,31 @@ def save_comparison(image, cam, overlay, face_box=None):
     buf.seek(0)
     return Image.open(buf)
 
-# Function to load
+# Function to load GradCAM CLIP model
 @st.cache_resource
-def
-    with st.spinner("Loading
+def load_clip_model():
+    with st.spinner("Loading CLIP model for GradCAM..."):
+        model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+
+        # Apply a simple classification head
+        model.classification_head = nn.Linear(1024, 2)
+        model.classification_head.weight.data.normal_(mean=0.0, std=0.02)
+        model.classification_head.bias.data.zero_()
+
         model.eval()
         return model
 
-def
-    """Get the target layer for GradCAM
-    return
+def get_target_layer_clip(model):
+    """Get the target layer for GradCAM"""
+    return "vision_model.encoder.layers.23"
 
-def process_image_with_gradcam(image, model, device,
-    """Process an image with GradCAM
+def process_image_with_gradcam(image, model, device, pred_class):
+    """Process an image with GradCAM"""
     # Set up transformations
     transform = transforms.Compose([
         transforms.Resize((224, 224)),
         transforms.ToTensor(),
-        transforms.Normalize(mean=[0.
+        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
     ])
 
     # Create dataset for the single image
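For orientation, here is how the loader, target layer, and extractor above plug together, a minimal sketch assuming load_clip_model, get_target_layer_clip, and the GradCAM class from this file are in scope (main() below does the same wiring with Streamlit state around it):

    import torch

    model = load_clip_model()                     # CLIP ViT-L/14 plus a fresh 2-class head
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    target_layer = get_target_layer_clip(model)   # "vision_model.encoder.layers.23"
    cam_extractor = GradCAM(model, target_layer)  # hooks attach to that encoder layer

Note that load_clip_model initializes the classification head randomly; unless fine-tuned head weights are loaded elsewhere, the Real/Fake scores it produces are untrained.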
@@ -361,11 +395,11 @@ def process_image_with_gradcam(image, model, device, class_idx=0):
 
     try:
         # Create GradCAM extractor
-        target_layer =
+        target_layer = get_target_layer_clip(model)
         cam_extractor = GradCAM(model, target_layer)
 
         # Generate CAM
-        cam = cam_extractor.generate(input_tensor,
+        cam = cam_extractor.generate(input_tensor, pred_class)
 
         # Create visualizations
         overlay = overlay_cam_on_image(original_image, cam, face_box)
@@ -377,22 +411,11 @@ def process_image_with_gradcam(image, model, device, class_idx=0):
     except Exception as e:
         st.error(f"Error processing image with GradCAM: {str(e)}")
         # Return default values
-        default_cam = np.ones((
+        default_cam = np.ones((14, 14), dtype=np.float32) * 0.5
         overlay = overlay_cam_on_image(original_image, default_cam, face_box)
         comparison = save_comparison(original_image, default_cam, overlay, face_box)
         return default_cam, overlay, comparison, face_box
 
-# ----- Face Analysis Functions -----
-
-def analyze_face(face_tensor, device):
-    """Simple face analysis to determine if real or fake (simplified)"""
-    # This is a placeholder function - in a real app, you would use a trained classifier
-    # We'll return random values for now
-    rand_val = np.random.random()
-    is_fake = rand_val > 0.5
-    confidence = rand_val if is_fake else 1 - rand_val
-    return "Fake" if is_fake else "Real", confidence
-
 # ----- Fine-tuned Vision LLM -----
 
 # Function to fix cross-attention masks
@@ -437,9 +460,9 @@ def load_llm_model():
 def analyze_image_with_llm(image, gradcam_overlay, face_box, pred_label, confidence, question, model, tokenizer, temperature=0.7, max_tokens=500, custom_instruction=""):
     # Create a prompt that includes GradCAM information
     if custom_instruction.strip():
-        full_prompt = f"{question}\n\nThe image has been
+        full_prompt = f"{question}\n\nThe image has been processed with GradCAM and classified as {pred_label} with confidence {confidence:.2f}. Focus on the highlighted regions in red/yellow which show the areas the detection model found suspicious.\n\n{custom_instruction}"
     else:
-        full_prompt = f"{question}\n\nThe image has been
+        full_prompt = f"{question}\n\nThe image has been processed with GradCAM and classified as {pred_label} with confidence {confidence:.2f}. Focus on the highlighted regions in red/yellow which show the areas the detection model found suspicious."
 
     # Format the message to include both the original image and the GradCAM visualization
     messages = [
@@ -489,9 +512,9 @@ def analyze_image_with_llm(image, gradcam_overlay, face_box, pred_label, confide
 # Main app
 def main():
     # Create placeholders for model state
-    if '
-        st.session_state.
-        st.session_state.
+    if 'clip_model_loaded' not in st.session_state:
+        st.session_state.clip_model_loaded = False
+        st.session_state.clip_model = None
 
     if 'llm_model_loaded' not in st.session_state:
         st.session_state.llm_model_loaded = False
@@ -500,22 +523,22 @@ def main():
 
     # Create expanders for each stage
     with st.expander("Stage 1: Model Loading", expanded=True):
-        # Button for loading
+        # Button for loading CLIP model
+        clip_col, llm_col = st.columns(2)
 
-        with
-            if not st.session_state.
-                if st.button("📥 Load
-                    # Load
-                    model =
+        with clip_col:
+            if not st.session_state.clip_model_loaded:
+                if st.button("📥 Load CLIP Model for Detection", type="primary"):
+                    # Load CLIP model
+                    model = load_clip_model()
                     if model is not None:
-                        st.session_state.
-                        st.session_state.
-                        st.success("✅
+                        st.session_state.clip_model = model
+                        st.session_state.clip_model_loaded = True
+                        st.success("✅ CLIP model loaded successfully!")
                     else:
-                        st.error("❌ Failed to load
+                        st.error("❌ Failed to load CLIP model.")
             else:
-                st.success("✅
+                st.success("✅ CLIP model loaded and ready!")
 
         with llm_col:
             if not st.session_state.llm_model_loaded:
@@ -533,7 +556,7 @@ def main():
                 st.success("✅ Vision LLM loaded and ready!")
 
     # Image upload section
-    with st.expander("Stage 2: Image Upload & Initial
+    with st.expander("Stage 2: Image Upload & Initial Detection", expanded=True):
         st.subheader("Upload an Image")
         uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
 
@@ -542,14 +565,14 @@ def main():
             image = Image.open(uploaded_file).convert("RGB")
             st.image(image, caption="Uploaded Image", use_column_width=True)
 
-            if st.session_state.
-                with st.spinner("Analyzing image..."):
-                    # Preprocess image
+            # Detect with CLIP model if loaded
+            if st.session_state.clip_model_loaded:
+                with st.spinner("Analyzing image with CLIP model..."):
+                    # Preprocess image for CLIP
                     transform = transforms.Compose([
                         transforms.Resize((224, 224)),
                         transforms.ToTensor(),
-                        transforms.Normalize(mean=[0.
+                        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
                     ])
 
                     # Create a simple dataset for the image
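The Normalize() constants here are CLIP's published preprocessing statistics. As a sanity check they can be read from the model's own processor rather than hard-coded, a sketch assuming the transformers CLIPImageProcessor API:

    from transformers import CLIPImageProcessor

    processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
    print(processor.image_mean)  # expected: [0.48145466, 0.4578275, 0.40821073]
    print(processor.image_std)   # expected: [0.26862954, 0.26130258, 0.27577711]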
@@ -560,27 +583,34 @@ def main():
                     # Get device
                     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+                    # Move model and tensor to device
+                    model = st.session_state.clip_model.to(device)
+                    tensor = tensor.to(device)
+
+                    # Forward pass
+                    with torch.no_grad():
+                        outputs = model.vision_model(pixel_values=tensor).pooler_output
+                        logits = model.classification_head(outputs)
+                        probs = torch.softmax(logits, dim=1)[0]
+                        pred_class = torch.argmax(probs).item()
+                        confidence = probs[pred_class].item()
+                        pred_label = "Fake" if pred_class == 1 else "Real"
 
                     # Display results
                     result_col1, result_col2 = st.columns(2)
                     with result_col1:
-                        st.metric("
+                        st.metric("Prediction", pred_label)
                     with result_col2:
                         st.metric("Confidence", f"{confidence:.2%}")
 
                     # GradCAM visualization
-                    st.subheader("
-                    # Use the first class for visualization
-                    class_idx = 0
+                    st.subheader("GradCAM Visualization")
                     cam, overlay, comparison, detected_face_box = process_image_with_gradcam(
-                        image,
+                        image, model, device, pred_class
                     )
 
                     # Display GradCAM results
-                    st.image(comparison, caption="Original |
+                    st.image(comparison, caption="Original | CAM | Overlay", use_column_width=True)
 
                     # Save results in session state for LLM analysis
                     st.session_state.current_image = image
@@ -589,9 +619,9 @@ def main():
                     st.session_state.current_pred_label = pred_label
                     st.session_state.current_confidence = confidence
 
-                    st.success("✅ Initial
+                    st.success("✅ Initial detection and GradCAM visualization complete!")
             else:
-                st.warning("⚠️ Please load the
+                st.warning("⚠️ Please load the CLIP model first to perform initial detection.")
 
     # LLM Analysis section
     with st.expander("Stage 3: Detailed Analysis with Vision LLM", expanded=False):
@@ -599,7 +629,7 @@ def main():
         st.subheader("Detailed Deepfake Analysis")
 
         # Default question with option to customize
-        default_question = f"This image has been
+        default_question = f"This image has been classified as {st.session_state.current_pred_label}. Analyze the key features that led to this classification, focusing on the highlighted areas in the GradCAM visualization. Provide both a technical explanation for experts and a simple explanation for non-technical users."
         question = st.text_area("Question/Prompt:", value=default_question, height=100)
 
         # Analyze button
@@ -642,7 +672,7 @@ def main():
             st.subheader("Analysis Result")
             st.markdown(result)
         elif not hasattr(st.session_state, 'current_image'):
-            st.warning("⚠️ Please upload an image and complete the initial
+            st.warning("⚠️ Please upload an image and complete the initial detection first.")
         else:
             st.warning("⚠️ Please load the Vision LLM to perform detailed analysis.")
 
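Taken together, the commit removes the random-placeholder analyze_face path and the ResNet-oriented GradCAM in favor of a real CLIP-based detector. A hypothetical offline smoke test for the new path (assumes app.py is importable; its module-level Streamlit calls only log warnings outside streamlit run):

    import torch
    from PIL import Image
    from torchvision import transforms

    from app import GradCAM, get_target_layer_clip, load_clip_model

    model = load_clip_model()
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                             std=[0.26862954, 0.26130258, 0.27577711]),
    ])
    tensor = transform(Image.new("RGB", (224, 224), "gray")).unsqueeze(0)  # stand-in image

    with torch.no_grad():
        feats = model.vision_model(pixel_values=tensor).pooler_output
        probs = torch.softmax(model.classification_head(feats), dim=1)[0]
    pred_class = int(torch.argmax(probs))

    cam = GradCAM(model, get_target_layer_clip(model)).generate(tensor, pred_class)
    assert cam.shape == (14, 14) and cam.min() >= 0.0 and cam.max() <= 1.0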