Update app.py
app.py
CHANGED
@@ -3,7 +3,7 @@ import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
 from torchvision import transforms
-from transformers import CLIPModel
+from transformers import CLIPModel, BlipProcessor, BlipForConditionalGeneration
 from transformers.models.clip import CLIPModel
 from PIL import Image
 import numpy as np
@@ -76,7 +76,8 @@ st.sidebar.markdown("""
 This analyzer performs multi-stage detection:
 1. **Initial Detection**: CLIP-based classifier
 2. **GradCAM Visualization**: Highlights suspicious regions
-3. **
+3. **Image Captioning**: BLIP model describes the image content
+4. **LLM Analysis**: Fine-tuned Llama 3.2 Vision provides detailed explanations
 
 The system looks for:
 - Facial inconsistencies
@@ -417,6 +418,55 @@ def process_image_with_gradcam(image, model, device, pred_class):
     comparison = save_comparison(original_image, default_cam, overlay, face_box)
     return default_cam, overlay, comparison, face_box
 
+# ----- BLIP Image Captioning -----
+
+# Function to load BLIP captioning model
+@st.cache_resource
+def load_blip_model():
+    with st.spinner("Loading BLIP captioning model..."):
+        try:
+            processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+            return processor, model
+        except Exception as e:
+            st.error(f"Error loading BLIP model: {str(e)}")
+            return None, None
+
+# Function to generate image caption
+def generate_image_caption(image, processor, model, max_length=50, num_beams=5):
+    """
+    Generate a caption for the input image using BLIP model
+
+    Args:
+        image (PIL.Image): Input image
+        processor: BLIP processor
+        model: BLIP model
+        max_length (int): Maximum length of the caption
+        num_beams (int): Number of beams for beam search
+
+    Returns:
+        str: Generated caption
+    """
+    try:
+        # Preprocess the image
+        inputs = processor(image, return_tensors="pt")
+
+        # Check for available GPU
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = model.to(device)
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        # Generate caption
+        with torch.no_grad():
+            output = model.generate(**inputs, max_length=max_length, num_beams=num_beams)
+
+        # Decode the caption
+        caption = processor.decode(output[0], skip_special_tokens=True)
+        return caption
+    except Exception as e:
+        st.error(f"Error generating caption: {str(e)}")
+        return "Error generating caption"
+
 # ----- Fine-tuned Vision LLM -----
 
 # Function to fix cross-attention masks
@@ -522,10 +572,15 @@ def main():
         st.session_state.llm_model = None
         st.session_state.tokenizer = None
 
+    if 'blip_model_loaded' not in st.session_state:
+        st.session_state.blip_model_loaded = False
+        st.session_state.blip_processor = None
+        st.session_state.blip_model = None
+
     # Create expanders for each stage
     with st.expander("Stage 1: Model Loading", expanded=True):
-        # Button for loading
-        clip_col, llm_col = st.columns(
+        # Button for loading models
+        clip_col, llm_col, blip_col = st.columns(3)
 
         with clip_col:
             if not st.session_state.clip_model_loaded:
@@ -555,6 +610,21 @@ def main():
                        st.error("❌ Failed to load Vision LLM.")
            else:
                st.success("✅ Vision LLM loaded and ready!")
+
+        with blip_col:
+            if not st.session_state.blip_model_loaded:
+                if st.button("📥 Load BLIP for Captioning", type="primary"):
+                    # Load BLIP model
+                    processor, model = load_blip_model()
+                    if model is not None and processor is not None:
+                        st.session_state.blip_processor = processor
+                        st.session_state.blip_model = model
+                        st.session_state.blip_model_loaded = True
+                        st.success("✅ BLIP captioning model loaded successfully!")
+                    else:
+                        st.error("❌ Failed to load BLIP model.")
+            else:
+                st.success("✅ BLIP captioning model loaded and ready!")
 
     # Image upload section
     with st.expander("Stage 2: Image Upload & Initial Detection", expanded=True):
@@ -566,6 +636,17 @@ def main():
             image = Image.open(uploaded_file).convert("RGB")
             st.image(image, caption="Uploaded Image", use_column_width=True)
 
+            # Generate image caption if BLIP model is loaded
+            if st.session_state.blip_model_loaded:
+                with st.spinner("Generating image caption..."):
+                    caption = generate_image_caption(
+                        image,
+                        st.session_state.blip_processor,
+                        st.session_state.blip_model
+                    )
+                    st.session_state.image_caption = caption
+                    st.success(f"📝 Image Caption: **{caption}**")
+
             # Detect with CLIP model if loaded
             if st.session_state.clip_model_loaded:
                 with st.spinner("Analyzing image with CLIP model..."):
@@ -629,8 +710,13 @@ def main():
         if hasattr(st.session_state, 'current_image') and st.session_state.llm_model_loaded:
             st.subheader("Detailed Deepfake Analysis")
 
+            # Include caption in the prompt if available
+            caption_text = ""
+            if hasattr(st.session_state, 'image_caption'):
+                caption_text = f"\n\nImage caption: {st.session_state.image_caption}"
+
             # Default question with option to customize
-            default_question = f"This image has been classified as {st.session_state.current_pred_label}. Analyze the key features that led to this classification, focusing on the highlighted areas in the GradCAM visualization. Provide both a technical explanation for experts and a simple explanation for non-technical users."
+            default_question = f"This image has been classified as {st.session_state.current_pred_label}.{caption_text} Analyze the key features that led to this classification, focusing on the highlighted areas in the GradCAM visualization. Provide both a technical explanation for experts and a simple explanation for non-technical users."
             question = st.text_area("Question/Prompt:", value=default_question, height=100)
 
             # Analyze button
@@ -676,10 +762,28 @@ def main():
                st.warning("⚠️ Please upload an image and complete the initial detection first.")
        else:
            st.warning("⚠️ Please load the Vision LLM to perform detailed analysis.")
+
+    # Summary section with caption
+    if hasattr(st.session_state, 'current_image') and hasattr(st.session_state, 'image_caption'):
+        with st.expander("Image Caption Summary", expanded=True):
+            st.subheader("Generated Image Description")
+
+            # Display image and caption
+            col1, col2 = st.columns([1, 2])
+            with col1:
+                st.image(st.session_state.current_image, use_column_width=True)
+            with col2:
+                st.markdown("### BLIP Caption:")
+                st.markdown(f"**{st.session_state.image_caption}**")
+
+            # Display detection result if available
+            if hasattr(st.session_state, 'current_pred_label'):
+                st.markdown("### Detection Result:")
+                st.markdown(f"Classification: **{st.session_state.current_pred_label}** (Confidence: {st.session_state.current_confidence:.2%})")
 
     # Footer
     st.markdown("---")
-    st.caption("Advanced Deepfake Image Analyzer")
+    st.caption("Advanced Deepfake Image Analyzer with BLIP Captioning")
 
 if __name__ == "__main__":
     main()
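For reference, the captioning path introduced in this diff can be exercised outside Streamlit. The snippet below is a minimal standalone sketch, not part of the commit: it mirrors the checkpoint and generation settings used in app.py, and the local file name "example.jpg" is a hypothetical placeholder.

# Standalone BLIP captioning sketch (illustrative; mirrors the defaults in app.py)
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# "example.jpg" is a hypothetical local test image
image = Image.open("example.jpg").convert("RGB")
inputs = {k: v.to(device) for k, v in processor(image, return_tensors="pt").items()}

with torch.no_grad():
    output = model.generate(**inputs, max_length=50, num_beams=5)

print(processor.decode(output[0], skip_special_tokens=True))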