Update app.py
app.py
CHANGED
@@ -436,24 +436,60 @@ def load_blip_model():
         st.error(f"Error loading BLIP model: {str(e)}")
         return None, None
 
-# Function to generate image caption using BLIP
-def
+# Function to generate image caption using BLIP's VQA approach for GradCAM
+def generate_gradcam_caption(image, processor, model, max_length=60):
     """
-    Generate a
+    Generate a detailed analysis of GradCAM visualization using multiple questions
     """
     try:
         # Check for available GPU
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model = model.to(device)
 
-        #
-
-
-
-
-
-
-
+        # Multiple specific questions about the GradCAM visualization
+        questions = [
+            "What facial features are highlighted by the red and yellow areas in this heatmap?",
+            "What does this facial heat map visualization show?",
+            "What patterns do you see in this facial heatmap visualization?"
+        ]
+
+        # Get answers to each question
+        answers = []
+        for question in questions:
+            inputs = processor(image, text=question, return_tensors="pt").to(device)
+            with torch.no_grad():
+                output = model.generate(**inputs, max_length=max_length, num_beams=5)
+            answer = processor.decode(output[0], skip_special_tokens=True)
+            answers.append(answer)
+
+        # Format answers into a structured analysis
+        structured_output = f"""
+**Main Focus Area**: The heatmap is primarily focused on the facial region of the person.
+
+**High Activation Regions**: The red/yellow areas highlight {answers[0]}
+
+**Medium Activation Regions**: The green/cyan areas correspond to regions of medium importance in the detection process, typically including parts of the face and surrounding areas.
+
+**Low Activation Regions**: The blue/dark blue areas represent features that have less impact on the model's decision, usually the background and peripheral elements.
+
+**Activation Pattern**: {answers[2]}
+"""
+        return structured_output.strip()
+
+    except Exception as e:
+        st.error(f"Error analyzing GradCAM: {str(e)}")
+        return "Error analyzing GradCAM visualization"
+
+# Function to generate caption for original image
+def generate_image_caption(image, processor, model, max_length=75, num_beams=5):
+    """Generate a caption for the original image using BLIP model"""
+    try:
+        # Check for available GPU
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = model.to(device)
+
+        # For original image, use unconditional captioning
+        inputs = processor(image, return_tensors="pt").to(device)
 
         # Generate caption
         with torch.no_grad():
@@ -462,24 +498,8 @@ def generate_image_caption(image, processor, model, is_gradcam=False, max_length
         # Decode the output
         caption = processor.decode(output[0], skip_special_tokens=True)
 
-        #
-
-        caption = caption.replace("a heatmap showing", "").strip()
-
-        # Format based on image type
-        if is_gradcam:
-            return format_gradcam_caption(caption)
-        else:
-            return format_image_caption(caption)
-
-    except Exception as e:
-        st.error(f"Error generating caption: {str(e)}")
-        return "Error generating caption"
-
-def format_image_caption(caption):
-    """Format caption into a structured description with headings"""
-
-    structured_caption = f"""
+        # Format into structured description
+        structured_caption = f"""
 **Subject**: The image shows a person in a photograph.
 
 **Appearance**: {caption}
@@ -492,23 +512,11 @@ def format_image_caption(caption)
 
 **Notable Elements**: The facial features and expression are the central focus of the image.
 """
-
-
-
-
-
-    structured_caption = f"""
-**Main Focus Area**: The heatmap is primarily focused on the facial region of the person.
-
-**High Activation Regions**: The red/yellow areas highlight important features that the model is focusing on. {caption}
-
-**Medium Activation Regions**: The green/cyan areas correspond to regions of medium importance in the detection process, typically including parts of the face and surrounding areas.
-
-**Low Activation Regions**: The blue/dark blue areas represent features that have less impact on the model's decision, usually the background and peripheral elements.
-
-**Activation Pattern**: The overall pattern suggests the model is primarily analyzing facial features to make its determination of authenticity.
-"""
-    return structured_caption.strip()
+        return structured_caption.strip()
+
+    except Exception as e:
+        st.error(f"Error generating caption: {str(e)}")
+        return "Error generating caption"
 
 # ----- Fine-tuned Vision LLM -----
 
@@ -520,7 +528,6 @@ def fix_cross_attention_mask(inputs):
         new_mask = torch.ones((batch_size, seq_len, visual_features, num_tiles),
                               device=inputs['cross_attention_mask'].device)
         inputs['cross_attention_mask'] = new_mask
-        st.success("Fixed cross-attention mask dimensions")
     return inputs
 
 # Load model function
@@ -605,7 +612,7 @@ def analyze_image_with_llm(image, gradcam_overlay, face_box, pred_label, confide
 
 # Main app
 def main():
-    #
+    # Initialize session state variables
    if 'clip_model_loaded' not in st.session_state:
        st.session_state.clip_model_loaded = False
        st.session_state.clip_model = None
@@ -620,12 +627,16 @@ def main():
        st.session_state.blip_processor = None
        st.session_state.blip_model = None
 
+    # Initialize chat history
+    if 'chat_history' not in st.session_state:
+        st.session_state.chat_history = []
+
    # Create expanders for each stage
    with st.expander("Stage 1: Model Loading", expanded=True):
        st.write("Please load the models using the buttons below:")
 
        # Button for loading models
-        clip_col,
+        clip_col, blip_col, llm_col = st.columns(3)
 
        with clip_col:
            if not st.session_state.clip_model_loaded:
@@ -641,21 +652,6 @@
            else:
                st.success("✅ CLIP model loaded and ready!")
 
-        with llm_col:
-            if not st.session_state.llm_model_loaded:
-                if st.button("📥 Load Vision LLM for Analysis", type="primary"):
-                    # Load LLM model
-                    model, tokenizer = load_llm_model()
-                    if model is not None and tokenizer is not None:
-                        st.session_state.llm_model = model
-                        st.session_state.tokenizer = tokenizer
-                        st.session_state.llm_model_loaded = True
-                        st.success("✅ Vision LLM loaded successfully!")
-                    else:
-                        st.error("❌ Failed to load Vision LLM.")
-            else:
-                st.success("✅ Vision LLM loaded and ready!")
-
        with blip_col:
            if not st.session_state.blip_model_loaded:
                if st.button("📥 Load BLIP for Captioning", type="primary"):
@@ -670,6 +666,21 @@
                    st.error("❌ Failed to load BLIP model.")
            else:
                st.success("✅ BLIP captioning model loaded and ready!")
+
+        with llm_col:
+            if not st.session_state.llm_model_loaded:
+                if st.button("📥 Load Vision LLM for Analysis", type="primary"):
+                    # Load LLM model
+                    model, tokenizer = load_llm_model()
+                    if model is not None and tokenizer is not None:
+                        st.session_state.llm_model = model
+                        st.session_state.tokenizer = tokenizer
+                        st.session_state.llm_model_loaded = True
+                        st.success("✅ Vision LLM loaded successfully!")
+                    else:
+                        st.error("❌ Failed to load Vision LLM.")
+            else:
+                st.success("✅ Vision LLM loaded and ready!")
 
    # Image upload section
    with st.expander("Stage 2: Image Upload & Initial Detection", expanded=True):
@@ -692,12 +703,11 @@
                    caption = generate_image_caption(
                        image,
                        st.session_state.blip_processor,
-                        st.session_state.blip_model,
-                        is_gradcam=False
+                        st.session_state.blip_model
                    )
                    st.session_state.image_caption = caption
 
-                    # Store caption but don't display it
+                    # Store caption but don't display it yet
 
            # Detect with CLIP model if loaded
            if st.session_state.clip_model_loaded:
@@ -732,11 +742,8 @@
 
                # Display results
                with col2:
-
-
-                        st.metric("Prediction", pred_label)
-                    with result_col2:
-                        st.metric("Confidence", f"{confidence:.2%}")
+                    st.markdown("### Detection Result")
+                    st.markdown(f"**Classification:** {pred_label} (Confidence: {confidence:.2%})")
 
                    # GradCAM visualization
                    st.subheader("GradCAM Visualization")
@@ -750,16 +757,14 @@
                    # Generate caption for GradCAM overlay image if BLIP model is loaded
                    if st.session_state.blip_model_loaded:
                        with st.spinner("Analyzing GradCAM visualization..."):
-                            gradcam_caption = generate_image_caption(
+                            gradcam_caption = generate_gradcam_caption(
                                overlay,
                                st.session_state.blip_processor,
-                                st.session_state.blip_model,
-                                is_gradcam=True,
-                                max_length=150  # Longer for detailed analysis
+                                st.session_state.blip_model
                            )
                            st.session_state.gradcam_caption = gradcam_caption
 
-                            # Store caption but don't display it
+                            # Store caption but don't display it yet
 
                    # Save results in session state for LLM analysis
                    st.session_state.current_image = image
@@ -776,11 +781,49 @@
                    import traceback
                    st.error(traceback.format_exc())  # This will show the full error traceback
 
-    #
+    # Image Analysis Summary section - AFTER Stage 2
+    if hasattr(st.session_state, 'current_image') and (hasattr(st.session_state, 'image_caption') or hasattr(st.session_state, 'gradcam_caption')):
+        with st.expander("Image Analysis Summary", expanded=True):
+            st.subheader("Generated Descriptions and Analysis")
+
+            # Display image, captions, and results in organized layout with proper formatting
+            col1, col2 = st.columns([1, 2])
+
+            with col1:
+                # Display original image and overlay side by side with controlled size
+                st.image(st.session_state.current_image, caption="Original Image", width=300)
+                if hasattr(st.session_state, 'current_overlay'):
+                    st.image(st.session_state.current_overlay, caption="GradCAM Overlay", width=300)
+
+            with col2:
+                # Detection result
+                if hasattr(st.session_state, 'current_pred_label'):
+                    st.markdown("### Detection Result")
+                    st.markdown(f"**Classification:** {st.session_state.current_pred_label} (Confidence: {st.session_state.current_confidence:.2%})")
+                    st.markdown("---")
+
+                # Image description
+                if hasattr(st.session_state, 'image_caption'):
+                    st.markdown("### Image Description")
+                    st.markdown(st.session_state.image_caption)
+                    st.markdown("---")
+
+                # GradCAM analysis
+                if hasattr(st.session_state, 'gradcam_caption'):
+                    st.markdown("### GradCAM Analysis")
+                    st.markdown(st.session_state.gradcam_caption)
+
+    # LLM Analysis section - AFTER Image Analysis Summary
    with st.expander("Stage 3: Detailed Analysis with Vision LLM", expanded=False):
        if hasattr(st.session_state, 'current_image') and st.session_state.llm_model_loaded:
            st.subheader("Detailed Deepfake Analysis")
 
+            # Display chat history
+            for i, (question, answer) in enumerate(st.session_state.chat_history):
+                st.markdown(f"**Question {i+1}:** {question}")
+                st.markdown(f"**Answer:** {answer}")
+                st.markdown("---")
+
            # Include both captions in the prompt if available
            caption_text = ""
            if hasattr(st.session_state, 'image_caption'):
@@ -790,19 +833,37 @@
            caption_text += f"\n\nGradCAM Analysis:\n{st.session_state.gradcam_caption}"
 
            # Default question with option to customize
-            default_question = f"This image has been classified as {st.session_state.current_pred_label}.
-            question = st.text_area("Question/Prompt:", value=default_question, height=100)
+            default_question = f"This image has been classified as {st.session_state.current_pred_label}. Analyze the key features that led to this classification, focusing on the highlighted areas in the GradCAM visualization. Provide both a technical explanation for experts and a simple explanation for non-technical users."
 
-            #
-
+            # User input for new question
+            new_question = st.text_area("Ask a question about the image:", value=default_question if not st.session_state.chat_history else "", height=100)
+
+            # Analyze button and Clear Chat button in the same row
+            col1, col2 = st.columns([3, 1])
+            with col1:
+                analyze_button = st.button("🔍 Send Question", type="primary")
+            with col2:
+                clear_button = st.button("🗑️ Clear Chat History")
+
+            if clear_button:
+                st.session_state.chat_history = []
+                st.experimental_rerun()
+
+            if analyze_button and new_question:
                try:
+                    # Add caption info if it's the first question
+                    if not st.session_state.chat_history:
+                        full_question = new_question + caption_text
+                    else:
+                        full_question = new_question
+
                    result = analyze_image_with_llm(
                        st.session_state.current_image,
                        st.session_state.current_overlay,
                        st.session_state.current_face_box,
                        st.session_state.current_pred_label,
                        st.session_state.current_confidence,
-
+                        full_question,
                        st.session_state.llm_model,
                        st.session_state.tokenizer,
                        temperature=temperature,
@@ -810,7 +871,10 @@
                        custom_instruction=custom_instruction
                    )
 
-                    #
+                    # Add to chat history
+                    st.session_state.chat_history.append((new_question, result))
+
+                    # Display the latest result too
                    st.success("✅ Analysis complete!")
 
                    # Check if the result contains both technical and non-technical explanations
@@ -822,12 +886,12 @@
                            non_technical = "Non-Technical" + parts[1]
 
                            # Display in two columns
-
-                            with
+                            tech_col, simple_col = st.columns(2)
+                            with tech_col:
                                st.subheader("Technical Analysis")
                                st.markdown(technical)
 
-                            with
+                            with simple_col:
                                st.subheader("Simple Explanation")
                                st.markdown(non_technical)
                        except Exception as e:
@@ -838,6 +902,10 @@
                            # Just display the whole result
                            st.subheader("Analysis Result")
                            st.markdown(result)
+
+                    # Rerun to update the chat history display
+                    st.experimental_rerun()
+
                except Exception as e:
                    st.error(f"Error during LLM analysis: {str(e)}")
 
@@ -846,36 +914,6 @@
        else:
            st.warning("⚠️ Please load the Vision LLM to perform detailed analysis.")
 
-    # Summary section with caption
-    if hasattr(st.session_state, 'current_image') and (hasattr(st.session_state, 'image_caption') or hasattr(st.session_state, 'gradcam_caption')):
-        with st.expander("Image Analysis Summary", expanded=True):
-            st.subheader("Generated Descriptions and Analysis")
-
-            # Display image, captions, and results in organized layout
-            col1, col2 = st.columns([1, 2])
-
-            with col1:
-                # Display original image and overlay side by side with controlled size
-                st.image(st.session_state.current_image, caption="Original Image", width=300)
-                if hasattr(st.session_state, 'current_overlay'):
-                    st.image(st.session_state.current_overlay, caption="GradCAM Overlay", width=300)
-
-            with col2:
-                # Detection result
-                if hasattr(st.session_state, 'current_pred_label'):
-                    st.markdown(f"### Detection Result:")
-                    st.markdown(f"Classification: **{st.session_state.current_pred_label}** (Confidence: {st.session_state.current_confidence:.2%})")
-
-                # Image description
-                if hasattr(st.session_state, 'image_caption'):
-                    st.markdown("### Image Description:")
-                    st.markdown(st.session_state.image_caption)
-
-                # GradCAM analysis
-                if hasattr(st.session_state, 'gradcam_caption'):
-                    st.markdown("### GradCAM Analysis:")
-                    st.markdown(st.session_state.gradcam_caption)
-
    # Footer
    st.markdown("---")
    st.caption("Advanced Deepfake Image Analyzer with Structured BLIP Captioning")
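For reference, a minimal standalone sketch of how the two new captioning helpers are driven. The BLIP checkpoint wrapped by load_blip_model() is not shown in this diff, so the Salesforce/blip-image-captioning-base name and the input file names below are assumptions; with a captioning checkpoint the question string acts as a conditioning prefix rather than a true VQA prompt (a BlipForQuestionAnswering checkpoint would answer it more directly).

# Sketch only; assumes a standard Hugging Face BLIP captioning checkpoint,
# not necessarily the one loaded by load_blip_model() in this app.
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

image = Image.open("face.jpg").convert("RGB")              # hypothetical input image
overlay = Image.open("gradcam_overlay.jpg").convert("RGB")  # hypothetical GradCAM overlay

# Unconditional caption for the original image (mirrors generate_image_caption)
inputs = processor(image, return_tensors="pt")
caption_ids = model.generate(**inputs, max_length=75, num_beams=5)
print(processor.decode(caption_ids[0], skip_special_tokens=True))

# Question-conditioned pass over the GradCAM overlay (mirrors generate_gradcam_caption)
question = "What facial features are highlighted by the red and yellow areas in this heatmap?"
inputs = processor(overlay, text=question, return_tensors="pt")
answer_ids = model.generate(**inputs, max_length=60, num_beams=5)
print(processor.decode(answer_ids[0], skip_special_tokens=True))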
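The Stage 3 changes keep a running question/answer history in st.session_state and replay it on every Streamlit rerun. A stripped-down sketch of that pattern, with a placeholder answer_fn standing in for the app's analyze_image_with_llm(...) call:

# Minimal sketch of the chat-history pattern introduced in Stage 3.
# answer_fn is a hypothetical stand-in for analyze_image_with_llm(...).
import streamlit as st

def answer_fn(question: str) -> str:
    return f"(model answer to: {question})"

if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# Replay earlier turns so they survive Streamlit's top-to-bottom reruns
for i, (q, a) in enumerate(st.session_state.chat_history):
    st.markdown(f"**Question {i+1}:** {q}")
    st.markdown(f"**Answer:** {a}")
    st.markdown("---")

question = st.text_area("Ask a question about the image:")
if st.button("Send Question") and question:
    st.session_state.chat_history.append((question, answer_fn(question)))
    st.rerun()  # older Streamlit releases use st.experimental_rerun(), as in the diff above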