Spaces:

saakshigupta
/

deepfake-explainer-app

Paused

App Files Files Community

saakshigupta committed on Apr 6

Commit

cd7498a

verified ·

1 Parent(s): 2bc3c60

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -44

app.py CHANGED Viewed

@@ -424,32 +424,32 @@ def process_image_with_gradcam(image, model, device, pred_class):
 # ----- BLIP Image Captioning -----
-# Define custom prompts for original and GradCAM images - simpler prompts that work better with BLIP
-ORIGINAL_IMAGE_PROMPT = "Detailed description:"
-GRADCAM_IMAGE_PROMPT = "Describe this heatmap visualization:"
-# Function to generate image caption with structured formatting
 def generate_image_caption(image, processor, model, is_gradcam=False, max_length=150, num_beams=5):
     """
     Generate a caption for the input image using BLIP model and format it with structured headings
-    Args:
-        image (PIL.Image): Input image
-        processor: BLIP processor
-        model: BLIP model
-        is_gradcam (bool): Whether the image is a GradCAM visualization
-        max_length (int): Maximum length of the caption
-        num_beams (int): Number of beams for beam search
-    Returns:
-        str: Generated caption with structured formatting
     """
     try:
         # Select the appropriate prompt based on image type
         prompt = GRADCAM_IMAGE_PROMPT if is_gradcam else ORIGINAL_IMAGE_PROMPT
-        # Preprocess the image with the basic prompt
         inputs = processor(image, text=prompt, return_tensors="pt")
         # Check for available GPU
@@ -464,11 +464,7 @@ def generate_image_caption(image, processor, model, is_gradcam=False, max_length
         # Decode the caption
         raw_caption = processor.decode(output[0], skip_special_tokens=True)
-        # Remove the prompt if it appears in the caption
-        if prompt in raw_caption:
-            raw_caption = raw_caption.replace(prompt, "").strip()
-        # Format the caption with proper structure based on type
         if is_gradcam:
             formatted_caption = format_gradcam_caption(raw_caption)
         else:
@@ -481,17 +477,21 @@ def generate_image_caption(image, processor, model, is_gradcam=False, max_length
 def format_image_caption(raw_caption):
     """Format a raw caption into a structured description with headings"""
-    # Basic structure for image caption
     structured_caption = f"""
-**Subject**: The image shows a person, likely in a portrait or headshot format.
-**Appearance**: {raw_caption}
-**Background**: The background appears to be a studio or controlled environment setting.
-**Lighting**: The lighting appears to be professional with even illumination on the subject's face.
-**Colors**: The image contains a range of tones typical in portrait photography.
 **Notable Elements**: The facial features and expression are the central focus of the image.
 """
@@ -499,32 +499,21 @@ def format_image_caption(raw_caption):
 def format_gradcam_caption(raw_caption):
     """Wrap a raw GradCAM caption in a fixed five-heading markdown template.

     The headings are Main Focus Area, High/Medium/Low Activation Regions and
     Activation Pattern. ``raw_caption`` is interpolated into the
     "High Activation Regions" entry; every other entry is fixed text.

     Args:
         raw_caption: Text to embed; it is inserted verbatim via the f-string.

     Returns:
         str: The filled-in template with leading/trailing whitespace removed.
     """
     # Only the High Activation Regions line depends on raw_caption; the
     # remaining lines are static wording, not derived from the input.
     structured_caption = f"""
-**Main Focus Area**: The heatmap is primarily focused on the facial region.
-**High Activation Regions**: The red/yellow areas highlight {raw_caption}
-**Medium Activation Regions**: The green/cyan areas correspond to medium importance features in the image.
-**Low Activation Regions**: The blue/dark blue areas represent features that have less impact on the model's decision.
-**Activation Pattern**: The overall pattern suggests the model is focusing on key facial features to make its determination.
 """
     # strip() drops the newlines that open and close the triple-quoted template.
     return structured_caption.strip()
-# Function to load BLIP captioning model
-@st.cache_resource
-def load_blip_model():
-    with st.spinner("Loading BLIP captioning model..."):
-        try:
-            processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
-            return processor, model
-        except Exception as e:
-            st.error(f"Error loading BLIP model: {str(e)}")
-            return None, None
 # ----- Fine-tuned Vision LLM -----
 # Function to fix cross-attention masks

 # ----- BLIP Image Captioning -----
+# Define simple prompts for BLIP
+ORIGINAL_IMAGE_PROMPT = ""  # Empty prompt for original images - BLIP works better with no prompt
+# NOTE(review): this commit also removes the `if prompt in raw_caption` stripping
+# step from generate_image_caption. If BLIP echoes its conditioning text in the
+# decoded output, this prompt will now appear inside the GradCAM caption - confirm.
+GRADCAM_IMAGE_PROMPT = "Describe what you see in this heatmap visualization"
+# Function to load BLIP captioning model
+# NOTE(review): st.cache_resource presumably keeps a single loaded copy across
+# Streamlit reruns - confirm against the Streamlit version in use.
+@st.cache_resource
+def load_blip_model():
+    """Load the BLIP captioning processor and model.
+
+    Both objects are created with ``from_pretrained`` from the
+    "Salesforce/blip-image-captioning-large" checkpoint while a Streamlit
+    spinner is displayed.
+
+    Returns:
+        tuple: ``(processor, model)`` on success, or ``(None, None)`` when
+        loading raised an exception (the message is shown via ``st.error``).
+    """
+    with st.spinner("Loading BLIP captioning model..."):
+        try:
+            processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+            model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+            return processor, model
+        except Exception as e:
+            # Any failure is reported in the UI and signalled to the caller
+            # as a (None, None) pair instead of propagating the exception.
+            st.error(f"Error loading BLIP model: {str(e)}")
+            return None, None
+# Function to generate image caption with manual structured formatting
 def generate_image_caption(image, processor, model, is_gradcam=False, max_length=150, num_beams=5):
     """
     Generate a caption for the input image using BLIP model and format it with structured headings
     """
     try:
         # Select the appropriate prompt based on image type
         prompt = GRADCAM_IMAGE_PROMPT if is_gradcam else ORIGINAL_IMAGE_PROMPT
+        # Preprocess the image
         inputs = processor(image, text=prompt, return_tensors="pt")
         # Check for available GPU
         # Decode the caption
         raw_caption = processor.decode(output[0], skip_special_tokens=True)
+        # Format the caption into a structured format based on type
         if is_gradcam:
             formatted_caption = format_gradcam_caption(raw_caption)
         else:
 def format_image_caption(raw_caption):
     """Format a raw caption into a structured description with headings"""
+    # Try to extract some basic information from the raw caption
+    appearance_info = raw_caption  # Use the full caption by default
+    # Basic structure for image caption with extracted information
     structured_caption = f"""
+**Subject**: The image shows a person in a portrait-style photograph.
+**Appearance**: {appearance_info}
+**Background**: The background appears to be a controlled environment.
+**Lighting**: The lighting appears to be professional with even illumination.
+**Colors**: The image contains natural skin tones and colors typical of portrait photography.
 **Notable Elements**: The facial features and expression are the central focus of the image.
 """
 def format_gradcam_caption(raw_caption):
     """Wrap a raw GradCAM caption in a fixed five-heading markdown template.

     The headings are Main Focus Area, High/Medium/Low Activation Regions and
     Activation Pattern. ``raw_caption`` is interpolated into the
     "High Activation Regions" entry; every other entry is fixed text.

     Args:
         raw_caption: Text to embed; it is inserted verbatim via the f-string.

     Returns:
         str: The filled-in template with leading/trailing whitespace removed.
     """
     # Only the High Activation Regions line depends on raw_caption; the
     # remaining lines are static wording, not derived from the input.
     structured_caption = f"""
+**Main Focus Area**: The heatmap is primarily focused on the facial region of the person.
+**High Activation Regions**: The red/yellow areas highlight important features that the model is focusing on. {raw_caption}
+**Medium Activation Regions**: The green/cyan areas correspond to regions of medium importance in the detection process, typically including parts of the face and surrounding areas.
+**Low Activation Regions**: The blue/dark blue areas represent features that have less impact on the model's decision, usually the background and peripheral elements.
+**Activation Pattern**: The overall pattern suggests the model is primarily analyzing facial features to make its determination of authenticity.
 """
     # strip() drops the newlines that open and close the triple-quoted template.
     return structured_caption.strip()
 # ----- Fine-tuned Vision LLM -----
 # Function to fix cross-attention masks