Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Files Community

SaniaE committed 15 days ago

Commit

7b80a15

verified ·

1 Parent(s): 5f1d4a9

added cross-attention saliency

Browse files

Files changed (1) hide show

app.py +18 -14

app.py CHANGED Viewed

@@ -212,29 +212,33 @@ async def get_saliency_heatmap(file: UploadFile = File(...), query_text: str = Q
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
-    # We must explicitly call the vision_model to get the attentions cleanly
     inputs = blip["processor"](images=orig_img, text=query_text, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
-        # Get vision outputs specifically to access the self-attention maps
-        vision_outputs = blip["model"].vision_model(
-            pixel_values=inputs.pixel_values,
-            output_attentions=True
         )
-        # Access attentions from the vision model output
-        # Shape: (layers, batch, heads, patches, patches)
-        attentions = vision_outputs.attentions[-1]
-        # Grid size (usually 16x16 for BLIP)
-        grid_size = int(np.sqrt(attentions.shape[-1] - 1))
-        # Take attention from the [CLS] token (index 0) to all other patches
-        mask = attentions[0, :, 0, 1:].mean(0).view(grid_size, grid_size).cpu().numpy()
-    # Normalize, upscale, and blur for that "Pinterest-chic" glow
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
     mask_pill = Image.fromarray((mask * 255).astype('uint8')).resize(orig_img.size, resample=Image.BICUBIC)
-    mask_pill = mask_pill.filter(ImageFilter.GaussianBlur(radius=12))
     heatmap_rgba = plt.get_cmap('jet')(np.array(mask_pill)/255.0)
     heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")

     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
     inputs = blip["processor"](images=orig_img, text=query_text, return_tensors="pt").to(DEVICE)
+    # We use the text_decoder because that's where the image and text actually 'meet'
     with torch.no_grad():
+        outputs = blip["model"].text_decoder(
+            input_ids=inputs.input_ids,
+            attention_mask=inputs.attention_mask,
+            encoder_hidden_states=blip["model"].vision_model(inputs.pixel_values).last_hidden_state,
+            output_attentions=True # This is key
         )
+        # Get Cross-Attentions (the link between text and image)
+        # Shape after selecting the last layer: (batch, heads, text_tokens, image_patches)
+        cross_attentions = outputs.cross_attentions[-1]
+        # Average across heads and text tokens to get a single 1D map of image importance
+        # We exclude the first and last text tokens ([CLS], [SEP])
+        mask_1d = cross_attentions[0, :, 1:-1, :].mean(dim=(0, 1))
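+        # Reshape to the grid (usually 16x16 for BLIP-large); NOTE(review): assumes the patch count is a perfect square - confirm the vision encoder's [CLS] position is not included here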
+        grid_size = int(np.sqrt(mask_1d.shape[-1]))
+        mask = mask_1d.view(grid_size, grid_size).cpu().numpy()
+    # Normalize and create the "Glow"
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
     mask_pill = Image.fromarray((mask * 255).astype('uint8')).resize(orig_img.size, resample=Image.BICUBIC)
+    mask_pill = mask_pill.filter(ImageFilter.GaussianBlur(radius=12)) # The XAI Glow
     heatmap_rgba = plt.get_cmap('jet')(np.array(mask_pill)/255.0)
     heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")