Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Running

App Files Files Community

SaniaE committed 21 days ago

Commit

c441112

verified ·

1 Parent(s): 11cdb23

updated saliency map logic

Browse files

Files changed (1) hide show

app.py +26 -17

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import torch
 import random
 import asyncio
-from PIL import Image
 from fastapi import FastAPI, UploadFile, File, Query
 from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import snapshot_download, login
@@ -171,16 +171,18 @@ async def concept_ensemble(file: UploadFile = File(...), user_prompt: str = Quer
     inputs_text = blip["processor"](text=texts, return_tensors="pt", padding=True).to(DEVICE)
     with torch.no_grad():
-        # Get pooled text and vision features
-        text_outputs = blip["model"].text_encoder(**inputs_text)
-        text_embeds = text_outputs.last_hidden_state[:, 0, :] # Use [CLS] token
         vision_outputs = blip["model"].vision_model(inputs_gen["pixel_values"])
-        image_embeds = vision_outputs.last_hidden_state[:, 0, :]
-        # Normalize vectors for cosine similarity
-        text_embeds = F.normalize(text_embeds, p=2, dim=-1)
         image_embeds = F.normalize(image_embeds, p=2, dim=-1)
         # Similarity Matrix calculation
         sim_image_user = torch.matmul(image_embeds, text_embeds[0].T).item()
@@ -207,32 +209,39 @@ async def get_saliency_heatmap(file: UploadFile = File(...), query_text: str = Q
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
     inputs = blip["processor"](images=orig_img, text=query_text, return_tensors="pt").to(DEVICE)
     # 2. Extract Gradients for Saliency
-    inputs.pixel_values.requires_grad = True
     outputs = blip["model"](**inputs, labels=inputs["input_ids"])
     loss = outputs.loss
     loss.backward()
-    # Get max gradient across channels
     grad = inputs.pixel_values.grad.abs().max(dim=1)[0][0].cpu().numpy()
-    # 3. Create Heatmap with Matplotlib
     # Normalize to [0, 1]
     grad = (grad - grad.min()) / (grad.max() - grad.min() + 1e-8)
-    # Apply color map (jet) and convert to RGBA
     cm = plt.get_cmap('jet')
-    heatmap_rgba = cm(grad) # This creates an NxMx4 array
-    # Convert heatmap to PIL Image and resize to match original
     heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")
     heatmap_img = heatmap_img.resize(orig_img.size, resample=Image.BILINEAR)
-    # 4. Blend Original + Heatmap
-    # 0.6 alpha for original, 0.4 for heatmap
-    blended_img = Image.blend(orig_img, heatmap_img, alpha=0.4)
     # 5. Stream back
     buf = io.BytesIO()

 import torch
 import random
 import asyncio
+from PIL import Image, ImageFilter
 from fastapi import FastAPI, UploadFile, File, Query
 from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import snapshot_download, login
     inputs_text = blip["processor"](text=texts, return_tensors="pt", padding=True).to(DEVICE)
     with torch.no_grad():
+        # 1. Get Image Embeddings from the vision_model
         vision_outputs = blip["model"].vision_model(inputs_gen["pixel_values"])
+        image_embeds = vision_outputs.last_hidden_state[:, 0, :] # Use [CLS] token
+        # 2. Get Text Embeddings using the text_decoder's bert model
+        # BLIP's text_decoder typically wraps a BERT-like architecture
+        text_outputs = blip["model"].text_decoder.bert(**inputs_text)
+        text_embeds = text_outputs.last_hidden_state[:, 0, :] # Use [CLS] token
+        # Normalize
         image_embeds = F.normalize(image_embeds, p=2, dim=-1)
+        text_embeds = F.normalize(text_embeds, p=2, dim=-1)
         # Similarity Matrix calculation
         sim_image_user = torch.matmul(image_embeds, text_embeds[0].T).item()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
+    # Ensure pixel_values can track gradients
     inputs = blip["processor"](images=orig_img, text=query_text, return_tensors="pt").to(DEVICE)
+    inputs.pixel_values.requires_grad = True
     # 2. Extract Gradients for Saliency
     outputs = blip["model"](**inputs, labels=inputs["input_ids"])
     loss = outputs.loss
     loss.backward()
+    # Generate Saliency from gradients of pixel values
+    # We take the maximum absolute gradient across the RGB channels
     grad = inputs.pixel_values.grad.abs().max(dim=1)[0][0].cpu().numpy()
+    # 3. Create Heatmap with "Glow" Effect (XAI Style)
     # Normalize to [0, 1]
     grad = (grad - grad.min()) / (grad.max() - grad.min() + 1e-8)
+    # Apply Gaussian Blur to smooth tiny speckles into a professional heatmap
+    grad_pill = Image.fromarray((grad * 255).astype('uint8'))
+    grad_pill = grad_pill.filter(ImageFilter.GaussianBlur(radius=8))
+    grad_smoothed = np.array(grad_pill) / 255.0
+    # Apply colormap (jet)
     cm = plt.get_cmap('jet')
+    heatmap_rgba = cm(grad_smoothed)
+    # Convert heatmap to PIL and resize to original image dimensions
     heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")
     heatmap_img = heatmap_img.resize(orig_img.size, resample=Image.BILINEAR)
+    # 4. Blend Original + Heatmap (Adjust alpha for visibility on dark/light UIs)
+    # 0.5 alpha provides a strong clear highlight for the "Rorompok" sofa
+    blended_img = Image.blend(orig_img, heatmap_img, alpha=0.5)
     # 5. Stream back
     buf = io.BytesIO()