Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Community

SaniaE committed 15 days ago

Commit

282248d

verified ·

1 Parent(s): 40e2d6d

updated endpoints

Browse files

Files changed (1) hide show

app.py +43 -73

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ import numpy as np
 import cv2
 import io
 from fastapi.responses import StreamingResponse
 app = FastAPI()
@@ -155,65 +156,34 @@ async def ui_tester(file: UploadFile = File(...), description: str = Query(...))
         "is_valid": confidence_score > 55
     }
-@app.post("/saliency-explorer")
-async def saliency_explorer(file: UploadFile = File(...), query_text: str = Query(...)):
-    image = Image.open(file.file).convert("RGB")
-    blip = MODELS["blip"]
-    # Process inputs
-    inputs = blip["processor"](images=image, text=query_text, return_tensors="pt").to(DEVICE)
-    inputs.requires_grad = True # Enable gradients for saliency mapping
-    # Forward pass through the vision-language projector
-    outputs = blip["model"](**inputs, labels=inputs["input_ids"])
-    loss = outputs.loss
-    loss.backward()
-    # Extract gradients from the vision encoder's last layer
-    # Note: Using the last hidden state as a proxy for spatial importance
-    gradients = blip["model"].vision_model.embeddings.patch_embedding.weight.grad
-    pooled_gradients = torch.mean(gradients, dim=[0, 2, 3])
-    # Generate heatmap
-    # In a real implementation, you would use Grad-CAM on the attention layers
-    # Here we simplify the spatial mapping for the demo response
-    heatmap = torch.mean(torch.abs(gradients), dim=1).squeeze().cpu().numpy()
-    heatmap = cv2.resize(heatmap, (image.size[0], image.size[1]))
-    heatmap = (heatmap - heatmap.min()) / (heatmap.max() - heatmap.min())
-    return {
-        "query": query_text,
-        "heatmap_data": heatmap.tolist(), # Send to frontend to overlay with CSS/Canvas
-        "explanation": f"Highlighted regions show where the model focused to validate '{query_text}'"
-    }
 @app.post("/concept-ensemble")
 async def concept_ensemble(file: UploadFile = File(...), user_prompt: str = Query(...)):
     image = Image.open(file.file).convert("RGB")
     blip = MODELS["blip"]
-    # 1. Get Model's Perceived Caption (Baseline)
     inputs_gen = blip["processor"](images=image, return_tensors="pt").to(DEVICE)
-    generated_ids = blip["model"].generate(**inputs_gen, max_length=40)
-    model_caption = blip["processor"].decode(generated_ids[0], skip_special_tokens=True)
-    # 2. Generate Embeddings for the Matrix
-    # We compare User Prompt, Model Caption, and a 'Ground Truth' Visual Vector
     texts = [user_prompt, model_caption]
     inputs_text = blip["processor"](text=texts, return_tensors="pt", padding=True).to(DEVICE)
     with torch.no_grad():
-        # Get text features
-        text_embeds = blip["model"].text_encoder(**inputs_text).last_hidden_state[:, 0, :]
-        # Get image features
-        image_embeds = blip["model"].vision_model(inputs_gen["pixel_values"]).last_hidden_state[:, 0, :]
-        # Normalize for Cosine Similarity
         text_embeds = F.normalize(text_embeds, p=2, dim=-1)
         image_embeds = F.normalize(image_embeds, p=2, dim=-1)
-        # Calculate Matrix: [Image vs User, Image vs Model, User vs Model]
         sim_image_user = torch.matmul(image_embeds, text_embeds[0].T).item()
         sim_image_model = torch.matmul(image_embeds, text_embeds[1].T).item()
         sim_user_model = torch.matmul(text_embeds[0], text_embeds[1].T).item()
@@ -221,53 +191,53 @@ async def concept_ensemble(file: UploadFile = File(...), user_prompt: str = Quer
     return {
         "captions": {
             "user": user_prompt,
-            "model": model_caption
         },
-        "similarity_matrix": {
-            "visual_alignment_user": round(sim_image_user, 4),
-            "visual_alignment_model": round(sim_image_model, 4),
-            "semantic_overlap": round(sim_user_model, 4)
         },
-        "ensemble_verdict": "Consensus" if sim_user_model > 0.8 else "Perspective Divergence"
     }
 @app.post("/saliency-explorer/image")
 async def get_saliency_heatmap(file: UploadFile = File(...), query_text: str = Query(...)):
-    # 1. Load and process image
-    contents = await file.read()
-    nparr = np.frombuffer(contents, np.uint8)
-    orig_img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
-    image_rgb = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
-    pil_img = Image.fromarray(image_rgb)
     blip = MODELS["blip"]
-    inputs = blip["processor"](images=pil_img, text=query_text, return_tensors="pt").to(DEVICE)
-    # 2. Extract Attention/Gradients
-    # We target the cross-attention layer to see where the text 'queries' the image
     inputs.pixel_values.requires_grad = True
     outputs = blip["model"](**inputs, labels=inputs["input_ids"])
     loss = outputs.loss
     loss.backward()
-    # Generate Saliency from gradients
     grad = inputs.pixel_values.grad.abs().max(dim=1)[0][0].cpu().numpy()
-    # 3. Create Heatmap Overlay
-    # Normalize gradients to 0-255
     grad = (grad - grad.min()) / (grad.max() - grad.min() + 1e-8)
-    grad = (grad * 255).astype(np.uint8)
-    # Resize to original image size
-    heatmap = cv2.resize(grad, (orig_img.shape[1], orig_img.shape[0]))
-    # Apply Color Map (JET or VIRIDIS look very 'Pinterest-chic' / Pro)
-    heatmap_color = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
-    # Superimpose heatmap onto original image (0.6 original, 0.4 heatmap)
-    result_img = cv2.addWeighted(orig_img, 0.6, heatmap_color, 0.4, 0)
-    # 4. Stream the image back
-    res, im_png = cv2.imencode(".png", result_img)
-    return StreamingResponse(io.BytesIO(im_png.tobytes()), media_type="image/png")

 import cv2
 import io
 from fastapi.responses import StreamingResponse
+import matplotlib.pyplot as plt
 app = FastAPI()
         "is_valid": confidence_score > 55
     }
 @app.post("/concept-ensemble")
 async def concept_ensemble(file: UploadFile = File(...), user_prompt: str = Query(...)):
     image = Image.open(file.file).convert("RGB")
     blip = MODELS["blip"]
+    # 1. Model Baseline (Generating its own perception)
     inputs_gen = blip["processor"](images=image, return_tensors="pt").to(DEVICE)
+    with torch.no_grad():
+        generated_ids = blip["model"].generate(**inputs_gen, max_length=40)
+        model_caption = blip["processor"].decode(generated_ids[0], skip_special_tokens=True)
+    # 2. Embedding Calculation
     texts = [user_prompt, model_caption]
     inputs_text = blip["processor"](text=texts, return_tensors="pt", padding=True).to(DEVICE)
     with torch.no_grad():
+        # Get pooled text and vision features
+        text_outputs = blip["model"].text_encoder(**inputs_text)
+        text_embeds = text_outputs.last_hidden_state[:, 0, :] # Use [CLS] token
+        vision_outputs = blip["model"].vision_model(inputs_gen["pixel_values"])
+        image_embeds = vision_outputs.last_hidden_state[:, 0, :]
+        # Normalize vectors for cosine similarity
         text_embeds = F.normalize(text_embeds, p=2, dim=-1)
         image_embeds = F.normalize(image_embeds, p=2, dim=-1)
+        # Similarity Matrix calculation
         sim_image_user = torch.matmul(image_embeds, text_embeds[0].T).item()
         sim_image_model = torch.matmul(image_embeds, text_embeds[1].T).item()
         sim_user_model = torch.matmul(text_embeds[0], text_embeds[1].T).item()
     return {
         "captions": {
             "user": user_prompt,
+            "model_best_guess": model_caption
         },
+        "similarity_scores": {
+            "visual_alignment_user": round(float(sim_image_user), 4),
+            "visual_alignment_model": round(float(sim_image_model), 4),
+            "semantic_overlap": round(float(sim_user_model), 4)
         },
+        "interpretation": "Strong Agreement" if sim_user_model > 0.85 else "Diverse Perspectives"
     }
 @app.post("/saliency-explorer/image")
 async def get_saliency_heatmap(file: UploadFile = File(...), query_text: str = Query(...)):
+    # 1. Load Image
+    image_bytes = await file.read()
+    orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
+    inputs = blip["processor"](images=orig_img, text=query_text, return_tensors="pt").to(DEVICE)
+    # 2. Extract Gradients for Saliency
     inputs.pixel_values.requires_grad = True
     outputs = blip["model"](**inputs, labels=inputs["input_ids"])
     loss = outputs.loss
     loss.backward()
+    # Get max gradient across channels
     grad = inputs.pixel_values.grad.abs().max(dim=1)[0][0].cpu().numpy()
+    # 3. Create Heatmap with Matplotlib
+    # Normalize to [0, 1]
     grad = (grad - grad.min()) / (grad.max() - grad.min() + 1e-8)
+    # Apply color map (jet) and convert to RGBA
+    cm = plt.get_cmap('jet')
+    heatmap_rgba = cm(grad) # This creates an NxMx4 array
+    # Convert heatmap to PIL Image and resize to match original
+    heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")
+    heatmap_img = heatmap_img.resize(orig_img.size, resample=Image.BILINEAR)
+    # 4. Blend Original + Heatmap
+    # 0.6 alpha for original, 0.4 for heatmap
+    blended_img = Image.blend(orig_img, heatmap_img, alpha=0.4)
+    # 5. Stream back
+    buf = io.BytesIO()
+    blended_img.save(buf, format="PNG")
+    buf.seek(0)
+    return StreamingResponse(buf, media_type="image/png")