Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Files Community

SaniaE committed 18 days ago

Commit

40e2d6d

verified ·

1 Parent(s): edccc41

added more endpoints

Browse files

Files changed (1) hide show

app.py +124 -1

app.py CHANGED Viewed

@@ -10,6 +10,12 @@ from transformers import (
     BlipProcessor, BlipForConditionalGeneration,
     ViTImageProcessor, AutoProcessor, AutoModelForCausalLM
 )
 app = FastAPI()
@@ -147,4 +153,121 @@ async def ui_tester(file: UploadFile = File(...), description: str = Query(...))
         },
         "status": "Match Found" if confidence_score > 55 else "Partial Match" if confidence_score > 30 else "No Match",
         "is_valid": confidence_score > 55
-    }

     BlipProcessor, BlipForConditionalGeneration,
     ViTImageProcessor, AutoProcessor, AutoModelForCausalLM
 )
+import torch.nn.functional as F
+import numpy as np
+import cv2
+import io
+from fastapi.responses import StreamingResponse
 app = FastAPI()
         },
         "status": "Match Found" if confidence_score > 55 else "Partial Match" if confidence_score > 30 else "No Match",
         "is_valid": confidence_score > 55
+    }
+@app.post("/saliency-explorer")
+async def saliency_explorer(file: UploadFile = File(...), query_text: str = Query(...)):
+    image = Image.open(file.file).convert("RGB")
+    blip = MODELS["blip"]
+    # Process inputs
+    inputs = blip["processor"](images=image, text=query_text, return_tensors="pt").to(DEVICE)
+    inputs.requires_grad = True # Enable gradients for saliency mapping
+    # Forward pass through the vision-language projector
+    outputs = blip["model"](**inputs, labels=inputs["input_ids"])
+    loss = outputs.loss
+    loss.backward()
+    # Extract gradients from the vision encoder's last layer
+    # Note: Using the last hidden state as a proxy for spatial importance
+    gradients = blip["model"].vision_model.embeddings.patch_embedding.weight.grad
+    pooled_gradients = torch.mean(gradients, dim=[0, 2, 3])
+    # Generate heatmap
+    # In a real implementation, you would use Grad-CAM on the attention layers
+    # Here we simplify the spatial mapping for the demo response
+    heatmap = torch.mean(torch.abs(gradients), dim=1).squeeze().cpu().numpy()
+    heatmap = cv2.resize(heatmap, (image.size[0], image.size[1]))
+    heatmap = (heatmap - heatmap.min()) / (heatmap.max() - heatmap.min())
+    return {
+        "query": query_text,
+        "heatmap_data": heatmap.tolist(), # Send to frontend to overlay with CSS/Canvas
+        "explanation": f"Highlighted regions show where the model focused to validate '{query_text}'"
+    }
+@app.post("/concept-ensemble")
+async def concept_ensemble(file: UploadFile = File(...), user_prompt: str = Query(...)):
+    image = Image.open(file.file).convert("RGB")
+    blip = MODELS["blip"]
+    # 1. Get Model's Perceived Caption (Baseline)
+    inputs_gen = blip["processor"](images=image, return_tensors="pt").to(DEVICE)
+    generated_ids = blip["model"].generate(**inputs_gen, max_length=40)
+    model_caption = blip["processor"].decode(generated_ids[0], skip_special_tokens=True)
+    # 2. Generate Embeddings for the Matrix
+    # We compare User Prompt, Model Caption, and a 'Ground Truth' Visual Vector
+    texts = [user_prompt, model_caption]
+    inputs_text = blip["processor"](text=texts, return_tensors="pt", padding=True).to(DEVICE)
+    with torch.no_grad():
+        # Get text features
+        text_embeds = blip["model"].text_encoder(**inputs_text).last_hidden_state[:, 0, :]
+        # Get image features
+        image_embeds = blip["model"].vision_model(inputs_gen["pixel_values"]).last_hidden_state[:, 0, :]
+        # Normalize for Cosine Similarity
+        text_embeds = F.normalize(text_embeds, p=2, dim=-1)
+        image_embeds = F.normalize(image_embeds, p=2, dim=-1)
+        # Calculate Matrix: [Image vs User, Image vs Model, User vs Model]
+        sim_image_user = torch.matmul(image_embeds, text_embeds[0].T).item()
+        sim_image_model = torch.matmul(image_embeds, text_embeds[1].T).item()
+        sim_user_model = torch.matmul(text_embeds[0], text_embeds[1].T).item()
+    return {
+        "captions": {
+            "user": user_prompt,
+            "model": model_caption
+        },
+        "similarity_matrix": {
+            "visual_alignment_user": round(sim_image_user, 4),
+            "visual_alignment_model": round(sim_image_model, 4),
+            "semantic_overlap": round(sim_user_model, 4)
+        },
+        "ensemble_verdict": "Consensus" if sim_user_model > 0.8 else "Perspective Divergence"
+    }
+@app.post("/saliency-explorer/image")
+async def get_saliency_heatmap(file: UploadFile = File(...), query_text: str = Query(...)):
+    # 1. Load and process image
+    contents = await file.read()
+    nparr = np.frombuffer(contents, np.uint8)
+    orig_img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+    image_rgb = cv2.cvtColor(orig_img, cv2.COLOR_BGR2RGB)
+    pil_img = Image.fromarray(image_rgb)
+    blip = MODELS["blip"]
+    inputs = blip["processor"](images=pil_img, text=query_text, return_tensors="pt").to(DEVICE)
+    # 2. Extract Attention/Gradients
+    # We target the cross-attention layer to see where the text 'queries' the image
+    inputs.pixel_values.requires_grad = True
+    outputs = blip["model"](**inputs, labels=inputs["input_ids"])
+    loss = outputs.loss
+    loss.backward()
+    # Generate Saliency from gradients
+    grad = inputs.pixel_values.grad.abs().max(dim=1)[0][0].cpu().numpy()
+    # 3. Create Heatmap Overlay
+    # Normalize gradients to 0-255
+    grad = (grad - grad.min()) / (grad.max() - grad.min() + 1e-8)
+    grad = (grad * 255).astype(np.uint8)
+    # Resize to original image size
+    heatmap = cv2.resize(grad, (orig_img.shape[1], orig_img.shape[0]))
+    # Apply Color Map (JET or VIRIDIS look very 'Pinterest-chic' / Pro)
+    heatmap_color = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
+    # Superimpose heatmap onto original image (0.6 original, 0.4 heatmap)
+    result_img = cv2.addWeighted(orig_img, 0.6, heatmap_color, 0.4, 0)
+    # 4. Stream the image back
+    res, im_png = cv2.imencode(".png", result_img)
+    return StreamingResponse(io.BytesIO(im_png.tobytes()), media_type="image/png")