Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Community

SaniaE committed on May 5

Commit

8b9f879

verified ·

1 Parent(s): 4debe0a

updated endpoint logic

Browse files

Files changed (1) hide show

app.py +73 -57

app.py CHANGED Viewed

@@ -83,94 +83,110 @@ def _generate_sync(m_name, image, temp=0.7):
 # --- Endpoint 1: The Multi-Perspective Generator ---
-@app.post("/generate-caption")
-async def generate_caption(file: UploadFile = File(...), temp: float = Query(0.7)):
-    image_bytes = await file.read()
-    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    # Run both architectures in parallel
-    tasks = [
-        asyncio.to_thread(_generate_sync, "blip", image, temp),
-        asyncio.to_thread(_generate_sync, "vit", image, temp)
-    ]
     captions = await asyncio.gather(*tasks)
-    return {
-        "blip_caption": captions[0],
-        "vit_git_caption": captions[1]
-    }
-# --- Endpoint 2: The Saliency Explorer (XAI Glow) ---
-@app.post("/saliency-explorer")
-async def get_saliency_map(file: UploadFile = File(...), query_text: str = Query(...)):
     image_bytes = await file.read()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
-    inputs = blip["processor"](images=orig_img, text=query_text, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
-        vision_hidden = blip["model"].vision_model(inputs.pixel_values).last_hidden_state
-        outputs = blip["model"].text_decoder(
-            input_ids=inputs.input_ids,
-            attention_mask=inputs.attention_mask,
-            encoder_hidden_states=vision_hidden,
             output_attentions=True
         )
-        # Slicing out the [CLS] token from cross-attentions
-        cross_attentions = outputs.cross_attentions[-1]
-        mask_1d = cross_attentions[0, :, 1:-1, 1:].mean(dim=(0, 1))
         grid_size = int(np.sqrt(mask_1d.shape[-1]))
         mask = mask_1d.view(grid_size, grid_size).cpu().numpy()
-    # Normalization & XAI Glow Application
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
     mask_pill = Image.fromarray((mask * 255).astype('uint8')).resize(orig_img.size, resample=Image.BICUBIC)
-    mask_pill = mask_pill.filter(ImageFilter.GaussianBlur(radius=12))
-    heatmap_rgba = plt.get_cmap('jet')(np.array(mask_pill)/255.0)
     heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")
-    blended_img = Image.blend(orig_img, heatmap_img, alpha=0.5)
     buf = io.BytesIO()
     blended_img.save(buf, format="PNG")
     buf.seek(0)
     return StreamingResponse(buf, media_type="image/png")
 # --- Endpoint 3: Internal Debate (Audit Mode) ---
-@app.post("/internal-debate")
-async def internal_debate(file: UploadFile = File(...), user_prompt: str = Query(...)):
-    image_bytes = await file.read()
-    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    # 1. Gather model perceptions
-    blip_caption = await asyncio.to_thread(_generate_sync, "blip", image)
-    vit_caption = await asyncio.to_thread(_generate_sync, "vit", image)
-    # 2. Semantic Embedding Logic
-    blip_data = MODELS["blip"]
-    def get_emb(text):
-        inputs = blip_data["processor"](text=text, return_tensors="pt", padding=True).to(DEVICE)
         with torch.no_grad():
-            return F.normalize(blip_data["model"].text_decoder.bert(**inputs).last_hidden_state.mean(dim=1), p=2, dim=-1)
-    u_emb = get_emb(user_prompt)
-    b_emb = get_emb(blip_caption)
-    v_emb = get_emb(vit_caption)
-    # 3. MLE Calibration (Jaccard Weighting)
-    def calibrate(emb1, emb2, t1, t2):
-        s1, s2 = set(t1.lower().split()), set(t2.lower().split())
-        jaccard = len(s1 & s2) / len(s1 | s2) if s1 | s2 else 0
-        cosine = torch.matmul(emb1, emb2.T).item()
-        return (cosine * 0.4) + (jaccard * 0.6)
-    score_blip = calibrate(u_emb, b_emb, user_prompt, blip_caption)
-    score_vit = calibrate(u_emb, v_emb, user_prompt, vit_caption)
-    consensus = calibrate(b_emb, v_emb, blip_caption, vit_caption)
     return {
         "perspectives": {
@@ -179,9 +195,9 @@ async def internal_debate(file: UploadFile = File(...), user_prompt: str = Query
             "vit_git_view": vit_caption
         },
         "audit_metrics": {
-            "user_vs_blip": round(score_blip, 4),
-            "user_vs_vit": round(score_vit, 4),
             "inter_model_consensus": round(consensus, 4)
         },
-        "verdict": "Consensus" if consensus > 0.65 else "Perspective Divergence"
     }

 # --- Endpoint 1: The Multi-Perspective Generator ---
+@app.post("/generate")
+async def generate_endpoint(
+    file: UploadFile = File(...),
+    temp: float = Query(0.8),
+    top_k: int = Query(50),
+    top_p: float = Query(0.9)
+):
+    image = Image.open(file.file).convert("RGB")
+    available = ["blip", "vit"]
+    # Generate 5 captions using a mix of models
+    model_selection = random.choices(available, k=5)
+    tasks = [asyncio.to_thread(_generate_sync, m, image, temp, top_k, top_p) for m in model_selection]
     captions = await asyncio.gather(*tasks)
+    return {"captions": captions, "architectures": model_selection}
+# --- Endpoint 2: Objective Vision Saliency (Static Image Perception) ---
+@app.post("/saliency-explorer/vision")
+async def get_objective_saliency(file: UploadFile = File(...)):
     image_bytes = await file.read()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
+    inputs = blip["processor"](images=orig_img, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
+        # Capturing Self-Attention from the Vision Encoder itself
+        # This shows what the model finds interesting in the image, regardless of prompt
+        outputs = blip["model"].vision_model(
+            inputs.pixel_values,
             output_attentions=True
         )
+        # Last layer attention: (batch, heads, patches, patches)
+        attentions = outputs.attentions[-1]
+        # Average across heads and focus on CLS token's view of the patches
+        # Patch grid for BLIP-Large is typically 24x24 (576 patches + 1 CLS)
+        nh = attentions.shape[1]
+        attentional_map = attentions[0, :, 0, 1:].reshape(nh, -1)
+        mask_1d = attentional_map.mean(dim=0)
         grid_size = int(np.sqrt(mask_1d.shape[-1]))
         mask = mask_1d.view(grid_size, grid_size).cpu().numpy()
+    # Normalization and High-Contrast "Heat"
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
     mask_pill = Image.fromarray((mask * 255).astype('uint8')).resize(orig_img.size, resample=Image.BICUBIC)
+    mask_pill = mask_pill.filter(ImageFilter.GaussianBlur(radius=10))
+    heatmap_rgba = plt.get_cmap('magma')(np.array(mask_pill)/255.0)
     heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")
+    # Blending at 0.6 alpha to make the "Model's Focus" pop
+    blended_img = Image.blend(orig_img, heatmap_img, alpha=0.6)
     buf = io.BytesIO()
     blended_img.save(buf, format="PNG")
     buf.seek(0)
     return StreamingResponse(buf, media_type="image/png")
+# --- Endpoint 3: Perspective Auditor (Internal Debate) ---
 # --- Endpoint 3: Internal Debate (Audit Mode) ---
+@app.post("/audit-perspective")
+async def audit_perspective(file: UploadFile = File(...), user_prompt: str = Query(...)):
+    image = Image.open(file.file).convert("RGB")
+    # Run both models to get the "Internal Debate"
+    blip_caption = await asyncio.to_thread(_generate_sync, "blip", image, 0.7, 50, 0.9)
+    vit_caption = await asyncio.to_thread(_generate_sync, "vit", image, 0.7, 50, 0.9)
+    def get_metrics(target, reference):
+        # 1. Semantic Embedding (The "Vibe" check)
+        blip = MODELS["blip"]
+        t_in = blip["processor"](text=target, return_tensors="pt", padding=True).to(DEVICE)
+        r_in = blip["processor"](text=reference, return_tensors="pt", padding=True).to(DEVICE)
         with torch.no_grad():
+            t_emb = F.normalize(blip["model"].text_decoder.bert(**t_in).last_hidden_state.mean(dim=1), p=2, dim=-1)
+            r_emb = F.normalize(blip["model"].text_decoder.bert(**r_in).last_hidden_state.mean(dim=1), p=2, dim=-1)
+        cosine_sim = torch.matmul(t_emb, r_emb.T).item()
+        # 2. Jaccard Calibration (The "Accuracy" check - 70% weight)
+        t_words = set(target.lower().replace(",", "").split())
+        r_words = set(reference.lower().replace(",", "").split())
+        jaccard = len(t_words & r_words) / len(t_words | r_words) if t_words | r_words else 0
+        return (cosine_sim * 0.3) + (jaccard * 0.7)
+    user_vs_blip = get_metrics(user_prompt, blip_caption)
+    user_vs_vit = get_metrics(user_prompt, vit_caption)
+    consensus = get_metrics(blip_caption, vit_caption)
+    # XAI Verdict Logic
+    if consensus < 0.5:
+        verdict = "Model Confusion: High Uncertainty"
+    elif user_vs_blip < 0.6:
+        verdict = "Perspective Divergence: Prompt Mismatch"
+    else:
+        verdict = "Verified: Strong Alignment"
     return {
         "perspectives": {
             "vit_git_view": vit_caption
         },
         "audit_metrics": {
+            "user_vs_blip": round(user_vs_blip, 4),
+            "user_vs_vit": round(user_vs_vit, 4),
             "inter_model_consensus": round(consensus, 4)
         },
+        "verdict": verdict
     }