Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Files Community

SaniaE committed 4 days ago

Commit

77a0b2f

verified ·

1 Parent(s): ace7c16

added optimizations

Browse files

Files changed (1) hide show

app.py +61 -39

app.py CHANGED Viewed

@@ -6,11 +6,11 @@ import random
 import numpy as np
 import torch
 import torch.nn.functional as F
-import matplotlib.pyplot as plt
-from PIL import Image, ImageFilter
 from fastapi import FastAPI, UploadFile, File, Query
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import StreamingResponse, Response
 from huggingface_hub import snapshot_download, login
 from transformers import (
@@ -21,14 +21,13 @@ from transformers import (
 app = FastAPI(title="XAI Auditor Ensemble with CLIP Jury")
-# Enable smooth frontend cross-origin header interceptions for performance metrics
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
-    expose_headers=["X-Processing-Time", "X-Audit-Time", "X-Grounding-Verdict"]
 )
 # --- Configuration & Paths ---
@@ -81,47 +80,63 @@ async def startup_event():
         )
     }
-    # 3. Load Fine-Tuned CLIP (Your Jury)
     cfg_c = MODEL_CONFIGS["clip"]
     MODELS["clip"] = {
         "model": CLIPModel.from_pretrained(os.path.join(local_dir, cfg_c["model_subfolder"])).to(DEVICE),
         "processor": CLIPProcessor.from_pretrained(os.path.join(local_dir, cfg_c["proc_subfolder"]))
     }
-    print("All models synchronized. Auditor is active.")
 # --- Utilities ---
-def _generate_sync(m_name, image, temp, top_k, top_p):
-    m_data = MODELS[m_name]
-    if m_name == "vit":
-        i_proc, t_proc = m_data["processor"]
-        inputs = i_proc(images=image, return_tensors="pt").to(DEVICE)
-        ids = m_data["model"].generate(**inputs, max_length=80, do_sample=True, temperature=temp, top_k=top_k, top_p=top_p)
-        return t_proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
-    else:
-        proc = m_data["processor"]
-        inputs = proc(images=image, return_tensors="pt").to(DEVICE)
-        ids = m_data["model"].generate(**inputs, max_length=80, do_sample=True, temperature=temp, top_k=top_k, top_p=top_p)
-        return proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
 # --- Endpoints ---
 @app.post("/generate")
 async def generate_captions(
     file: UploadFile = File(...),
-    temp: float = Query(0.8),
-    top_k: int = Query(50),
     top_p: float = Query(0.9)
 ):
     start_time = time.perf_counter()
     image = Image.open(file.file).convert("RGB")
     architectures = ["blip", "vit"]
     selection = random.choices(architectures, k=5)
-    # Offload generative sampling loop to a worker thread pool
-    tasks = [asyncio.to_thread(_generate_sync, m, image, temp, top_k, top_p) for m in selection]
-    captions = await asyncio.gather(*tasks)
     elapsed_time = time.perf_counter() - start_time
     print(f"[BENCHMARK] /generate ensemble turnaround: {elapsed_time:.4f}s")
@@ -137,6 +152,7 @@ async def generate_captions(
 @app.post("/saliency")
 async def get_vision_saliency(file: UploadFile = File(...)):
     start_time = time.perf_counter()
     image_bytes = await file.read()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
@@ -151,37 +167,44 @@ async def get_vision_saliency(file: UploadFile = File(...)):
         grid_size = int(np.sqrt(mask_1d.shape[-1]))
         mask = mask_1d.view(grid_size, grid_size).cpu().numpy()
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
-    mask_img = Image.fromarray((mask * 255).astype('uint8')).resize(orig_img.size, resample=Image.BICUBIC)
-    mask_img = mask_img.filter(ImageFilter.GaussianBlur(radius=10))
-    heatmap = plt.get_cmap('magma')(np.array(mask_img)/255.0)
-    heatmap_img = Image.fromarray((heatmap[:, :, :3] * 255).astype('uint8')).convert("RGB")
-    blended = Image.blend(orig_img, heatmap_img, alpha=0.6)
     buf = io.BytesIO()
-    blended.save(buf, format="PNG")
     buf.seek(0)
     elapsed_time = time.perf_counter() - start_time
     print(f"[BENCHMARK] /saliency last-layer map turnaround: {elapsed_time:.4f}s")
-    return StreamingResponse(
-        buf,
-        media_type="image/png",
-        headers={"X-Processing-Time": f"{elapsed_time:.4f}"}
-    )
 @app.post("/audit")
 async def internal_debate_audit(file: UploadFile = File(...), user_prompt: str = Query(...)):
     start_time = time.perf_counter()
     image_bytes = await file.read()
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    # 1. Model Perception
-    blip_caption = await asyncio.to_thread(_generate_sync, "blip", image, 0.7, 50, 0.9)
-    # 2. CLIP Scoring (Multimodal Alignment)
     clip_m = MODELS["clip"]["model"]
     clip_p = MODELS["clip"]["processor"]
@@ -193,7 +216,6 @@ async def internal_debate_audit(file: UploadFile = File(...), user_prompt: str =
     u_score, m_score = float(probs[0]), float(probs[1])
-    # 3. Decision Logic
     if u_score < 0.35:
         verdict = "Perspective Divergence: Intent not grounded in image."
     elif abs(u_score - m_score) < 0.15:

 import numpy as np
 import torch
 import torch.nn.functional as F
+import cv2
+from PIL import Image
 from fastapi import FastAPI, UploadFile, File, Query
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
 from huggingface_hub import snapshot_download, login
 from transformers import (
 app = FastAPI(title="XAI Auditor Ensemble with CLIP Jury")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
+    expose_headers=["X-Processing-Time", "X-Audit-Time"]
 )
 # --- Configuration & Paths ---
         )
     }
+    # 3. Load Fine-Tuned CLIP
     cfg_c = MODEL_CONFIGS["clip"]
     MODELS["clip"] = {
         "model": CLIPModel.from_pretrained(os.path.join(local_dir, cfg_c["model_subfolder"])).to(DEVICE),
         "processor": CLIPProcessor.from_pretrained(os.path.join(local_dir, cfg_c["proc_subfolder"]))
     }
+    print("All models synchronized. Ensemble backend is active.")
 # --- Utilities ---
+def _generate_sync_batch(selection, image, temp, top_k, top_p, max_len=45, do_sample=True):
+    """Processes generation sequentially to eliminate CPU context-switching overhead."""
+    captions = []
+    for m_name in selection:
+        m_data = MODELS[m_name]
+        if m_name == "vit":
+            i_proc, t_proc = m_data["processor"]
+            inputs = i_proc(images=image, return_tensors="pt").to(DEVICE)
+            # Cap max_new_tokens for snappier generation runtimes
+            ids = m_data["model"].generate(
+                **inputs, max_new_tokens=max_len, do_sample=do_sample,
+                temperature=temp if do_sample else None,
+                top_k=top_k if do_sample else None,
+                top_p=top_p if do_sample else None
+            )
+            caption = t_proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
+        else:
+            proc = m_data["processor"]
+            inputs = proc(images=image, return_tensors="pt").to(DEVICE)
+            ids = m_data["model"].generate(
+                **inputs, max_new_tokens=max_len, do_sample=do_sample,
+                temperature=temp if do_sample else None,
+                top_k=top_k if do_sample else None,
+                top_p=top_p if do_sample else None
+            )
+            caption = proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
+        captions.append(caption)
+    return captions
 # --- Endpoints ---
 @app.post("/generate")
 async def generate_captions(
     file: UploadFile = File(...),
+    temp: float = Query(0.7),
+    top_k: int = Query(40),
     top_p: float = Query(0.9)
 ):
+    """Generates 5 diverse captions using an optimized sequential pipeline."""
     start_time = time.perf_counter()
     image = Image.open(file.file).convert("RGB")
     architectures = ["blip", "vit"]
     selection = random.choices(architectures, k=5)
+    # Run loop sequentially inside a thread worker to safely dodge GIL contention
+    captions = await asyncio.to_thread(_generate_sync_batch, selection, image, temp, top_k, top_p, 45, True)
     elapsed_time = time.perf_counter() - start_time
     print(f"[BENCHMARK] /generate ensemble turnaround: {elapsed_time:.4f}s")
 @app.post("/saliency")
 async def get_vision_saliency(file: UploadFile = File(...)):
+    """Objective Saliency: Highly optimized native vision encoder self-attention mapping."""
     start_time = time.perf_counter()
     image_bytes = await file.read()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         grid_size = int(np.sqrt(mask_1d.shape[-1]))
         mask = mask_1d.view(grid_size, grid_size).cpu().numpy()
+    # Normalize attention matrix
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
+    # Vectorized OpenCV handling for super fast image processing
+    w, h = orig_img.size
+    mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_CUBIC)
+    mask_blurred = cv2.GaussianBlur(mask_resized, (21, 21), 0)
+    # Convert normalized heatmap to standard color map space
+    heatmap_uint8 = np.uint8(255 * mask_blurred)
+    heatmap_bgr = cv2.applyColorMap(heatmap_uint8, cv2.COLORMAP_MAGMA)
+    heatmap_rgb = cv2.cvtColor(heatmap_bgr, cv2.COLOR_BGR2RGB)
+    # Composite overlay mix
+    orig_np = np.array(orig_img)
+    blended_np = cv2.addWeighted(orig_np, 0.5, heatmap_rgb, 0.5, 0)
+    blended_img = Image.fromarray(blended_np)
     buf = io.BytesIO()
+    blended_img.save(buf, format="PNG")
     buf.seek(0)
     elapsed_time = time.perf_counter() - start_time
     print(f"[BENCHMARK] /saliency last-layer map turnaround: {elapsed_time:.4f}s")
+    return StreamingResponse(buf, media_type="image/png")
 @app.post("/audit")
 async def internal_debate_audit(file: UploadFile = File(...), user_prompt: str = Query(...)):
+    """The CLIP-Powered Jury: Fast deterministic grounding verification track."""
     start_time = time.perf_counter()
     image_bytes = await file.read()
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    # OPTIMIZATION: Greedy decoding (do_sample=False) + short length constraint
+    blip_caption = (await asyncio.to_thread(_generate_sync_batch, ["blip"], image, 1.0, 1, 1.0, 25, False))[0]
+    # CLIP Scoring
     clip_m = MODELS["clip"]["model"]
     clip_p = MODELS["clip"]["processor"]
     u_score, m_score = float(probs[0]), float(probs[1])
     if u_score < 0.35:
         verdict = "Perspective Divergence: Intent not grounded in image."
     elif abs(u_score - m_score) < 0.15: