Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Files Community

SaniaE committed 4 days ago

Commit

6d073fb

verified ·

1 Parent(s): d0c7c50

testing further optimizations

Browse files

Files changed (1) hide show

app.py +43 -30

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ from transformers import (
     CLIPModel, CLIPProcessor
 )
-app = FastAPI(title="Optimized Dual-Ensemble XAI Auditor Backend")
 app.add_middleware(
     CORSMiddleware,
@@ -32,7 +32,6 @@ app.add_middleware(
 )
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# Use float16 on GPU to slice memory overhead in half; fallback to float32 on CPU
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 MODELS = {}
@@ -42,50 +41,54 @@ async def startup_event():
     token = os.getenv("HF_Token")
     if token: login(token=token)
-    print("Syncing ensemble weights...")
     local_dir = snapshot_download(repo_id="SaniaE/Image_Captioning_Ensemble", token=token, local_dir="weights")
-    # 1. Load BLIP-Large (.half() cuts VRAM allocation in half)
     blip_model = BlipForConditionalGeneration.from_pretrained(os.path.join(local_dir, "blip"))
     MODELS["blip"] = {
-        "model": blip_model.to(device=DEVICE, dtype=DTYPE),
         "processor": BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
     }
-    # 2. Load ViT / Descriptive Language Model Track
-    # Point this to your new fine-tuned folder/repo path once your retraining runs are complete
     vit_model = AutoModelForCausalLM.from_pretrained(os.path.join(local_dir, "vit"))
     MODELS["vit"] = {
-        "model": vit_model.to(device=DEVICE, dtype=DTYPE),
         "processor": (
             ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning"),
             AutoProcessor.from_pretrained("microsoft/git-large")
         )
     }
-    # 3. Load CLIP Jury
     clip_model = CLIPModel.from_pretrained(os.path.join(local_dir, "clip/clip_model"))
     MODELS["clip"] = {
         "model": clip_model.to(device=DEVICE, dtype=DTYPE),
         "processor": CLIPProcessor.from_pretrained(os.path.join(local_dir, "clip/clip_processor"))
     }
-    print("Dual-ensemble successfully loaded in optimized low-precision layout.")
-# --- Optimized Core Utility ---
 def _generate_batched_ensemble(selection, image, temp, top_k, top_p, max_len=20):
-    """高效的双模型批处理生成引擎 (Optimized Dual-Model Batch Generation Engine)"""
     counts = {arch: selection.count(arch) for arch in ["blip", "vit"]}
     results_map = {"blip": [], "vit": []}
     with torch.inference_mode():
-        # ---- 1. Optimized BLIP Pass ----
         if counts["blip"] > 0:
             b_data = MODELS["blip"]
             inputs = b_data["processor"](images=image, return_tensors="pt")
-            # Ensure pixel tensors match our low-precision runtime configuration
             pixel_values = inputs.pixel_values.to(device=DEVICE, dtype=DTYPE)
             batched_pixels = pixel_values.repeat(counts["blip"], 1, 1, 1)
@@ -101,12 +104,20 @@ def _generate_batched_ensemble(selection, image, temp, top_k, top_p, max_len=20)
             decoded = b_data["processor"].batch_decode(ids, skip_special_tokens=True)
             results_map["blip"] = [cap.strip() for cap in decoded]
-        # ---- 2. Optimized ViT/GIT Pass ----
         if counts["vit"] > 0:
             v_data = MODELS["vit"]
-            i_proc, t_proc = v_data["processor"]
             inputs = i_proc(images=image, return_tensors="pt")
             pixel_values = inputs.pixel_values.to(device=DEVICE, dtype=DTYPE)
             batched_pixels = pixel_values.repeat(counts["vit"], 1, 1, 1)
@@ -129,8 +140,13 @@ def _generate_batched_ensemble(selection, image, temp, top_k, top_p, max_len=20)
             decoded = t_proc.batch_decode(ids, skip_special_tokens=True)
             results_map["vit"] = [cap.strip() for cap in decoded]
-    # Map outputs back to the original order requested by your random draw selection
     final_captions = []
     blip_idx, vit_idx = 0, 0
     for arch in selection:
@@ -152,7 +168,7 @@ async def generate_captions(
     top_k: int = Query(40),
     top_p: float = Query(0.9)
 ):
-    """Generates 5 diverse captions across an explicit multi-model selection field."""
     start_time = time.perf_counter()
     image = Image.open(file.file).convert("RGB")
@@ -163,9 +179,6 @@ async def generate_captions(
         _generate_batched_ensemble, selection, image, temp, top_k, top_p, 20
     )
-    if DEVICE == "cuda": torch.cuda.empty_cache()
-    gc.collect()
     elapsed_time = time.perf_counter() - start_time
     print(f"[BENCHMARK] /generate dual-ensemble turnaround: {elapsed_time:.4f}s")
@@ -185,6 +198,7 @@ async def get_vision_saliency(file: UploadFile = File(...)):
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
     inputs = blip["processor"](images=orig_img, return_tensors="pt")
     pixel_values = inputs.pixel_values.to(device=DEVICE, dtype=DTYPE)
@@ -195,6 +209,12 @@ async def get_vision_saliency(file: UploadFile = File(...)):
         grid_size = int(np.sqrt(mask_1d.shape[-1]))
         mask = mask_1d.view(grid_size, grid_size).cpu().numpy()
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
     w, h = orig_img.size
     mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_CUBIC)
@@ -211,9 +231,6 @@ async def get_vision_saliency(file: UploadFile = File(...)):
     blended_img.save(buf, format="PNG")
     buf.seek(0)
-    if DEVICE == "cuda": torch.cuda.empty_cache()
-    gc.collect()
     return StreamingResponse(buf, media_type="image/png")
 @app.post("/audit")
@@ -223,12 +240,12 @@ async def internal_debate_audit(file: UploadFile = File(...), user_prompt: str =
     image_bytes = await file.read()
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    # 1. Deterministic Base Prediction Pass
     blip_caption = (await asyncio.to_thread(
         _generate_batched_ensemble, ["blip"], image, 1.0, 1, 1.0, 20
     ))[0]
-    # 2. Low-Precision Decoupled CLIP Scoring Matrix
     clip_m = MODELS["clip"]["model"]
     clip_p = MODELS["clip"]["processor"]
@@ -236,7 +253,6 @@ async def internal_debate_audit(file: UploadFile = File(...), user_prompt: str =
     text_inputs = clip_p(text=[user_prompt, blip_caption], return_tensors="pt", padding=True)
     with torch.inference_mode():
-        # Move inputs to device and cast features dynamically
         img_pixels = image_inputs.pixel_values.to(device=DEVICE, dtype=DTYPE)
         txt_ids = text_inputs.input_ids.to(DEVICE)
         txt_mask = text_inputs.attention_mask.to(DEVICE)
@@ -254,9 +270,6 @@ async def internal_debate_audit(file: UploadFile = File(...), user_prompt: str =
     verdict = "Model Bias Detected." if abs(u_score - m_score) >= 0.15 else "Consensus: High Alignment."
     if u_score < 0.35: verdict = "Perspective Divergence: Intent not grounded in image."
-    if DEVICE == "cuda": torch.cuda.empty_cache()
-    gc.collect()
     return {
         "perspectives": {"user": user_prompt, "ai": blip_caption},
         "audit_scores": {"intent_grounding": round(u_score, 4), "ai_grounding": round(m_score, 4)},

     CLIPModel, CLIPProcessor
 )
+app = FastAPI(title="XAI Auditor: Hot-Swapping Dual Ensemble")
 app.add_middleware(
     CORSMiddleware,
 )
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 MODELS = {}
     token = os.getenv("HF_Token")
     if token: login(token=token)
+    print("Syncing dual-ensemble weights from repository...")
     local_dir = snapshot_download(repo_id="SaniaE/Image_Captioning_Ensemble", token=token, local_dir="weights")
+    # 1. Initialize BLIP-Large on CPU
     blip_model = BlipForConditionalGeneration.from_pretrained(os.path.join(local_dir, "blip"))
     MODELS["blip"] = {
+        "model": blip_model.to(dtype=DTYPE),  # Kept on CPU until called
         "processor": BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
     }
+    # 2. Initialize ViT/GIT Tracker on CPU
     vit_model = AutoModelForCausalLM.from_pretrained(os.path.join(local_dir, "vit"))
     MODELS["vit"] = {
+        "model": vit_model.to(dtype=DTYPE),   # Kept on CPU until called
         "processor": (
             ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning"),
             AutoProcessor.from_pretrained("microsoft/git-large")
         )
     }
+    # 3. Load Fine-Tuned CLIP Jury onto active hardware (Crucial for fast parallel scoring)
     clip_model = CLIPModel.from_pretrained(os.path.join(local_dir, "clip/clip_model"))
     MODELS["clip"] = {
         "model": clip_model.to(device=DEVICE, dtype=DTYPE),
         "processor": CLIPProcessor.from_pretrained(os.path.join(local_dir, "clip/clip_processor"))
     }
+    print("Ensemble pipeline initialized with CPU-backed hot-swapping optimization.")
+# --- Hot-Swapping Core Logic ---
 def _generate_batched_ensemble(selection, image, temp, top_k, top_p, max_len=20):
+    """
+    Executes inference by isolating model execution windows to prevent VRAM thrashing.
+    """
     counts = {arch: selection.count(arch) for arch in ["blip", "vit"]}
     results_map = {"blip": [], "vit": []}
     with torch.inference_mode():
+        # ---- 1. Isolated BLIP Window ----
         if counts["blip"] > 0:
             b_data = MODELS["blip"]
+            # Hot-load weights directly onto active device
+            b_data["model"].to(DEVICE)
             inputs = b_data["processor"](images=image, return_tensors="pt")
             pixel_values = inputs.pixel_values.to(device=DEVICE, dtype=DTYPE)
             batched_pixels = pixel_values.repeat(counts["blip"], 1, 1, 1)
             decoded = b_data["processor"].batch_decode(ids, skip_special_tokens=True)
             results_map["blip"] = [cap.strip() for cap in decoded]
+            # Evict the model back to CPU memory so its VRAM can be released
+            b_data["model"].to("cpu")
+            if DEVICE == "cuda": torch.cuda.empty_cache()
+            gc.collect()
+        # ---- 2. Isolated ViT Window ----
         if counts["vit"] > 0:
             v_data = MODELS["vit"]
+            # Hot-load model weights now that BLIP has completely cleared out
+            v_data["model"].to(DEVICE)
+            i_proc, t_proc = v_data["processor"]
             inputs = i_proc(images=image, return_tensors="pt")
             pixel_values = inputs.pixel_values.to(device=DEVICE, dtype=DTYPE)
             batched_pixels = pixel_values.repeat(counts["vit"], 1, 1, 1)
             decoded = t_proc.batch_decode(ids, skip_special_tokens=True)
             results_map["vit"] = [cap.strip() for cap in decoded]
+            # Clear device footprints immediately
+            v_data["model"].to("cpu")
+            if DEVICE == "cuda": torch.cuda.empty_cache()
+            gc.collect()
+    # Align predictions back to original random generation array
     final_captions = []
     blip_idx, vit_idx = 0, 0
     for arch in selection:
     top_k: int = Query(40),
     top_p: float = Query(0.9)
 ):
+    """Generates 5 diverse captions via a hot-swapping tensor batching routine."""
     start_time = time.perf_counter()
     image = Image.open(file.file).convert("RGB")
         _generate_batched_ensemble, selection, image, temp, top_k, top_p, 20
     )
     elapsed_time = time.perf_counter() - start_time
     print(f"[BENCHMARK] /generate dual-ensemble turnaround: {elapsed_time:.4f}s")
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
+    blip["model"].to(DEVICE)  # Bring up to map attentions
     inputs = blip["processor"](images=orig_img, return_tensors="pt")
     pixel_values = inputs.pixel_values.to(device=DEVICE, dtype=DTYPE)
         grid_size = int(np.sqrt(mask_1d.shape[-1]))
         mask = mask_1d.view(grid_size, grid_size).cpu().numpy()
+    # Offload right after extraction
+    blip["model"].to("cpu")
+    if DEVICE == "cuda": torch.cuda.empty_cache()
+    gc.collect()
+    # Normalize the attention mask and build the heatmap with OpenCV
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
     w, h = orig_img.size
     mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_CUBIC)
     blended_img.save(buf, format="PNG")
     buf.seek(0)
     return StreamingResponse(buf, media_type="image/png")
 @app.post("/audit")
     image_bytes = await file.read()
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    # 1. Get deterministic baseline prediction string
     blip_caption = (await asyncio.to_thread(
         _generate_batched_ensemble, ["blip"], image, 1.0, 1, 1.0, 20
     ))[0]
+    # 2. Match Embeddings (CLIP stays pinned to target hardware device)
     clip_m = MODELS["clip"]["model"]
     clip_p = MODELS["clip"]["processor"]
     text_inputs = clip_p(text=[user_prompt, blip_caption], return_tensors="pt", padding=True)
     with torch.inference_mode():
         img_pixels = image_inputs.pixel_values.to(device=DEVICE, dtype=DTYPE)
         txt_ids = text_inputs.input_ids.to(DEVICE)
         txt_mask = text_inputs.attention_mask.to(DEVICE)
     verdict = "Model Bias Detected." if abs(u_score - m_score) >= 0.15 else "Consensus: High Alignment."
     if u_score < 0.35: verdict = "Perspective Divergence: Intent not grounded in image."
     return {
         "perspectives": {"user": user_prompt, "ai": blip_caption},
         "audit_scores": {"intent_grounding": round(u_score, 4), "ai_grounding": round(m_score, 4)},