Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Files Community

SaniaE committed 28 days ago

Commit

50e497f

verified ·

1 Parent(s): 9fef689

optimized code

Browse files

Files changed (1) hide show

app.py +35 -69

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import os
-import asyncio
 import torch
 import random
 from PIL import Image
 from fastapi import FastAPI, UploadFile, File, Query
 from fastapi.middleware.cors import CORSMiddleware
@@ -10,7 +10,6 @@ from transformers import (
     BlipProcessor, BlipForConditionalGeneration,
     ViTImageProcessor, AutoProcessor, AutoModelForCausalLM
 )
-from sentence_transformers import SentenceTransformer, util
 app = FastAPI()
@@ -18,9 +17,8 @@ app = FastAPI()
 REPO_ID = "SaniaE/Image_Captioning_Ensemble"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {}
-SEARCH_MODEL = None
-# We'll map your local folder names to the specific config
 MODEL_SETTINGS = {
     "blip": {
         "subfolder": "blip",
@@ -33,72 +31,54 @@ MODEL_SETTINGS = {
         "processor": [ViTImageProcessor, AutoProcessor],
         "pretrained_path": ["nlpconnect/vit-gpt2-image-captioning", "microsoft/git-large"],
         "inference_model": AutoModelForCausalLM
-    },
-    "git": {
-        "subfolder": "git",
-        "processor": AutoProcessor,
-        "pretrained_path": "microsoft/git-base",
-        "inference_model": AutoModelForCausalLM
     }
 }
 @app.on_event("startup")
 async def startup_event():
-    global MODELS, SEARCH_MODEL
-    # 1. Authenticate and Download from Private Repo
-    token = os.getenv("HF_Token")
-    if token:
-        login(token=token)
-    print(f"Downloading ensemble models from {REPO_ID}...")
-    # This downloads the whole repo into a local 'weights' directory
     local_dir = snapshot_download(repo_id=REPO_ID, token=token, local_dir="weights")
-    # 2. Load Models from the downloaded folders
     for name, cfg in MODEL_SETTINGS.items():
         ckpt_path = os.path.join(local_dir, cfg["subfolder"])
-        inf_model = cfg["inference_model"]
-        pretrained = cfg["pretrained_path"]
-        proc_class = cfg["processor"]
         print(f"Loading {name} from {ckpt_path}...")
-        # from_pretrained handles .safetensors automatically
-        model = inf_model.from_pretrained(ckpt_path).to(DEVICE)
         if name == "vit":
-            i_proc = proc_class[0].from_pretrained(pretrained[0])
-            t_proc = proc_class[1].from_pretrained(pretrained[1])
             processor = (i_proc, t_proc)
         else:
-            processor = proc_class.from_pretrained(pretrained)
         MODELS[name] = {"model": model, "processor": processor}
-    SEARCH_MODEL = SentenceTransformer('clip-ViT-B-32')
-    print("Ensemble is live!")
-async def run_inference(m_name, image, temp, top_k, top_p):
-    # This runs in a separate thread to avoid blocking the event loop
-    return await asyncio.to_thread(_generate_sync, m_name, image, temp, top_k, top_p)
 def _generate_sync(m_name, image, temp, top_k, top_p):
     m_data = MODELS[m_name]
     model = m_data["model"]
     if m_name == "vit":
         i_proc, t_proc = m_data["processor"]
-        pixel_values = i_proc(images=image, return_tensors="pt").pixel_values.to(DEVICE)
         gen_ids = model.generate(
-            pixel_values=pixel_values, max_length=300, do_sample=True,
             temperature=temp, top_k=top_k, top_p=top_p
         )
         return t_proc.batch_decode(gen_ids, skip_special_tokens=True)[0].strip()
     else:
         proc = m_data["processor"]
-        pixel_values = proc(images=image, return_tensors="pt").pixel_values.to(DEVICE)
         gen_ids = model.generate(
-            pixel_values=pixel_values, max_length=300, do_sample=True,
             temperature=temp, top_k=top_k, top_p=top_p
         )
         return proc.batch_decode(gen_ids, skip_special_tokens=True)[0].strip()
@@ -111,47 +91,33 @@ async def generate_endpoint(
     top_p: float = Query(0.9)
 ):
     image = Image.open(file.file).convert("RGB")
-    available = list(MODELS.keys())
     model_selection = random.choices(available, k=5)
-    # Create tasks for parallel execution
-    tasks = [run_inference(m, image, temp, top_k, top_p) for m in model_selection]
     captions = await asyncio.gather(*tasks)
     return {"captions": captions, "mix": model_selection}
 @app.post("/ui-tester")
 async def ui_tester(file: UploadFile = File(...), description: str = Query(...)):
-    """Matches a user description against an image using CLIP embeddings."""
     image = Image.open(file.file).convert("RGB")
-    img_emb = SEARCH_MODEL.encode(image)
-    txt_emb = SEARCH_MODEL.encode(description)
-    # Calculate cosine similarity
-    score = util.cos_sim(img_emb, txt_emb).item()
     return {
         "match_score": round(score, 4),
-        "is_match": score > 0.25, # Threshold can be adjusted
-        "status": "High correlation" if score > 0.3 else "Low correlation"
-    }
-@app.get("/ui-search")
-async def ui_search(description: str = Query(...)):
-    """Returns top image matches from a gallery based on a text description."""
-    if not IMAGE_GALLERY_EMBEDDINGS:
-        return {"error": "Gallery not initialized"}
-    query_emb = SEARCH_MODEL.encode(description)
-    hits = util.semantic_search(query_emb, IMAGE_GALLERY_EMBEDDINGS, top_k=3)
-    results = []
-    for hit in hits[0]:
-        results.append({
-            "image_path": IMAGE_PATHS[hit['corpus_id']],
-            "score": round(hit['score'], 4)
-        })
-    return {"results": results}

 import os
 import torch
 import random
+import asyncio
 from PIL import Image
 from fastapi import FastAPI, UploadFile, File, Query
 from fastapi.middleware.cors import CORSMiddleware
     BlipProcessor, BlipForConditionalGeneration,
     ViTImageProcessor, AutoProcessor, AutoModelForCausalLM
 )
 app = FastAPI()
 REPO_ID = "SaniaE/Image_Captioning_Ensemble"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {}
+# Removed GIT, kept BLIP and ViT
 MODEL_SETTINGS = {
     "blip": {
         "subfolder": "blip",
         "processor": [ViTImageProcessor, AutoProcessor],
         "pretrained_path": ["nlpconnect/vit-gpt2-image-captioning", "microsoft/git-large"],
         "inference_model": AutoModelForCausalLM
     }
 }
 @app.on_event("startup")
 async def startup_event():
     """Download the ensemble checkpoints and load every configured model.

     Reads the Hugging Face token from the ``HF_TOKEN`` environment variable
     (logging in only when it is set), snapshots ``REPO_ID`` into the local
     ``weights`` directory, and then stores one
     ``{"model": ..., "processor": ...}`` entry in ``MODELS`` for each key of
     ``MODEL_SETTINGS``.

     A settings entry whose ``"processor"`` is a list/tuple of classes is
     loaded as a tuple of processors, pairing each class with the entry at the
     same position in ``"pretrained_path"``; otherwise a single processor is
     loaded.
     """
     # MODELS is only mutated in place, so no ``global`` declaration is needed.
     token = os.getenv("HF_TOKEN")
     if token:
         login(token=token)
     print(f"Downloading models from {REPO_ID}...")
     local_dir = snapshot_download(repo_id=REPO_ID, token=token, local_dir="weights")
     for name, cfg in MODEL_SETTINGS.items():
         ckpt_path = os.path.join(local_dir, cfg["subfolder"])
         print(f"Loading {name} from {ckpt_path}...")
         # Model weights come from the downloaded checkpoint folder.
         model = cfg["inference_model"].from_pretrained(ckpt_path).to(DEVICE)
         # Processors come from the pretrained path(s) named in the settings.
         proc_cls = cfg["processor"]
         proc_src = cfg["pretrained_path"]
         if isinstance(proc_cls, (list, tuple)):
             # Paired config: decide by the shape of the entry rather than by
             # a hard-coded model name, so new paired entries work unchanged.
             processor = tuple(
                 cls.from_pretrained(src) for cls, src in zip(proc_cls, proc_src)
             )
         else:
             processor = proc_cls.from_pretrained(proc_src)
         MODELS[name] = {"model": model, "processor": processor}
     print("Optimization Complete: GIT and Search removed. Ensemble is live!")
+# --- Helper for Parallel Inference ---
 def _generate_sync(m_name, image, temp, top_k, top_p):
     """Generate one sampled caption for ``image`` with the model ``m_name``.

     Blocking helper; the caption endpoint in this file runs it through
     ``asyncio.to_thread``.

     Args:
         m_name: Key into ``MODELS`` selecting the model/processor entry.
         image: Image passed to the processor as ``images=``.
         temp: Sampling temperature forwarded to ``generate``.
         top_k: Top-k cutoff forwarded to ``generate``.
         top_p: Top-p value forwarded to ``generate``.

     Returns:
         The first decoded sequence with special tokens skipped and
         surrounding whitespace stripped.

     Raises:
         KeyError: If ``m_name`` is not present in ``MODELS``.
     """
     m_data = MODELS[m_name]
     processor = m_data["processor"]
     # An entry stored as a tuple holds (image processor, text processor);
     # a single processor handles both preprocessing and decoding.
     if isinstance(processor, tuple):
         image_proc, text_proc = processor
     else:
         image_proc = text_proc = processor
     inputs = image_proc(images=image, return_tensors="pt").to(DEVICE)
     gen_ids = m_data["model"].generate(
         **inputs, max_length=300, do_sample=True,
         temperature=temp, top_k=top_k, top_p=top_p,
     )
     return text_proc.batch_decode(gen_ids, skip_special_tokens=True)[0].strip()
     top_p: float = Query(0.9)
 ):
     image = Image.open(file.file).convert("RGB")
+    available = list(MODELS.keys()) # Only blip and vit
+    # Create 5 slots from the 2 remaining models
     model_selection = random.choices(available, k=5)
+    tasks = [asyncio.to_thread(_generate_sync, m, image, temp, top_k, top_p) for m in model_selection]
     captions = await asyncio.gather(*tasks)
     return {"captions": captions, "mix": model_selection}
 @app.post("/ui-tester")
 async def ui_tester(file: UploadFile = File(...), description: str = Query(...)):
     """Score how well ``description`` fits the uploaded image.

     The score is derived from the loss the ``"blip"`` model assigns to
     ``description`` as a caption of the image (lower loss gives a higher
     score); it is a pseudo-similarity, not a dedicated matching head.

     Args:
         file: Uploaded image; opened with PIL and converted to RGB.
         description: Text to compare against the image.

     Returns:
         A dict with ``"match_score"`` (score rounded to 4 decimals) and
         ``"status"`` (``"High match"`` when the score exceeds 0.4, otherwise
         ``"Low match"``).
     """
     image = Image.open(file.file).convert("RGB")
     # The forward pass is blocking; run it in a worker thread, as the caption
     # endpoint does, so the event loop is not stalled while the model runs.
     score = await asyncio.to_thread(_blip_match_score, image, description)
     return {
         "match_score": round(score, 4),
         "status": "High match" if score > 0.4 else "Low match"
     }


 def _blip_match_score(image, description):
     """Return ``1 / (1 + loss)`` for ``description`` scored against ``image`` by BLIP."""
     blip_data = MODELS["blip"]
     # The processor prepares both the image and the text for the model.
     inputs = blip_data["processor"](images=image, text=description, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
         # Using the text's own token ids as labels makes the model report a loss.
         outputs = blip_data["model"](**inputs, labels=inputs["input_ids"])
     # Lower loss maps to a higher score.
     return 1 / (1 + outputs.loss.item())