Update app.py
app.py CHANGED
@@ -3,19 +3,21 @@ import torch
 import clip
 from PIL import Image
 import faiss
+import requests
 from datasets import load_dataset
+from io import BytesIO
 
-# Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load
-model = torch.load("best_model.pt", map_location=device)
-model.eval()
-
-# Load base CLIP model for encoding
+# Load base CLIP model and preprocess
 model_clip, preprocess = clip.load("ViT-B/32", device=device)
 
-# Load
+# Load fine-tuned weights (state_dict) and apply to CLIP model
+state_dict = torch.load("best_model.pt", map_location=device)
+model_clip.load_state_dict(state_dict)
+model_clip.eval()
+
+# Load Flickr30k test split dataset
 dataset = load_dataset("nlphuji/flickr30k", split="test")
 
 captions = []
@@ -23,38 +25,42 @@ images = []
 image_embeddings = []
 text_embeddings = []
 
-
+print("Preparing embeddings for retrieval pool...")
+
 for example in dataset:
     try:
+        # Load image from URL
         img = Image.open(requests.get(example["image"], stream=True).raw).convert("RGB")
         images.append(img)
         captions.append(example["sentence"])
 
+        # Preprocess and encode image
         img_tensor = preprocess(img).unsqueeze(0).to(device)
         with torch.no_grad():
             img_feat = model_clip.encode_image(img_tensor)
             img_feat /= img_feat.norm(dim=-1, keepdim=True)
             image_embeddings.append(img_feat.cpu())
 
+            # Tokenize and encode text
             txt_token = clip.tokenize([example["sentence"]]).to(device)
             txt_feat = model_clip.encode_text(txt_token)
             txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
             text_embeddings.append(txt_feat.cpu())
-    except:
+    except Exception as e:
+        print(f"Skipping one example due to error: {e}")
         continue
 
-# Convert lists to tensors
+# Convert lists of embeddings to tensors
 image_embeddings = torch.cat(image_embeddings, dim=0)
 text_embeddings = torch.cat(text_embeddings, dim=0)
 
-#
+# Create FAISS indices for fast similarity search (Inner Product = cosine similarity)
 image_index = faiss.IndexFlatIP(image_embeddings.shape[1])
 image_index.add(image_embeddings.numpy())
 
 text_index = faiss.IndexFlatIP(text_embeddings.shape[1])
 text_index.add(text_embeddings.numpy())
 
-# Image-to-Text search
 def image_to_text(image):
     image_input = preprocess(image).unsqueeze(0).to(device)
     with torch.no_grad():
@@ -64,7 +70,6 @@ def image_to_text(image):
     score = round(float(D[0][0]) * 100, 2)
     return f"{captions[I[0][0]]}\n(Match Score: {score}%)"
 
-# Text-to-Image search
 def text_to_image(text):
     text_input = clip.tokenize([text]).to(device)
     with torch.no_grad():
@@ -75,10 +80,9 @@ def text_to_image(text):
     img = images[I[0][0]]
     return img, f"Match Score: {score}%"
 
-# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## 🔄 Cross-Modal Retriever on Flickr30k (Image ↔ Text Matching)")
-
+
     with gr.Tab("🖼️ Image to Text"):
        img_input = gr.Image(type="pil", label="Upload Image")
        text_output = gr.Textbox(label="Most Similar Caption")