tooba248 committed on
Commit
e403cae
·
verified Β·
1 Parent(s): 444f65d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -62
app.py CHANGED
@@ -1,99 +1,96 @@
1
  import gradio as gr
2
  import torch
3
  import clip
 
4
  from PIL import Image
5
  import faiss
6
  import requests
7
- from datasets import load_dataset
8
  from io import BytesIO
9
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
- # Load base CLIP model and preprocess
13
  model_clip, preprocess = clip.load("ViT-B/32", device=device)
14
 
15
- # Load fine-tuned weights (state_dict) and apply to CLIP model
16
  state_dict = torch.load("best_model.pt", map_location=device)
17
- model_clip.load_state_dict(state_dict)
 
18
  model_clip.eval()
19
 
20
- # Load Flickr30k test split dataset
21
  dataset = load_dataset("nlphuji/flickr30k", split="test")
22
 
23
- captions = []
24
- images = []
25
- image_embeddings = []
26
- text_embeddings = []
27
-
28
- print("Preparing embeddings for retrieval pool...")
29
 
 
30
  for example in dataset:
31
  try:
32
- # Load image from URL
33
  img = Image.open(requests.get(example["image"], stream=True).raw).convert("RGB")
34
  images.append(img)
35
  captions.append(example["sentence"])
36
 
37
- # Preprocess and encode image
38
- img_tensor = preprocess(img).unsqueeze(0).to(device)
39
  with torch.no_grad():
40
- img_feat = model_clip.encode_image(img_tensor)
41
- img_feat /= img_feat.norm(dim=-1, keepdim=True)
42
- image_embeddings.append(img_feat.cpu())
43
 
44
- # Tokenize and encode text
45
- txt_token = clip.tokenize([example["sentence"]]).to(device)
46
- txt_feat = model_clip.encode_text(txt_token)
47
- txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
48
- text_embeddings.append(txt_feat.cpu())
49
- except Exception as e:
50
- print(f"Skipping one example due to error: {e}")
51
  continue
52
 
53
- # Convert lists of embeddings to tensors
54
- image_embeddings = torch.cat(image_embeddings, dim=0)
55
- text_embeddings = torch.cat(text_embeddings, dim=0)
56
 
57
- # Create FAISS indices for fast similarity search (Inner Product = cosine similarity)
58
- image_index = faiss.IndexFlatIP(image_embeddings.shape[1])
59
- image_index.add(image_embeddings.numpy())
60
 
61
- text_index = faiss.IndexFlatIP(text_embeddings.shape[1])
62
- text_index.add(text_embeddings.numpy())
63
 
64
- def image_to_text(image):
65
- image_input = preprocess(image).unsqueeze(0).to(device)
 
66
  with torch.no_grad():
67
- image_feature = model_clip.encode_image(image_input)
68
- image_feature /= image_feature.norm(dim=-1, keepdim=True)
69
- D, I = text_index.search(image_feature.cpu().numpy(), 1)
70
- score = round(float(D[0][0]) * 100, 2)
71
- return f"{captions[I[0][0]]}\n(Match Score: {score}%)"
72
-
73
- def text_to_image(text):
74
- text_input = clip.tokenize([text]).to(device)
75
  with torch.no_grad():
76
- text_feature = model_clip.encode_text(text_input)
77
- text_feature /= text_feature.norm(dim=-1, keepdim=True)
78
- D, I = image_index.search(text_feature.cpu().numpy(), 1)
79
- score = round(float(D[0][0]) * 100, 2)
80
- img = images[I[0][0]]
81
- return img, f"Match Score: {score}%"
82
 
 
83
  with gr.Blocks() as demo:
84
- gr.Markdown("## πŸ”„ Cross-Modal Retriever on Flickr30k (Image ↔ Text Matching)")
85
-
86
- with gr.Tab("πŸ–ΌοΈ Image to Text"):
87
- img_input = gr.Image(type="pil", label="Upload Image")
88
- text_output = gr.Textbox(label="Most Similar Caption")
89
- btn1 = gr.Button("Find Caption")
90
- btn1.click(image_to_text, inputs=img_input, outputs=text_output)
91
-
92
- with gr.Tab("πŸ“ Text to Image"):
93
- text_input = gr.Textbox(label="Enter a Caption")
94
- img_output = gr.Image(label="Most Similar Image")
95
- score_output = gr.Textbox(label="Similarity Score")
96
- btn2 = gr.Button("Find Image")
97
- btn2.click(text_to_image, inputs=text_input, outputs=[img_output, score_output])
98
 
99
  demo.launch()
 
1
  import gradio as gr
2
  import torch
3
  import clip
4
+ from datasets import load_dataset
5
  from PIL import Image
6
  import faiss
7
  import requests
 
8
  from io import BytesIO
9
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
+ # 1) Load base CLIP model + preprocess
13
  model_clip, preprocess = clip.load("ViT-B/32", device=device)
14
 
15
+ # 2) Load your fine‐tuned weights (state_dict) into model_clip
16
  state_dict = torch.load("best_model.pt", map_location=device)
17
+ missing, unexpected = model_clip.load_state_dict(state_dict, strict=False)
18
+ print(f"⚠️ Missing keys: {missing}\n⚠️ Unexpected keys: {unexpected}")
19
  model_clip.eval()
20
 
21
+ # 3) Build retrieval pool from Flickr30k test split
22
  dataset = load_dataset("nlphuji/flickr30k", split="test")
23
 
24
+ images, captions = [], []
25
+ img_embs, txt_embs = [], []
 
 
 
 
26
 
27
+ print("πŸ”„ Preparing retrieval pool embeddings...")
28
  for example in dataset:
29
  try:
30
+ # load & store raw image + caption
31
  img = Image.open(requests.get(example["image"], stream=True).raw).convert("RGB")
32
  images.append(img)
33
  captions.append(example["sentence"])
34
 
35
+ # encode image
36
+ img_t = preprocess(img).unsqueeze(0).to(device)
37
  with torch.no_grad():
38
+ v = model_clip.encode_image(img_t)
39
+ v /= v.norm(dim=-1, keepdim=True)
40
+ img_embs.append(v.cpu())
41
 
42
+ # encode text
43
+ t = clip.tokenize([example["sentence"]]).to(device)
44
+ with torch.no_grad():
45
+ tfeat = model_clip.encode_text(t)
46
+ tfeat /= tfeat.norm(dim=-1, keepdim=True)
47
+ txt_embs.append(tfeat.cpu())
48
+ except:
49
  continue
50
 
51
+ # cat into tensors
52
+ img_embs = torch.cat(img_embs, dim=0)
53
+ txt_embs = torch.cat(txt_embs, dim=0)
54
 
55
+ # build FAISS indices (Inner‐Product = cosine)
56
+ img_index = faiss.IndexFlatIP(img_embs.shape[1])
57
+ img_index.add(img_embs.numpy())
58
 
59
+ txt_index = faiss.IndexFlatIP(txt_embs.shape[1])
60
+ txt_index.add(txt_embs.numpy())
61
 
62
+ # 4) Gradio callbacks
63
+ def image_to_text(inp_img):
64
+ im = preprocess(inp_img).unsqueeze(0).to(device)
65
  with torch.no_grad():
66
+ v = model_clip.encode_image(im)
67
+ v /= v.norm(dim=-1, keepdim=True)
68
+ D, I = txt_index.search(v.cpu().numpy(), 1)
69
+ score = D[0][0] * 100
70
+ return f"{captions[I[0][0]]}\n(Match Score: {score:.2f}%)"
71
+
72
+ def text_to_image(inp_txt):
73
+ tok = clip.tokenize([inp_txt]).to(device)
74
  with torch.no_grad():
75
+ t = model_clip.encode_text(tok)
76
+ t /= t.norm(dim=-1, keepdim=True)
77
+ D, I = img_index.search(t.cpu().numpy(), 1)
78
+ score = D[0][0] * 100
79
+ return images[I[0][0]], f"Match Score: {score:.2f}%"
 
80
 
81
+ # 5) Gradio UI
82
  with gr.Blocks() as demo:
83
+ gr.Markdown("## πŸ”„ Cross-Modal Retriever (Flickr30k Test Split)\nUpload an image or enter text to retrieve the best match.")
84
+
85
+ with gr.Tab("πŸ–ΌοΈ Image β†’ Text"):
86
+ img_in = gr.Image(type="pil", label="Upload Image")
87
+ txt_out = gr.Textbox(label="Retrieved Caption")
88
+ gr.Button("Search Caption").click(image_to_text, img_in, txt_out)
89
+
90
+ with gr.Tab("πŸ“ Text β†’ Image"):
91
+ txt_in = gr.Textbox(label="Enter Text")
92
+ img_out = gr.Image(label="Retrieved Image")
93
+ score_out = gr.Textbox(label="Score")
94
+ gr.Button("Search Image").click(text_to_image, txt_in, [img_out, score_out])
 
 
95
 
96
  demo.launch()