Create app.py

app.py (ADDED)

import gradio as gr
import torch
import clip
import faiss
from datasets import load_dataset

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the fine-tuned checkpoint from the repository root (assumes best_model.pt
# is a full serialized model rather than a bare state_dict)
model = torch.load("best_model.pt", map_location=device, weights_only=False)
model.eval()

# Base CLIP model and preprocessing; retrieval below encodes with these base
# encoders (swap in `model` here if the fine-tuned checkpoint is a full CLIP model)
model_clip, preprocess = clip.load("ViT-B/32", device=device)

# Full test split of Flickr30k
dataset = load_dataset("nlphuji/flickr30k", split="test")
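# NOTE: the split holds roughly 31k examples, so the indexing pass below is
# slow on CPU hardware; for a quicker demo you could index a subset, e.g.
#   dataset = dataset.select(range(1000))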

captions = []
images = []
image_embeddings = []
text_embeddings = []

for example in dataset:
    try:
        # nlphuji/flickr30k stores each image inline and a list of reference
        # captions under "caption"; keep the first caption per image
        img = example["image"].convert("RGB")
        images.append(img)
        caption = example["caption"][0]
        captions.append(caption)

        img_tensor = preprocess(img).unsqueeze(0).to(device)
        with torch.no_grad():
            img_feat = model_clip.encode_image(img_tensor)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)
            image_embeddings.append(img_feat.cpu())

            txt_token = clip.tokenize([caption]).to(device)
            txt_feat = model_clip.encode_text(txt_token)
            txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
            text_embeddings.append(txt_feat.cpu())
    except Exception:
        # skip examples that fail to decode or encode
        continue

image_embeddings = torch.cat(image_embeddings, dim=0)
text_embeddings = torch.cat(text_embeddings, dim=0)
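# Possible speed-up (sketch, with a hypothetical `chunk` of examples): the CLIP
# encoders accept batches, so stacking, say, 64 preprocessed images per call
# avoids per-example overhead:
#   batch = torch.stack([preprocess(im) for im in chunk]).to(device)
#   feats = model_clip.encode_image(batch)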

# Build FAISS indexes: embeddings are unit-normalized, so inner product
# (IndexFlatIP) ranks by cosine similarity; FAISS expects float32 input
image_index = faiss.IndexFlatIP(image_embeddings.shape[1])
image_index.add(image_embeddings.float().numpy())

text_index = faiss.IndexFlatIP(text_embeddings.shape[1])
text_index.add(text_embeddings.float().numpy())
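# Quick sanity check (assumes at least one example was indexed): the nearest
# stored caption to its own embedding should be itself with score close to 1.0
_score, _idx = text_index.search(text_embeddings[:1].float().numpy(), 1)
print(f"self-retrieval check: idx={_idx[0][0]}, score={_score[0][0]:.3f}")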

# Retrieval functions
def image_to_text(image):
    """Return the best-matching caption for an uploaded image."""
    image_input = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        image_feature = model_clip.encode_image(image_input)
        image_feature /= image_feature.norm(dim=-1, keepdim=True)
    D, I = text_index.search(image_feature.cpu().float().numpy(), 1)
    score = round(float(D[0][0]) * 100, 2)
    return f"{captions[I[0][0]]}\n(Match Score: {score}%)"

def text_to_image(text):
    """Return the best-matching image and its score for a text query."""
    text_input = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_feature = model_clip.encode_text(text_input)
        text_feature /= text_feature.norm(dim=-1, keepdim=True)
    D, I = image_index.search(text_feature.cpu().float().numpy(), 1)
    score = round(float(D[0][0]) * 100, 2)
    img = images[I[0][0]]
    return img, f"Match Score: {score}%"
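# Example usage outside the UI (hypothetical query string):
#   print(image_to_text(images[0]))                   # caption for the first indexed image
#   img, s = text_to_image("a dog running on grass")  # nearest image plus its score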

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🖼️📝 Cross-Modal Retriever on Flickr30k Test Split")
    with gr.Tab("Image to Text"):
        img_input = gr.Image(type="pil")
        text_output = gr.Textbox(label="Retrieved Caption")
        btn1 = gr.Button("Search Caption")
        btn1.click(image_to_text, inputs=img_input, outputs=text_output)

    with gr.Tab("Text to Image"):
        text_input = gr.Textbox(label="Enter Text Prompt")
        img_output = gr.Image(label="Most Similar Image")
        score_output = gr.Textbox(label="Similarity Score")
        btn2 = gr.Button("Search Image")
        btn2.click(text_to_image, inputs=text_input, outputs=[img_output, score_output])

demo.launch()
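
For reference, a requirements.txt sketch for this Space (the commit only adds app.py, so the exact dependency list is an assumption; the clip package comes from OpenAI's repository, and torchvision is needed by its preprocessing pipeline):

gradio
torch
torchvision
faiss-cpu
datasets
git+https://github.com/openai/CLIP.git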