import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
from PIL import Image
import faiss
import gradio as gr
# -------------------
# Load CLIP model
# -------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
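# Runs on GPU when available; on CPU, encoding the 1,000-item subset at
# startup may take a few minutes.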
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)
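# CLIPProcessor bundles the text tokenizer and the image preprocessor
# (resize, center-crop, normalize), so the same object serves both modalities.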
# -------------------
# Load Flickr8k subset
# -------------------
dataset = load_dataset("Naveengo/flickr8k", split="train[:1000]") # small subset
captions = [item["text"] for item in dataset]
images = [item["image"] for item in dataset]
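# Note: this materializes all 1,000 PIL images in memory up front, which keeps
# retrieval simple; a larger subset would likely call for lazy decoding instead.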
# -------------------
# Encode captions (for Image→Text)
# -------------------
batch_size = 32
text_embeds_list = []
for i in range(0, len(captions), batch_size):
    batch = captions[i:i + batch_size]
    with torch.no_grad():
        # truncation=True guards against captions longer than CLIP's 77-token limit
        txt_inputs = processor(text=batch, return_tensors="pt", padding=True, truncation=True).to(device)
        txt_emb = model.get_text_features(**txt_inputs)
        txt_emb = txt_emb / txt_emb.norm(p=2, dim=-1, keepdim=True)  # L2-normalize
    text_embeds_list.append(txt_emb.cpu())
text_embeds = torch.cat(text_embeds_list, dim=0).numpy()
index_text = faiss.IndexFlatIP(text_embeds.shape[1])
index_text.add(text_embeds)
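# IndexFlatIP does exact inner-product search; on L2-normalized vectors the
# inner product equals cosine similarity. FAISS expects float32, which the
# fp32 CLIP weights already produce; cast with .astype("float32") if you
# ever switch to half precision.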
# -------------------
# Encode images (for Text→Image)
# -------------------
img_embeds_list = []
for i in range(0, len(images), batch_size):
    batch = images[i:i + batch_size]
    with torch.no_grad():
        img_inputs = processor(images=batch, return_tensors="pt").to(device)
        img_emb = model.get_image_features(**img_inputs)
        img_emb = img_emb / img_emb.norm(p=2, dim=-1, keepdim=True)
    img_embeds_list.append(img_emb.cpu())
img_embeds = torch.cat(img_embeds_list, dim=0).numpy()
index_img = faiss.IndexFlatIP(img_embeds.shape[1])
index_img.add(img_embeds)
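# Optional (untested sketch): persist the indexes so restarts skip re-encoding.
# faiss.write_index(index_text, "text.index")
# faiss.write_index(index_img, "img.index")
# ...later: index_text = faiss.read_index("text.index")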
# -------------------
# Retrieval functions
# -------------------
def retrieve_captions(image, top_k=5):
    """Embed an uploaded image and return the top_k nearest captions with cosine scores."""
    img_inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        img_emb = model.get_image_features(**img_inputs)
        img_emb = img_emb / img_emb.norm(p=2, dim=-1, keepdim=True)
    img_emb = img_emb.cpu().numpy()
    sims, idxs = index_text.search(img_emb, top_k)
    results = {captions[i]: float(s) for i, s in zip(idxs[0], sims[0])}
    return results
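# Quick sanity check (hypothetical local test, not wired into the UI):
# print(retrieve_captions(images[0], top_k=3))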
def retrieve_images(text, top_k=3):
    """Embed a query string and return the top_k nearest images, labeled by their captions."""
    txt_inputs = processor(text=[text], return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        txt_emb = model.get_text_features(**txt_inputs)
        txt_emb = txt_emb / txt_emb.norm(p=2, dim=-1, keepdim=True)
    txt_emb = txt_emb.cpu().numpy()
    sims, idxs = index_img.search(txt_emb, top_k)
    results = {f"Image {i} (caption: {captions[i][:40]}...)": float(s) for i, s in zip(idxs[0], sims[0])}
    return results
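# gr.Label renders a {label: score} dict as a ranked list, so this tab shows
# the matched images' caption snippets rather than the images themselves.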
# -------------------
# Gradio UI (two tabs)
# -------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🔀 CLIP Multimodal Retrieval (FAISS + Flickr8k)\nUpload an image or enter text to retrieve matches from the dataset.")
    with gr.Tab("Image → Captions"):
        img_input = gr.Image(type="pil")
        img_output = gr.Label(num_top_classes=5)
        btn1 = gr.Button("Find Captions")
        btn1.click(fn=retrieve_captions, inputs=img_input, outputs=img_output)
    with gr.Tab("Text → Images"):
        txt_input = gr.Textbox(label="Enter a caption")
        txt_output = gr.Label(num_top_classes=3)
        btn2 = gr.Button("Find Images")
        btn2.click(fn=retrieve_images, inputs=txt_input, outputs=txt_output)
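# On Hugging Face Spaces, launch() needs no arguments; for a local run,
# demo.launch(share=True) would expose a temporary public URL.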
if __name__ == "__main__":
    demo.launch()