import torch
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
from PIL import Image
import faiss
import gradio as gr

# -------------------
# Load CLIP model
# -------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)

# -------------------
# Load Flickr8k subset
# -------------------
dataset = load_dataset("Naveengo/flickr8k", split="train[:1000]")  # small subset
captions = [item["text"] for item in dataset]
images = [item["image"] for item in dataset]

# -------------------
# Encode captions (for Image→Text)
# -------------------
batch_size = 32
text_embeds_list = []
for i in range(0, len(captions), batch_size):
    batch = captions[i:i + batch_size]
    with torch.no_grad():
        # truncation=True keeps captions within CLIP's 77-token limit
        txt_inputs = processor(text=batch, return_tensors="pt",
                               padding=True, truncation=True).to(device)
        txt_emb = model.get_text_features(**txt_inputs)
        # L2-normalize so inner-product search equals cosine similarity
        txt_emb = txt_emb / txt_emb.norm(p=2, dim=-1, keepdim=True)
    text_embeds_list.append(txt_emb.cpu())
text_embeds = torch.cat(text_embeds_list, dim=0).numpy()
index_text = faiss.IndexFlatIP(text_embeds.shape[1])  # exact inner-product index
index_text.add(text_embeds)

# -------------------
# Encode images (for Text→Image)
# -------------------
img_embeds_list = []
for i in range(0, len(images), batch_size):
    batch = images[i:i + batch_size]
    with torch.no_grad():
        img_inputs = processor(images=batch, return_tensors="pt").to(device)
        img_emb = model.get_image_features(**img_inputs)
        img_emb = img_emb / img_emb.norm(p=2, dim=-1, keepdim=True)
    img_embeds_list.append(img_emb.cpu())
img_embeds = torch.cat(img_embeds_list, dim=0).numpy()
index_img = faiss.IndexFlatIP(img_embeds.shape[1])
index_img.add(img_embeds)

# -------------------
# Retrieval functions
# -------------------
def retrieve_captions(image, top_k=5):
    # Guard against the button being clicked with no image uploaded
    if image is None:
        return {}
    img_inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        img_emb = model.get_image_features(**img_inputs)
        img_emb = img_emb / img_emb.norm(p=2, dim=-1, keepdim=True)
    img_emb = img_emb.cpu().numpy()
    sims, idxs = index_text.search(img_emb, top_k)
    results = {captions[i]: float(s) for i, s in zip(idxs[0], sims[0])}
    return results

def retrieve_images(text, top_k=3):
    # Guard against an empty query
    if not text:
        return {}
    txt_inputs = processor(text=[text], return_tensors="pt",
                           padding=True, truncation=True).to(device)
    with torch.no_grad():
        txt_emb = model.get_text_features(**txt_inputs)
        txt_emb = txt_emb / txt_emb.norm(p=2, dim=-1, keepdim=True)
    txt_emb = txt_emb.cpu().numpy()
    sims, idxs = index_img.search(txt_emb, top_k)
    results = {f"Image {i} (caption: {captions[i][:40]}...)": float(s)
               for i, s in zip(idxs[0], sims[0])}
    return results

# -------------------
# Gradio UI (two tabs)
# -------------------
with gr.Blocks() as demo:
    gr.Markdown(
        "## 🔀 CLIP Multimodal Retrieval (FAISS + Flickr8k)\n"
        "Upload an image or enter text to retrieve matches from the dataset."
    )
    with gr.Tab("Image → Captions"):
        img_input = gr.Image(type="pil")
        img_output = gr.Label(num_top_classes=5)
        btn1 = gr.Button("Find Captions")
        btn1.click(fn=retrieve_captions, inputs=img_input, outputs=img_output)
    with gr.Tab("Text → Images"):
        txt_input = gr.Textbox(label="Enter a caption")
        txt_output = gr.Label(num_top_classes=3)
        btn2 = gr.Button("Find Images")
        btn2.click(fn=retrieve_images, inputs=txt_input, outputs=txt_output)

if __name__ == "__main__":
    demo.launch()