Spaces:
Sleeping
Sleeping
| import torch | |
| from transformers import CLIPProcessor, CLIPModel | |
| from datasets import load_dataset | |
| from PIL import Image | |
| import faiss | |
| import gradio as gr | |
# -------------------
# CLIP model + processor
# -------------------
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One checkpoint name feeds both loaders so the preprocessing always matches
# the weights it is paired with.
model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)
model = model.to(device)
# -------------------
# Load Flickr8k subset
# -------------------
# Only the first 1000 training rows are loaded (a small subset).
dataset = load_dataset("Naveengo/flickr8k", split="train[:1000]")

# Read each column directly rather than looping over rows: iterating rows
# materialises every field of every row, so two per-row comprehensions would
# decode all images twice (once merely to reach the caption text).
# list(...) keeps both names plain Python lists regardless of what container
# type the installed `datasets` version returns for a column.
captions = list(dataset["text"])
images = list(dataset["image"])
# -------------------
# Encode captions (for Image→Text)
# -------------------
# Captions are embedded in mini-batches, L2-normalised, and stored in an
# inner-product FAISS index; on unit vectors the inner product equals the
# cosine similarity.
batch_size = 32
text_embeds_list = []
with torch.no_grad():  # inference only, no autograd graph needed
    for i in range(0, len(captions), batch_size):
        batch = captions[i:i + batch_size]
        # truncation=True clips over-long captions to the tokenizer's maximum
        # length; without it a caption longer than the text encoder's fixed
        # context would make the forward pass fail.
        txt_inputs = processor(
            text=batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        txt_emb = model.get_text_features(**txt_inputs)
        txt_emb = txt_emb / txt_emb.norm(p=2, dim=-1, keepdim=True)
        # Move each batch to the CPU right away so device memory holds only
        # one batch of embeddings at a time.
        text_embeds_list.append(txt_emb.cpu())
text_embeds = torch.cat(text_embeds_list, dim=0).numpy()
index_text = faiss.IndexFlatIP(text_embeds.shape[1])
index_text.add(text_embeds)
# -------------------
# Encode images (for Text→Image)
# -------------------
# Same recipe as for the captions: embed in mini-batches, scale every vector
# to unit length, then store everything in an inner-product FAISS index.
img_embeds_list = []
for start in range(0, len(images), batch_size):
    chunk = images[start:start + batch_size]
    with torch.no_grad():
        pixel_inputs = processor(images=chunk, return_tensors="pt").to(device)
        features = model.get_image_features(**pixel_inputs)
        unit_features = features / features.norm(p=2, dim=-1, keepdim=True)
    img_embeds_list.append(unit_features.cpu())

stacked_img_embeds = torch.cat(img_embeds_list, dim=0)
img_embeds = stacked_img_embeds.numpy()
index_img = faiss.IndexFlatIP(img_embeds.shape[1])
index_img.add(img_embeds)
| # ------------------- | |
| # Retrieval functions | |
| # ------------------- | |
def retrieve_captions(image, top_k=5):
    """Return the dataset captions most similar to ``image``.

    The image is embedded with CLIP, L2-normalised, and searched against the
    caption index ``index_text``.

    Args:
        image: The query image as passed to the CLIP processor (the UI
            supplies a PIL image). ``None`` means nothing was uploaded.
        top_k: Number of nearest captions to request from the index.

    Returns:
        dict mapping caption text to its similarity score as a ``float``.
        May hold fewer than ``top_k`` entries when the hits contain duplicate
        caption strings or the index has fewer than ``top_k`` vectors.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    if image is None:
        # Gradio passes None when the button is clicked before an upload;
        # raise a readable message instead of an opaque processor failure.
        raise gr.Error("Please upload an image first.")

    img_inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        img_emb = model.get_image_features(**img_inputs)
        img_emb = img_emb / img_emb.norm(p=2, dim=-1, keepdim=True)
    img_emb = img_emb.cpu().numpy()

    sims, idxs = index_text.search(img_emb, top_k)

    results = {}
    for i, s in zip(idxs[0], sims[0]):
        if i < 0:
            # FAISS pads with -1 when it has fewer than top_k hits; using it
            # as a list index would silently return the last caption.
            continue
        # Hits arrive best-first, so setdefault keeps the highest score when
        # the same caption text occurs more than once.
        results.setdefault(captions[int(i)], float(s))
    return results
def retrieve_images(text, top_k=3):
    """Return labels for the dataset images most similar to ``text``.

    The query is embedded with CLIP's text encoder, L2-normalised, and
    searched against the image index ``index_img``.

    Args:
        text: Free-form query string.
        top_k: Number of nearest images to request from the index.

    Returns:
        dict mapping a label of the form
        ``"Image <idx> (caption: <first 40 chars>...)"`` to the similarity
        score as a ``float``.

    Raises:
        gr.Error: If ``text`` is empty or only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter a caption to search for.")

    # The query is arbitrary user input, so clip it to the tokenizer's maximum
    # length; an over-long query would otherwise overflow the text encoder's
    # fixed context and fail in the forward pass.
    txt_inputs = processor(
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        txt_emb = model.get_text_features(**txt_inputs)
        txt_emb = txt_emb / txt_emb.norm(p=2, dim=-1, keepdim=True)
    txt_emb = txt_emb.cpu().numpy()

    sims, idxs = index_img.search(txt_emb, top_k)

    results = {}
    for i, s in zip(idxs[0], sims[0]):
        if i < 0:
            # FAISS pads with -1 when it has fewer than top_k hits; skip the
            # padding rather than indexing captions[-1].
            continue
        results[f"Image {i} (caption: {captions[int(i)][:40]}...)"] = float(s)
    return results
# -------------------
# Gradio UI (two tabs)
# -------------------
with gr.Blocks() as demo:
    gr.Markdown(
        "## 🔀 CLIP Multimodal Retrieval (FAISS + Flickr8k)\n"
        "Upload an image or enter text to retrieve matches from the dataset."
    )

    # Tab 1: image query -> captions with similarity scores.
    with gr.Tab("Image → Captions"):
        img_input = gr.Image(type="pil")
        img_output = gr.Label(num_top_classes=5)
        gr.Button("Find Captions").click(
            fn=retrieve_captions,
            inputs=img_input,
            outputs=img_output,
        )

    # Tab 2: text query -> image labels with similarity scores.
    with gr.Tab("Text → Images"):
        txt_input = gr.Textbox(label="Enter a caption")
        txt_output = gr.Label(num_top_classes=3)
        gr.Button("Find Images").click(
            fn=retrieve_images,
            inputs=txt_input,
            outputs=txt_output,
        )

# Only start the web server when this file is run as a script.
if __name__ == "__main__":
    demo.launch()