"""Gradio demo: rank user-supplied captions against an uploaded image with CLIP."""

import gradio as gr
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Load the CLIP model and processor once at startup (downloads on first run).
MODEL_NAME = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model = CLIPModel.from_pretrained(MODEL_NAME)
model.eval()  # inference only — make dropout/batch-norm behavior explicit


def match_image_with_descriptions(image, descriptions):
    """Score how well each caption matches the image using CLIP.

    Args:
        image: PIL image from the Gradio upload widget, or None if nothing
            was uploaded.
        descriptions: newline-separated captions; may be None or empty when
            the textbox is cleared.

    Returns:
        dict mapping caption -> match confidence in percent, sorted from
        best to worst match, or a single-entry {"Error": ...} dict when the
        input is missing or has fewer than two captions.
    """
    # Gradio passes None for an empty upload; `not image` is unreliable for
    # PIL images, so test identity explicitly. `descriptions` may also be
    # None, which would crash the original `.strip()` call.
    if image is None or not (descriptions or "").strip():
        return {"Error": "Please upload an image and enter descriptions."}

    # One caption per non-blank line.
    captions = [line.strip() for line in descriptions.splitlines() if line.strip()]
    if len(captions) < 2:
        return {"Error": "Please enter at least two descriptions."}

    # Tokenize the captions and preprocess the image together.
    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image has shape [1, num_captions]; softmax over the caption
    # axis yields each caption's relative confidence.
    probs = outputs.logits_per_image.softmax(dim=1)[0]

    # Convert to percentages and sort best-to-worst.
    scores = {
        caption: round(prob.item() * 100, 2)
        for caption, prob in zip(captions, probs)
    }
    return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))


# Build the Gradio UI.
iface = gr.Interface(
    fn=match_image_with_descriptions,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(
            lines=6,
            placeholder="Enter one description per line...",
            label="Descriptions",
        ),
    ],
    outputs=gr.JSON(label="Match Scores (Sorted by Confidence %)"),
    title="🧠 CLIP Image-Text Matcher (Sorted, %)",
    description=(
        "Upload an image and enter multiple captions (one per line). The AI will "
        "compare each one and show match scores in % — sorted from best to worst."
    ),
)

# Launch the web app only when run as a script, so the module stays importable.
if __name__ == "__main__":
    iface.launch()