# app.py — CLIP image-text matcher demo (Gradio / Hugging Face Spaces).
# Source revision: 6a83e1d
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
# Load CLIP model and processor
# Done once at import time so the Gradio request handler can reuse them
# across calls. from_pretrained downloads the weights on first run and
# reads them from the local HF cache afterwards.
model_name = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)
def match_image_with_descriptions(image, descriptions):
    """Score how well each caption matches the image using CLIP.

    Args:
        image: PIL image uploaded by the user, or None if nothing was uploaded.
        descriptions: Newline-separated captions typed by the user; may be
            None or empty when the textbox is blank (Gradio can pass either).

    Returns:
        dict mapping each caption to its match score as a percentage
        (rounded to 2 decimals), sorted from best match to worst.
        On invalid input, a single-entry {"Error": message} dict instead.
    """
    # Guard both missing inputs; descriptions can be None, so check it
    # before calling .strip() (the original crashed on None here).
    if not image or not descriptions or not descriptions.strip():
        return {"Error": "Please upload an image and enter descriptions."}

    # One candidate caption per non-blank line.
    captions = [line.strip() for line in descriptions.splitlines() if line.strip()]
    if len(captions) < 2:
        return {"Error": "Please enter at least two descriptions."}

    # Tokenize text and preprocess the image into model tensors.
    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image has shape [1, num_captions]; softmax over captions
    # turns the similarity logits into a probability per caption.
    probs = outputs.logits_per_image.softmax(dim=1)[0]

    # Convert to percentages and sort from best match to worst.
    scores = {cap: round(p.item() * 100, 2) for cap, p in zip(captions, probs)}
    return dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
# Assemble the Gradio UI: one image upload, one multi-line caption box,
# and a JSON panel showing the sorted match scores.
image_input = gr.Image(type="pil", label="Upload an Image")
caption_input = gr.Textbox(lines=6, placeholder="Enter one description per line...", label="Descriptions")
score_output = gr.JSON(label="Match Scores (Sorted by Confidence %)")

iface = gr.Interface(
    fn=match_image_with_descriptions,
    inputs=[image_input, caption_input],
    outputs=score_output,
    title="🧠 CLIP Image-Text Matcher (Sorted, %)",
    description="Upload an image and enter multiple captions (one per line). The AI will compare each one and show match scores in % — sorted from best to worst.",
)

# Start the web server.
iface.launch()