|
|
import gradio as gr |
|
|
from transformers import CLIPProcessor, CLIPModel |
|
|
from PIL import Image |
|
|
import torch |
|
|
|
|
|
|
|
|
# Pre-trained CLIP checkpoint to use for image-text similarity scoring.
model_name = "openai/clip-vit-base-patch16"


# Processor handles tokenizing the captions and resizing/normalizing the image
# into the tensor format the model expects.
processor = CLIPProcessor.from_pretrained(model_name)


# Loaded once at module import so each request reuses the same weights.
# NOTE(review): first call downloads the checkpoint from the Hugging Face hub —
# requires network access unless it is already cached locally.
model = CLIPModel.from_pretrained(model_name)
|
|
|
|
|
|
|
|
def match_image_with_descriptions(image, descriptions):
    """Score how well each caption matches the image using CLIP.

    Args:
        image: PIL image uploaded by the user (``None`` when nothing uploaded).
        descriptions: newline-separated captions, one per line. Gradio may
            pass ``None`` for an empty textbox, so that case is guarded too.

    Returns:
        dict mapping each caption to its softmax match probability in percent
        (rounded to 2 decimals), sorted from best to worst match, or a
        ``{"Error": ...}`` dict when the inputs are unusable.
    """
    # Fix: original called descriptions.strip() before checking for None,
    # which raised AttributeError when the textbox was left empty.
    # Also use `is None` rather than truthiness for the image check.
    if image is None or not descriptions or not descriptions.strip():
        return {"Error": "Please upload an image and enter descriptions."}

    # One caption per non-blank line.
    captions = [line.strip() for line in descriptions.splitlines() if line.strip()]

    # Softmax over a single caption is trivially 100%, so require two or more.
    if len(captions) < 2:
        return {"Error": "Please enter at least two descriptions."}

    # Tokenize all captions and preprocess the image in one batch;
    # padding=True aligns captions of different token lengths.
    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image has shape (1, num_captions); softmax turns the
    # similarity scores into a probability distribution over the captions.
    logits_per_image = outputs.logits_per_image
    probs = logits_per_image.softmax(dim=1)[0]

    result_dict = {
        caption: round(prob.item() * 100, 2)
        for caption, prob in zip(captions, probs)
    }

    # Highest-confidence caption first.
    return dict(sorted(result_dict.items(), key=lambda item: item[1], reverse=True))
|
|
|
|
|
|
|
|
# Gradio UI: one image upload plus a multi-line textbox of captions;
# the JSON output shows the percentage match score for each caption.
iface = gr.Interface(


    fn=match_image_with_descriptions,


    inputs=[


        # type="pil" hands the function a PIL.Image (or None if nothing uploaded).
        gr.Image(type="pil", label="Upload an Image"),


        gr.Textbox(lines=6, placeholder="Enter one description per line...", label="Descriptions")


    ],


    outputs=gr.JSON(label="Match Scores (Sorted by Confidence %)"),


    title="🧠 CLIP Image-Text Matcher (Sorted, %)",


    description="Upload an image and enter multiple captions (one per line). The AI will compare each one and show match scores in % — sorted from best to worst.",


)


# Starts the local web server (blocks until the process is stopped).
iface.launch()
|
|
|