import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
# Load CLIP model and processor once at import time so every request reuses
# them (downloads the pretrained weights from the HuggingFace Hub on first run).
model_name = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(model_name)  # tokenizes text and preprocesses images
model = CLIPModel.from_pretrained(model_name)  # joint image/text embedding model
# Define matching function
def match_image_with_descriptions(image, descriptions):
    """Score how well each candidate caption matches the uploaded image.

    Args:
        image: PIL.Image.Image or None — the image uploaded via Gradio.
        descriptions: str or None — newline-separated candidate captions.

    Returns:
        dict mapping each caption to its CLIP match confidence as a
        percentage (rounded to 2 decimals), ordered best match first.
        On invalid input, returns ``{"Error": <message>}`` instead.
    """
    # Check for None *before* calling .strip(): Gradio passes None for a
    # cleared textbox, and the original `descriptions.strip()` would raise
    # AttributeError. Also use an explicit None check for the image rather
    # than truth-testing a PIL Image object.
    if image is None or not descriptions or not descriptions.strip():
        return {"Error": "Please upload an image and enter descriptions."}
    # One caption per non-blank line; splitlines() also handles \r\n input.
    captions = [line.strip() for line in descriptions.splitlines() if line.strip()]
    if len(captions) < 2:
        return {"Error": "Please enter at least two descriptions."}
    # Tokenize the captions and preprocess the image in one batch.
    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)
    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # shape: [1, num_captions]
    probs = logits_per_image.softmax(dim=1)[0]   # shape: [num_captions]
    # Convert each probability to a percentage, paired with its caption.
    result_dict = {
        caption: round(prob.item() * 100, 2)
        for caption, prob in zip(captions, probs)
    }
    # Sort from best match to worst.
    return dict(sorted(result_dict.items(), key=lambda item: item[1], reverse=True))
# ---- Gradio UI ----
# Build the two input widgets up front so the Interface call stays readable.
image_input = gr.Image(type="pil", label="Upload an Image")
caption_input = gr.Textbox(lines=6, placeholder="Enter one description per line...", label="Descriptions")

# Wire the matcher function to the UI: image + multiline captions in,
# sorted JSON score table out.
iface = gr.Interface(
    fn=match_image_with_descriptions,
    inputs=[image_input, caption_input],
    outputs=gr.JSON(label="Match Scores (Sorted by Confidence %)"),
    title="🧠 CLIP Image-Text Matcher (Sorted, %)",
    description="Upload an image and enter multiple captions (one per line). The AI will compare each one and show match scores in % — sorted from best to worst.",
)

# Start the local web server.
iface.launch()