Update app.py
Browse files
app.py
CHANGED
|
@@ -1,58 +1,57 @@
|
|
| 1 |
-
# 1. Install Required Libraries (run once in terminal or notebook)
|
| 2 |
-
# pip install gradio transformers torch torchvision pillow
|
| 3 |
-
|
| 4 |
-
# 2. Import Libraries
|
| 5 |
import gradio as gr
|
| 6 |
from transformers import CLIPProcessor, CLIPModel
|
| 7 |
from PIL import Image
|
| 8 |
import torch
|
| 9 |
|
| 10 |
-
#
|
| 11 |
model_name = "openai/clip-vit-base-patch16"
|
| 12 |
processor = CLIPProcessor.from_pretrained(model_name)
|
| 13 |
model = CLIPModel.from_pretrained(model_name)
|
| 14 |
|
| 15 |
-
#
|
| 16 |
def match_image_with_descriptions(image, descriptions):
|
| 17 |
if not image or not descriptions.strip():
|
| 18 |
-
return "Please upload an image and enter
|
| 19 |
|
| 20 |
-
#
|
| 21 |
captions = [line.strip() for line in descriptions.strip().split('\n') if line.strip()]
|
| 22 |
|
| 23 |
if len(captions) < 2:
|
| 24 |
-
return "Please enter at least two descriptions
|
| 25 |
|
| 26 |
-
#
|
| 27 |
inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)
|
| 28 |
-
|
| 29 |
-
# Run
|
| 30 |
with torch.no_grad():
|
| 31 |
outputs = model(**inputs)
|
| 32 |
|
| 33 |
-
# Get
|
| 34 |
-
logits_per_image = outputs.logits_per_image #
|
| 35 |
probs = logits_per_image.softmax(dim=1)[0] # shape: [num_captions]
|
| 36 |
|
| 37 |
-
#
|
| 38 |
-
result_dict = {
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
|
| 43 |
-
return
|
| 44 |
|
| 45 |
-
#
|
| 46 |
iface = gr.Interface(
|
| 47 |
fn=match_image_with_descriptions,
|
| 48 |
inputs=[
|
| 49 |
gr.Image(type="pil", label="Upload an Image"),
|
| 50 |
gr.Textbox(lines=6, placeholder="Enter one description per line...", label="Descriptions")
|
| 51 |
],
|
| 52 |
-
outputs=gr.
|
| 53 |
-
title="🧠 CLIP Image-Text Matcher",
|
| 54 |
-
description="Upload an image and enter multiple captions (one per line). The AI will compare
|
| 55 |
)
|
| 56 |
|
| 57 |
-
#
|
| 58 |
iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from transformers import CLIPProcessor, CLIPModel
|
| 3 |
from PIL import Image
|
| 4 |
import torch
|
| 5 |
|
| 6 |
+
# Load the CLIP processor and model once at import time so that every
# request handled by the app reuses the same objects.
model_name = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)
|
| 10 |
|
| 11 |
+
# Define matching function
|
| 12 |
def match_image_with_descriptions(image, descriptions):
    """Score how well each caption matches an image using CLIP.

    Args:
        image: The uploaded picture (the interface is configured to pass a
            PIL image), or ``None`` when nothing was uploaded.
        descriptions: Candidate captions as one string, one caption per
            line. Blank lines and repeated captions are ignored.

    Returns:
        A dict mapping each caption to its match score in percent, rounded
        to two decimals and ordered from best to worst match. When the
        input is unusable, a dict with a single ``"Error"`` key holding a
        message for the user.
    """
    # Test for None explicitly: truthiness is ambiguous for array-like
    # images, and the text value itself may be None rather than "".
    if image is None or not descriptions or not descriptions.strip():
        return {"Error": "Please upload an image and enter descriptions."}

    # One caption per non-blank line. dict.fromkeys drops repeated captions
    # while keeping first-seen order; duplicates would otherwise collapse
    # into one result key yet still split the softmax mass between them.
    captions = list(dict.fromkeys(
        line.strip() for line in descriptions.splitlines() if line.strip()
    ))

    if len(captions) < 2:
        return {"Error": "Please enter at least two descriptions."}

    # truncation=True clips captions that exceed the text encoder's maximum
    # length instead of letting the forward pass fail on them.
    inputs = processor(
        text=captions,
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image is [1, num_captions]; softmax over the caption axis,
    # then take the single image's row as plain Python floats.
    probs = outputs.logits_per_image.softmax(dim=1)[0].tolist()

    # Convert to percentages, then order from best match to worst.
    scores = [round(prob * 100, 2) for prob in probs]
    ranked = sorted(zip(captions, scores), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
|
| 43 |
|
| 44 |
+
# Build the web UI: an image upload and a multi-line caption box feed the
# matching function, and its result dict is rendered as JSON.
iface = gr.Interface(
    fn=match_image_with_descriptions,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(
            lines=6,
            placeholder="Enter one description per line...",
            label="Descriptions",
        ),
    ],
    outputs=gr.JSON(label="Match Scores (Sorted by Confidence %)"),
    title="🧠 CLIP Image-Text Matcher (Sorted, %)",
    description=(
        "Upload an image and enter multiple captions (one per line). "
        "The AI will compare each one and show match scores in % — sorted from best to worst."
    ),
)

# Launch the app.
iface.launch()
|