File size: 2,010 Bytes
c122b72
 
 
 
 
6a83e1d
c122b72
 
 
 
6a83e1d
eb8c871
 
6a83e1d
eb8c871
6a83e1d
eb8c871
02f82f5
eb8c871
6a83e1d
eb8c871
6a83e1d
eb8c871
6a83e1d
 
c122b72
 
eb8c871
6a83e1d
 
bf2929a
eb8c871
6a83e1d
 
 
 
 
bf2929a
6a83e1d
 
bf2929a
6a83e1d
c122b72
6a83e1d
c122b72
eb8c871
02f82f5
 
eb8c871
02f82f5
6a83e1d
 
 
c122b72
 
6a83e1d
02f82f5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load CLIP model and processor
# NOTE: from_pretrained fetches weights/config from the Hugging Face hub on
# first use (then reads from the local cache) — this runs once at import time.
model_name = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(model_name)  # tokenizes text + preprocesses images
model = CLIPModel.from_pretrained(model_name)          # joint vision/text encoder

# Define matching function
def match_image_with_descriptions(image, descriptions):
    """Score how well each candidate caption matches the uploaded image.

    Args:
        image: PIL image from the Gradio image input (None when absent).
        descriptions: newline-separated captions; may be None or empty.

    Returns:
        dict mapping each caption to its CLIP confidence in percent,
        sorted best match first — or an {"Error": ...} dict on bad input.
    """
    # Gradio passes None for an empty image, and descriptions may be None
    # or whitespace-only; guard all of these explicitly (the original
    # `descriptions.strip()` raised AttributeError on None).
    if image is None or not descriptions or not descriptions.strip():
        return {"Error": "Please upload an image and enter descriptions."}

    # Split user input into lines; splitlines() also handles "\r\n" endings.
    captions = [line.strip() for line in descriptions.splitlines() if line.strip()]

    if len(captions) < 2:
        return {"Error": "Please enter at least two descriptions."}

    # Tokenize the captions and preprocess the image in one batch.
    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)

    # Inference only — disable gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image has shape [1, num_captions]; softmax over the caption
    # axis turns similarity logits into a probability distribution.
    probs = outputs.logits_per_image.softmax(dim=1)[0]

    # Convert each probability to a rounded percentage, keyed by caption.
    result_dict = {
        caption: round(prob.item() * 100, 2)
        for caption, prob in zip(captions, probs)
    }

    # Sort from best match to worst.
    return dict(sorted(result_dict.items(), key=lambda item: item[1], reverse=True))

# Create the Gradio interface
# Wires the matcher to a simple UI: a PIL image upload plus a multi-line
# textbox (one caption per line); scores render as a JSON object sorted
# best-to-worst by the function itself.
iface = gr.Interface(
    fn=match_image_with_descriptions,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),  # type="pil" hands the fn a PIL.Image
        gr.Textbox(lines=6, placeholder="Enter one description per line...", label="Descriptions")
    ],
    outputs=gr.JSON(label="Match Scores (Sorted by Confidence %)"),
    title="🧠 CLIP Image-Text Matcher (Sorted, %)",
    description="Upload an image and enter multiple captions (one per line). The AI will compare each one and show match scores in % — sorted from best to worst.",
)

# Launch the app
# launch() starts a local web server and blocks until it is stopped.
iface.launch()