"""Gradio demo: rank user-supplied captions against an uploaded image with CLIP."""

import gradio as gr
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Load the CLIP model and processor once at startup (downloads on first run).
MODEL_NAME = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model = CLIPModel.from_pretrained(MODEL_NAME)
model.eval()  # inference only — make dropout/batch-norm behavior explicit


def match_image_with_descriptions(image, descriptions):
    """Score how well each caption matches the image using CLIP.

    Args:
        image: PIL image from the Gradio upload widget, or None if nothing
            was uploaded.
        descriptions: newline-separated captions; may be None or empty when
            the textbox is cleared.

    Returns:
        dict mapping caption -> match confidence in percent, sorted from
        best to worst match, or a single-entry {"Error": ...} dict when the
        input is missing or has fewer than two captions.
    """
    # Gradio passes None for an empty upload; `not image` is unreliable for
    # PIL images, so test identity explicitly. `descriptions` may also be
    # None, which would crash the original `.strip()` call.
    if image is None or not (descriptions or "").strip():
        return {"Error": "Please upload an image and enter descriptions."}

    # One caption per non-blank line.
    captions = [line.strip() for line in descriptions.splitlines() if line.strip()]
    if len(captions) < 2:
        return {"Error": "Please enter at least two descriptions."}

    # Tokenize the captions and preprocess the image together.
    inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image has shape [1, num_captions]; softmax over the caption
    # axis yields each caption's relative confidence.
    probs = outputs.logits_per_image.softmax(dim=1)[0]

    # Convert to percentages and sort best-to-worst.
    scores = {
        caption: round(prob.item() * 100, 2)
        for caption, prob in zip(captions, probs)
    }
    return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))


# Build the Gradio UI.
iface = gr.Interface(
    fn=match_image_with_descriptions,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(
            lines=6,
            placeholder="Enter one description per line...",
            label="Descriptions",
        ),
    ],
    outputs=gr.JSON(label="Match Scores (Sorted by Confidence %)"),
    title="🧠 CLIP Image-Text Matcher (Sorted, %)",
    description=(
        "Upload an image and enter multiple captions (one per line). The AI will "
        "compare each one and show match scores in % — sorted from best to worst."
    ),
)

# Launch the web app only when run as a script, so the module stays importable.
if __name__ == "__main__":
    iface.launch()