import gradio as gr
import torch
from transformers import pipeline

# 1. Build the CLIP zero-shot image-classification pipeline once, at import
# time; the resulting callable is stored at module level as `classifier`.
print("Loading OpenAI CLIP model...")
classifier = pipeline(
    task="zero-shot-image-classification",
    model="openai/clip-vit-base-patch32",
)

def classify_image(image, labels_text):
    """Score an image against user-supplied candidate labels with CLIP.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
            It is forwarded unchanged to the module-level ``classifier``.
        labels_text: Comma-separated candidate labels. ``None`` is treated
            the same as an empty string.

    Returns:
        A ``{label: score}`` dict in the shape ``gr.Label`` displays. When
        an input is missing, or no usable label remains after cleaning, the
        dict instead holds a single instructional message mapped to ``1.0``.
    """
    # Guard against a missing upload or a missing/blank label string. The
    # label value may be None rather than "" (for example when the function
    # is called programmatically), so check that before calling .strip().
    if image is None or not labels_text or not labels_text.strip():
        return {"Please upload an image and provide labels.": 1.0}

    # Split on commas, trim whitespace and drop empty fragments.
    stripped = (label.strip() for label in labels_text.split(","))

    # Remove exact duplicates while keeping first-seen order. The result is
    # a dict keyed by label, so a repeated label could only ever appear once
    # anyway; sending it to the model twice would merely dilute its score,
    # since the scores are normalised across the whole candidate list.
    candidate_labels = list(dict.fromkeys(label for label in stripped if label))

    if not candidate_labels:
        return {"Please enter at least one valid label.": 1.0}

    # 2. Run inference through CLIP
    predictions = classifier(image, candidate_labels=candidate_labels)

    # 3. Convert the list of {"label", "score"} records into the
    # {"label_name": score_float} mapping that gr.Label expects.
    return {pred["label"]: float(pred["score"]) for pred in predictions}

# 4. Define the User Interface
# The widgets are built as named components first, then wired together.
image_input = gr.Image(type="pil", label="1. Upload your Image")
labels_input = gr.Textbox(
    label="2. Candidate Labels (Separate with commas)",
    placeholder="e.g., a sunny beach, a cozy rainy day, a cute animal, corporate office",
    value="a playful dog, a quiet cat, an outdoor landscape, indoor architecture",
)
confidence_output = gr.Label(num_top_classes=5, label="Matching Confidence")

demo = gr.Interface(
    fn=classify_image,
    inputs=[image_input, labels_input],
    outputs=confidence_output,
    title="CLIP Zero-Shot Image Matcher",
    description="Type any descriptive phrases or labels you can think of, separate them with commas, and see how well OpenAI's CLIP aligns them to your uploaded photo.",
    flagging_mode="never",
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()