from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import torch
import gradio as gr

# Load the CLIP preprocessor (tokenizer + image transforms) for the checkpoint.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the matching CLIP weights, move them to the chosen device and switch
# the module to evaluation mode.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device).eval()

# Candidate text prompts, one per emotion, that are compared against the image.
# The last prompt is a catch-all and does not follow the "<article> <adjective>
# person" pattern, so it is appended separately.
emotion_labels = [
    f"{article} {adjective} person"
    for article, adjective in (
        ("a", "happy"),
        ("a", "sad"),
        ("an", "angry"),
        ("a", "surprised"),
        ("a", "fearful"),
        ("a", "disgusted"),
        ("a", "neutral"),
        ("a", "contemptuous"),
    )
] + ["an unknown emotion"]

# Prediction function used as the Gradio callback
def zero_shot_predict(image):
    """Rank the emotion prompts against an image with CLIP.

    Args:
        image: PIL image delivered by the Gradio image input, or ``None``
            when the form was submitted without an image.

    Returns:
        A tuple ``(best_emotion, top3)``. ``best_emotion`` is the
        highest-scoring prompt from ``emotion_labels``; ``top3`` is a list of
        ``(prompt, "xx.xx%")`` pairs for the best-scoring prompts (at most
        three), ordered from most to least likely.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Submitting an empty image component yields None; surface a readable
        # message in the UI instead of an AttributeError on ``None.convert``.
        raise gr.Error("Please upload an image first.")

    image = image.convert("RGB")

    inputs = processor(
        text=emotion_labels,
        images=image,
        return_tensors="pt",
        padding=True,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Image-text similarity logits (one row per image), normalised over the
    # prompts so each row sums to 1.
    probs = outputs.logits_per_image.softmax(dim=1)

    # torch.topk raises when k exceeds the number of candidates, so never ask
    # for more entries than there are prompts.
    k = min(3, len(emotion_labels))
    top_prob, top_idx = torch.topk(probs, k)

    # Convert once to plain Python ints/floats rather than indexing the label
    # list with 0-dim tensors.
    indices = top_idx[0].tolist()
    scores = top_prob[0].tolist()

    top3 = [
        (emotion_labels[i], f"{p * 100:.2f}%")
        for i, p in zip(indices, scores)
    ]
    best_emotion = emotion_labels[indices[0]]

    return best_emotion, top3

# Gradio interface: one PIL image as input; the best-matching label as text
# plus a table of the top matches as output.
interface = gr.Interface(
    title="Zero-Shot Emotion Recognition",
    description="Erkenne Emotionen ohne Training — einfach mit CLIP!",
    fn=zero_shot_predict,
    inputs=gr.Image(type="pil"),
    outputs=[
        "text",
        gr.Dataframe(headers=["Emotion", "Confidence (%)"]),
    ],
)

# Launch the app.
interface.launch()