"""Zero-shot emotion recognition demo: CLIP scores an image against text prompts."""

from typing import List, Optional, Tuple

import gradio as gr
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Checkpoint shared by the model and its processor; they must match.
MODEL_NAME = "openai/clip-vit-base-patch32"

# Maximum number of ranked labels shown in the results table.
TOP_K = 3

# Load the model once at start-up and switch it to inference mode.
model = CLIPModel.from_pretrained(MODEL_NAME)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

# Text prompts the image is compared against (one prompt per emotion).
emotion_labels = [
    "a happy person",
    "a sad person",
    "an angry person",
    "a surprised person",
    "a fearful person",
    "a disgusted person",
    "a neutral person",
    "a contemptuous person",
    "an unknown emotion",
]


def zero_shot_predict(
    image: Optional[Image.Image],
) -> Tuple[str, List[Tuple[str, str]]]:
    """Rank ``emotion_labels`` by their CLIP similarity to *image*.

    Args:
        image: The PIL image supplied by the Gradio image input, or ``None``
            when the user submitted the form without an image.

    Returns:
        A ``(best_label, rows)`` pair. ``best_label`` is the highest-scoring
        prompt; ``rows`` holds up to ``TOP_K`` ``(label, "NN.NN%")`` tuples
        ordered from most to least likely.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Surface a readable message in the UI instead of an AttributeError
        # from calling ``.convert`` on ``None``.
        raise gr.Error("Please upload an image first.")

    rgb_image = image.convert("RGB")
    inputs = processor(
        text=emotion_labels,
        images=rgb_image,
        return_tensors="pt",
        padding=True,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the text-prompt axis turns the image-text similarity
    # logits into probabilities; [0] selects the single input image.
    probs = outputs.logits_per_image.softmax(dim=1)[0]

    # Clamp k so a shortened label list cannot make topk raise.
    k = min(TOP_K, len(emotion_labels))
    top_probs, top_indices = torch.topk(probs, k)

    # Convert to plain Python numbers once rather than per element.
    rows = [
        (emotion_labels[index], f"{prob * 100:.2f}%")
        for index, prob in zip(top_indices.tolist(), top_probs.tolist())
    ]
    best_emotion = rows[0][0]
    return best_emotion, rows


# Gradio interface: one image in, best label plus a ranked table out.
interface = gr.Interface(
    fn=zero_shot_predict,
    inputs=gr.Image(type="pil"),
    outputs=["text", gr.Dataframe(headers=["Emotion", "Confidence (%)"])],
    title="Zero-Shot Emotion Recognition",
    description="Erkenne Emotionen ohne Training — einfach mit CLIP!",
)

interface.launch()