import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSequenceClassification
from PIL import Image
import torch
import torch.nn.functional as F

# Load BLIP model and processor for image captioning
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Load emotion classification model and tokenizer
emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

def detect_emotion_caption(image):
    # Generate a caption for the image using BLIP
    inputs = blip_processor(image, return_tensors="pt")
    with torch.no_grad():
        output = blip_model.generate(**inputs)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)

    # Predict the emotion expressed by the caption text
    inputs = emotion_tokenizer(caption, return_tensors="pt")
    with torch.no_grad():
        logits = emotion_model(**inputs).logits
    probs = F.softmax(logits, dim=1)
    predicted_class = torch.argmax(probs, dim=1).item()
    emotion = emotion_model.config.id2label[predicted_class]

    return f"Caption: {caption}\nDetected Emotion: {emotion}"

# Gradio interface
iface = gr.Interface(
    fn=detect_emotion_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image Emotion Detection",
    description="Upload an image. The app will generate a caption and detect the associated emotion."
)

if __name__ == "__main__":
    iface.launch()
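
# --- Optional variant (sketch, not part of the app above) ----------------
# The single argmax label hides how confident the classifier is. A variant
# like the one below returns the caption plus the top-k emotion classes with
# their probabilities. The function name detect_emotion_caption_topk and
# k=3 are illustrative choices; to use it, move it above the Interface
# definition and pass fn=detect_emotion_caption_topk instead.
def detect_emotion_caption_topk(image, k=3):
    # Caption the image exactly as in detect_emotion_caption
    inputs = blip_processor(image, return_tensors="pt")
    with torch.no_grad():
        output = blip_model.generate(**inputs)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)

    # Classify the caption, then report the k most probable emotions
    enc = emotion_tokenizer(caption, return_tensors="pt")
    with torch.no_grad():
        logits = emotion_model(**enc).logits
    probs = F.softmax(logits, dim=1).squeeze(0)
    top = torch.topk(probs, k)
    lines = [f"{emotion_model.config.id2label[idx.item()]}: {p.item():.2%}"
             for p, idx in zip(top.values, top.indices)]
    return f"Caption: {caption}\n" + "\n".join(lines)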