"""Gradio demo that captions an uploaded image with the BLIP model."""

import logging
from typing import Optional

from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import gradio as gr

logger = logging.getLogger(__name__)

# Load the processor and model once at import time so that every request
# handled by the interface reuses the same objects.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


def generate_caption(image: Image.Image) -> str:
    """Return a caption generated by the BLIP model for *image*.

    Args:
        image: The image to describe. The caller in this file passes a PIL
            image already converted to RGB.

    Returns:
        The decoded caption text with special tokens stripped.
    """
    # Preprocess the image into PyTorch tensors ("pt").
    inputs = processor(image, return_tensors="pt")

    # Generation is inference only, so gradient tracking is disabled.
    with torch.no_grad():
        output = model.generate(**inputs)

    # Decode the first (and only) generated sequence.
    return processor.decode(output[0], skip_special_tokens=True)


def interface(image: Optional[Image.Image]) -> str:
    """Gradio callback: caption the uploaded image.

    Args:
        image: The PIL image supplied by the ``gr.Image`` input, or ``None``
            when the form is submitted without an image.

    Returns:
        The generated caption, or a human-readable message when no image was
        provided or captioning failed.
    """
    # Without this guard a missing upload surfaces as the cryptic
    # "'NoneType' object has no attribute 'convert'".
    if image is None:
        return "Please upload an image first."

    try:
        # Convert to RGB to ensure compatibility with the model's processor.
        rgb_image = image.convert("RGB")
        return generate_caption(rgb_image)
    except Exception as e:
        # Top-level UI boundary: keep showing the error text in the output
        # box (original behaviour), but also record the traceback.
        logger.exception("Caption generation failed")
        return str(e)


# Create the Gradio interface.
iface = gr.Interface(
    fn=interface,
    inputs=gr.Image(type="pil", label="Upload an Image"),  # Image upload input
    outputs=gr.Textbox(label="What image tells???"),  # Shows the caption
    title="Image Captioning with BLIP",
    description="Upload an image to generate a caption using the BLIP model.",
)

# Launch the interface. Kept at module level, as in the original script.
iface.launch(inline=False)