from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
import gradio as gr
# The processor and the captioning model are both loaded from one checkpoint,
# so the identifier is named once and passed to each loader.
MODEL_ID = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(MODEL_ID)
model = BlipForConditionalGeneration.from_pretrained(MODEL_ID)
def generate_caption(image):
    """Return a caption for ``image`` produced by the module-level BLIP model.

    The image is passed through the module-level ``processor`` with
    ``return_tensors="pt"``, the resulting inputs are fed to
    ``model.generate`` inside ``torch.no_grad()``, and the first generated
    sequence is decoded with special tokens skipped.

    Args:
        image: The image to caption, in a form the processor accepts.

    Returns:
        The decoded caption text.
    """
    model_inputs = processor(image, return_tensors="pt")

    # Inference only: gradient tracking is switched off for generation.
    with torch.no_grad():
        generated_ids = model.generate(**model_inputs)

    # Only the first generated sequence is decoded and returned.
    first_sequence = generated_ids[0]
    return processor.decode(first_sequence, skip_special_tokens=True)
def interface(image):
    """Gradio callback: produce a caption for the uploaded image.

    Args:
        image: The image object supplied by the Gradio image input (it must
            provide ``convert``), or ``None`` when nothing was uploaded.

    Returns:
        The generated caption; a prompt asking for an image when ``image`` is
        ``None``; or the text of the exception if captioning fails.
    """
    # Gradio hands the callback None when the user submits without an image.
    # Answer with a readable prompt instead of letting the AttributeError
    # text from ``None.convert`` end up in the output box.
    if image is None:
        return "Please upload an image first."

    try:
        # Convert to RGB so images in other modes are accepted downstream.
        rgb_image = image.convert("RGB")
        return generate_caption(rgb_image)
    except Exception as e:
        # UI boundary: show the failure in the output textbox rather than
        # raising out of the callback.
        return str(e)

# Build the two UI components first (image input, then caption output) and
# wire them to the captioning callback.
image_input = gr.Image(type="pil", label="Upload an Image")
caption_output = gr.Textbox(label="What image tells???")

iface = gr.Interface(
    fn=interface,
    inputs=image_input,
    outputs=caption_output,
    title="Image Captioning with BLIP",
    description="Upload an image to generate a caption using the BLIP model.",
)

# Start the app; inline=False is passed through to launch() unchanged.
iface.launch(inline=False)