"""Gradio demo: generate natural-language captions for images with BLIP."""

import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Device setup: prefer GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP model and processor once at startup (downloads weights on first run).
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)


def generate_caption(image):
    """Generate a caption for a PIL image using BLIP.

    Args:
        image: A ``PIL.Image.Image`` from the Gradio image input, or ``None``
            when the user has not uploaded anything.

    Returns:
        The generated caption string, or a prompt message if no image was given.
    """
    if image is None:
        return "Please upload an image."
    inputs = processor(images=image, return_tensors="pt").to(device)
    # Inference only — disable autograd to save memory and time.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50)
    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption


# Gradio Interface
interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="BLIP Image Captioning Demo",
    description="Upload an image and BLIP will generate a natural language caption.",
)

# Guard the launch so importing this module (e.g. to reuse generate_caption)
# does not start a web server as a side effect.
if __name__ == "__main__":
    interface.launch()