"""Gradio demo: generate natural-language captions for images with BLIP."""

import gradio as gr
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Device setup: prefer GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load BLIP model and processor once at startup (downloads weights on first run).
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)


def generate_caption(image):
    """Generate a caption for a PIL image using BLIP.

    Args:
        image: A ``PIL.Image.Image`` from the Gradio image input, or ``None``
            when the user has not uploaded anything.

    Returns:
        The generated caption string, or a prompt message if no image was given.
    """
    if image is None:
        return "Please upload an image."
    inputs = processor(images=image, return_tensors="pt").to(device)
    # Inference only — disable autograd to save memory and time.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=50)
    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption


# Gradio Interface
interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="BLIP Image Captioning Demo",
    description="Upload an image and BLIP will generate a natural language caption.",
)

# Guard the launch so importing this module (e.g. to reuse generate_caption)
# does not start a web server as a side effect.
if __name__ == "__main__":
    interface.launch()