import os

import gradio as gr
from transformers import AutoModelForImageTextToText, AutoProcessor

# Checkpoint shared by the processor and the model, so the two cannot drift apart.
MODEL_ID = "Salesforce/blip-image-captioning-base"

# Load model and processor once at start-up; every request reuses them.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)


def caption(image) -> str:
    """Generate a text caption for an uploaded image.

    Args:
        image: The uploaded picture. It arrives as a PIL Image because the
            input component below is declared with ``type="pil"``; it is
            ``None`` when the user submits without uploading anything.

    Returns:
        The decoded caption with special tokens removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Show a readable message in the UI instead of an opaque processor
        # traceback when the form is submitted empty.
        raise gr.Error("Please upload an image first.")

    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    # generate() returns a batch; only one image was submitted, so decode the
    # first (and only) sequence.
    return processor.decode(out[0], skip_special_tokens=True)


demo = gr.Interface(
    fn=caption,
    inputs=gr.Image(label="Upload Image", type="pil"),
    outputs="text",
)

# Launched at module level, as in the original script.
demo.launch()