"""Gradio demo that captions an uploaded image with a BLIP model.

Importing this module loads the processor and model weights (downloading
them on first use) and moves the model to the GPU when one is available.
Running it as a script starts the Gradio web UI.
"""

import logging
from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

logger = logging.getLogger(__name__)

# Single source of truth for the checkpoint used by both processor and model.
MODEL_ID = "Salesforce/blip-image-captioning-large"

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load processor & model
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(MODEL_ID).to(device)


# Inference function
def generate_caption(image: Optional[Image.Image]) -> str:
    """Return a caption for *image*, or an ``"Error: ..."`` message.

    Args:
        image: The PIL image handed over by the Gradio ``Image`` component,
            or ``None`` when no image was supplied.

    Returns:
        The decoded caption on success. On any failure a string starting
        with ``"Error: "`` is returned instead of raising, so the message
        is shown in the UI's text output.
    """
    # Without this guard the failure surfaces as an opaque
    # "'NoneType' object has no attribute 'convert'" message.
    if image is None:
        return "Error: no image provided. Please upload an image."

    try:
        rgb_image = image.convert("RGB")
        with torch.inference_mode():
            inputs = processor(images=rgb_image, return_tensors="pt").to(device)
            output = model.generate(**inputs)
        return processor.decode(output[0], skip_special_tokens=True)
    except Exception as e:
        # Top-level UI boundary: keep the app responsive, but record the
        # traceback so the failure is not silently lost.
        logger.exception("Caption generation failed")
        return f"Error: {e}"


# Gradio UI
interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="🖼️ Image to Text Captioning",
    description=(
        "Upload an image and get a caption using BLIP "
        "(Salesforce/blip-image-captioning-large)."
    ),
)

if __name__ == "__main__":
    # NOTE(review): share=True requests a publicly reachable Gradio link;
    # confirm that exposing this demo publicly is intended.
    interface.launch(share=True)