"""Gradio demo that captions an uploaded image with a vision encoder-decoder model."""

from typing import Optional

import gradio as gr
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# Checkpoint shared by the model, the image processor and the tokenizer.
# NOTE(review): confirm this checkpoint id is available on the Hugging Face Hub.
MODEL_ID = "microsoft/vision-encoder-decoder-base"

# Generation settings used for every caption.
MAX_CAPTION_LENGTH = 16
NUM_BEAMS = 4

# Load the model, processor, and tokenizer once at start-up so that each
# request only pays for inference.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
processor = ViTImageProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def generate_caption(image: Optional[Image.Image]) -> str:
    """Return a text caption describing ``image``.

    Args:
        image: The PIL image supplied by the Gradio ``Image`` input, or
            ``None`` when the user submitted the form without an image.

    Returns:
        The decoded caption with special tokens stripped, or a short prompt
        asking for an image when ``image`` is ``None``.
    """
    # Gradio passes None when nothing was uploaded; the processor cannot
    # handle that, so answer with a hint instead of raising.
    if image is None:
        return "Please upload an image first."

    # Uploads may be RGBA, palette or greyscale; normalise to three channels
    # before preprocessing.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Preprocess the image into the tensor the encoder consumes.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # Generate the caption token ids with beam search, then decode the best one.
    output_ids = model.generate(
        pixel_values,
        max_length=MAX_CAPTION_LENGTH,
        num_beams=NUM_BEAMS,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Gradio interface
interface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil"),
    outputs="text",
    title="Image to Text (Caption Generator)",
    description="Upload an image, and the AI will describe it!",
)

# Launch the interface (kept at module level, as in the original script).
interface.launch()