File size: 1,078 Bytes
e467978 ca0594f 5a65175 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import gradio as gr
# The model weights, the image preprocessing config and the tokenizer are all
# loaded from the same pretrained checkpoint identifier, so it is named once
# and reused for the three from_pretrained() calls.
# NOTE(review): confirm this checkpoint ID actually resolves before deploying.
CHECKPOINT = "microsoft/vision-encoder-decoder-base"

model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)
processor = ViTImageProcessor.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
def generate_caption(image):
    """Generate a short text caption for an image.

    Args:
        image: The image to describe. The Gradio input feeding this function
            is configured with ``type="pil"``, so this is normally a PIL
            image; it may be ``None`` when nothing was supplied.

    Returns:
        The decoded caption with special tokens removed, or an empty string
        when ``image`` is ``None``.
    """
    # Nothing to caption (e.g. the form was submitted without an upload):
    # return early instead of letting the processor raise on None.
    if image is None:
        return ""

    # Grayscale / RGBA / palette images are converted to three channels before
    # preprocessing. getattr() keeps inputs without a ``mode`` attribute
    # (anything that is not a PIL image) flowing through unchanged.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    # Preprocess the image into a PyTorch tensor batch.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # Beam search with 4 beams; generated sequence capped at max_length=16.
    output_ids = model.generate(pixel_values, max_length=16, num_beams=4)

    # Only the first generated sequence is decoded and returned.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# Web UI: a single image input wired to generate_caption, with the returned
# caption displayed as plain text.
image_input = gr.Image(type="pil")  # callback receives a PIL image
interface = gr.Interface(
    fn=generate_caption,
    inputs=image_input,
    outputs="text",
    title="Image to Text (Caption Generator)",
    description="Upload an image, and the AI will describe it!",
)

# Start serving the UI.
interface.launch()
|