# imgToText / app.py
# goldrode's picture
# Update app.py
# e467978 verified
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import gradio as gr
# The model, image processor and tokenizer are all loaded from one checkpoint id.
# NOTE(review): confirm this id resolves on the Hugging Face Hub and that the
# repository ships image-processor and tokenizer files next to the weights.
_CHECKPOINT = "microsoft/vision-encoder-decoder-base"

model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
processor = ViTImageProcessor.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
# Function to generate captions
def generate_caption(image) -> str:
    """Generate a short text caption for an image.

    Args:
        image: A PIL image, or ``None`` when no image was supplied
            (for example, the form was submitted without an upload).

    Returns:
        The decoded caption with special tokens stripped, or an empty
        string when ``image`` is ``None``.
    """
    # Nothing was uploaded: return early instead of failing in the processor.
    if image is None:
        return ""

    # Uploads may be RGBA, palette or grayscale. Convert to RGB first;
    # presumably the processor expects three-channel input (TODO confirm).
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Preprocess the image into PyTorch pixel tensors.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # Beam search with 4 beams; output length capped at 16.
    output_ids = model.generate(pixel_values, max_length=16, num_beams=4)

    # Decode the first generated sequence into plain text.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# Build the web UI: one PIL image input wired to generate_caption, text output.
_APP_TITLE = "Image to Text (Caption Generator)"
_APP_DESCRIPTION = "Upload an image, and the AI will describe it!"
_image_input = gr.Image(type="pil")

interface = gr.Interface(
    fn=generate_caption,
    inputs=_image_input,
    outputs="text",
    title=_APP_TITLE,
    description=_APP_DESCRIPTION,
)

# Start serving the app; this runs at module level as soon as the file executes.
interface.launch()