import streamlit as st from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer from PIL import Image # Load the models and tokenizer model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning") # Define generation parameters max_length = 20 num_beams = 7 gen_kwargs = {"max_length": max_length, "num_beams": num_beams} # Define the Streamlit app def main(): st.title("Image Captioning App") st.write("Upload an image and get a caption!") uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png", "webp"]) if uploaded_image is not None: st.image(uploaded_image, caption="Uploaded Image.", use_column_width=True) st.write("Generating caption...") # Preprocess the uploaded image image = Image.open(uploaded_image) if image.mode != "RGB": image = image.convert(mode="RGB") # Preprocess the image and generate caption pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values output_ids = model.generate(pixel_values, **gen_kwargs) # Decode and display the caption caption = tokenizer.batch_decode(output_ids, skip_special_tokens=True) caption = caption[0].strip() st.write(f"Caption: {caption}") if __name__ == "__main__": st.set_option('deprecation.showfileUploaderEncoding', False) # Disable file uploader encoding warning main()