| import streamlit as st | |
| from transformers import pipeline | |
| from PIL import Image | |
| import torch | |
| from transformers import ViTFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel | |
# Hugging Face Hub checkpoint id shared by every component loaded below.
loc = "ydshieh/vit-gpt2-coco-en"

# Keep a handle on the imported factory: the name ``pipeline`` is rebound to
# the constructed pipeline object further down, as in the original script.
_pipeline_factory = pipeline


def _cache_resource(func):
    """Wrap *func* with Streamlit's resource cache when one is available.

    Tries ``st.cache_resource`` first, then ``st.experimental_singleton``,
    and returns *func* undecorated if neither attribute exists, so the
    script keeps working on Streamlit releases without these APIs.
    """
    decorator = getattr(st, "cache_resource", None)
    if decorator is None:
        decorator = getattr(st, "experimental_singleton", None)
    return func if decorator is None else decorator(func)


@_cache_resource
def _load_captioner():
    """Load the pipeline, feature extractor, tokenizer and model for ``loc``.

    Streamlit re-runs the whole script on every widget interaction; caching
    the loaded objects avoids re-reading the checkpoint on each rerun.

    Returns:
        A ``(pipeline, feature_extractor, tokenizer, model)`` tuple.
    """
    # NOTE(review): nothing in the visible script uses this pipeline object,
    # and it loads the checkpoint a second time. It is kept only so the
    # module-level name ``pipeline`` stays available; consider removing it.
    captioning_pipeline = _pipeline_factory(model=loc)
    extractor = ViTFeatureExtractor.from_pretrained(loc)
    tok = AutoTokenizer.from_pretrained(loc)
    captioner = VisionEncoderDecoderModel.from_pretrained(loc)
    captioner.eval()  # switch the module to evaluation mode for inference
    return captioning_pipeline, extractor, tok, captioner


pipeline, feature_extractor, tokenizer, model = _load_captioner()
def predict(image, *, max_length=1000, num_beams=4):
    """Generate caption text for *image* using the module-level model.

    Args:
        image: Input handed to ``feature_extractor``. A PIL image whose mode
            is not ``"RGB"`` is converted to RGB first.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        A list of decoded strings, one per generated sequence, with special
        tokens skipped and surrounding whitespace stripped.
    """
    # Uploaded files are frequently RGBA, palette or grayscale. The ViT
    # preprocessing is presumably configured for three colour channels
    # (TODO confirm against the checkpoint's preprocessor config), so
    # normalise the mode before extracting pixel values.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        generated = model.generate(
            pixel_values,
            max_length=max_length,
            num_beams=num_beams,
            return_dict_in_generate=True,
        )

    captions = tokenizer.batch_decode(generated.sequences, skip_special_tokens=True)
    return [caption.strip() for caption in captions]
# Upload widget; the value is None until the user picks a file.
file_name = st.file_uploader("Upload")
if file_name is not None:
    try:
        image = Image.open(file_name)
    except OSError:
        # Pillow signals an unreadable or non-image upload with an
        # OSError subclass; report it instead of showing a traceback.
        st.error("The uploaded file could not be read as an image.")
    else:
        # Left column: the uploaded picture. Right column: its caption(s).
        col1, col2 = st.columns(2)
        col1.image(image, use_column_width=True)
        col2.header("Description")
        # predict() returns a list of strings; write each one as text in the
        # same column as the header rather than as a raw list below it.
        for caption in predict(image):
            col2.write(caption)