"""Streamlit app: answer a free-text question about an uploaded image using ViLT."""

import streamlit as st
import torch
from PIL import Image
from transformers import ViltForQuestionAnswering, ViltProcessor

# Hugging Face checkpoint used for both the processor and the VQA model.
MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"


@st.cache_resource
def load_vqa_model() -> tuple[ViltProcessor, ViltForQuestionAnswering]:
    """Load the ViLT processor and model once and reuse them across reruns.

    Streamlit re-executes the whole script on every widget interaction, so
    without caching the checkpoint would be reloaded each time the user
    types a question or uploads a file.

    Returns:
        A ``(processor, model)`` pair loaded from ``MODEL_NAME``.
    """
    processor = ViltProcessor.from_pretrained(MODEL_NAME)
    model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME)
    return processor, model


def answer_question(image: Image.Image, question: str) -> str:
    """Return the model's highest-scoring answer label for *question*.

    Args:
        image: The picture to ask about; expected to be in RGB mode.
        question: Natural-language question about the image.

    Returns:
        The label from ``model.config.id2label`` with the largest logit.
    """
    processor, model = load_vqa_model()
    encoding = processor(image, question, return_tensors="pt")
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**encoding).logits
    best_idx = logits.argmax(-1).item()
    return model.config.id2label[best_idx]


uploaded_file = st.file_uploader("Upload Images", type=["png", "jpg", "jpeg"])
if uploaded_file is not None:
    # Open the upload once; show it as-is, but feed the model an RGB copy
    # because PNG uploads may be RGBA, palette, or greyscale.
    uploaded_image = Image.open(uploaded_file)
    st.image(uploaded_image, width=250)
    rgb_image = uploaded_image.convert("RGB")

    question = st.text_input("Enter your question about the image ? ")
    # Do not run the model until the user has actually typed a question.
    if question.strip():
        st.write("Model replay:", answer_question(rgb_image, question))