Spaces:
Runtime error
Runtime error
import streamlit as st
from transformers import pipeline
from PIL import Image

# Load the Visual Question Answering (VQA) model.
# FIX: the original used task "text-generation", which cannot accept an image
# input and made the later vqa_model(image=..., question=...) call fail; the
# model name indicates a VQA checkpoint, so the dedicated
# "visual-question-answering" task is the correct one.
vqa_model = pipeline("visual-question-answering", model="Steven-GU-Yu-Di/Visual-Question-Answering")

# Load the Text-to-Speech (TTS) model
tts = pipeline("text-to-audio", model="Steven-GU-Yu-Di/Text-to-Speech")

# Streamlit page: title plus the two user inputs (image file and question text)
st.title("Visual Question Answering and Text-to-Speech")
uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
question_input = st.text_input("Enter Question")
# Function to perform Visual Question Answering and Text-to-Speech
def perform_vqa_and_tts(image, question):
    """Answer *question* about *image*, display the answer, and speak it.

    Parameters
    ----------
    image : UploadedFile or None
        File-like object from ``st.file_uploader``; ignored when ``None``.
    question : str
        The user's question; falsy (empty) values are ignored.

    Side effects: renders the image, the question, the answer, and an audio
    player into the Streamlit page. Returns ``None``.
    """
    if image is not None and question:
        image = Image.open(image)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        st.write("Question:", question)

        # Visual Question Answering.
        # FIX: the VQA pipeline takes the image and question directly — the
        # original passed a "context" kwarg (an extractive-QA concept the VQA
        # task does not accept). Pipelines also return a LIST of candidate
        # {"answer", "score"} dicts (best first), not a single dict, so the
        # original vqa_output['answer'] raised at runtime.
        vqa_output = vqa_model(image=image, question=question)
        answer = vqa_output[0]["answer"]
        st.write("Answer:", answer)

        # Text-to-Speech.
        # FIX: a "text-to-audio" pipeline returns a dict holding the waveform
        # and its sampling rate, not a list — the original audio_output[0]
        # indexing failed, and the sampling rate was being discarded.
        # NOTE(review): exact output schema varies by transformers version —
        # confirm {"audio", "sampling_rate"} keys against the pinned release.
        audio_output = tts(answer)
        st.audio(
            audio_output["audio"],
            format="audio/wav",
            sample_rate=audio_output["sampling_rate"],
        )
# Entry point: the pipeline only runs when the user explicitly clicks.
run_requested = st.button("Perform VQA and TTS")
if run_requested:
    perform_vqa_and_tts(uploaded_image, question_input)