File size: 1,478 Bytes
7f1b559
8e585c6
7f1b559
 
 
afdfd46
 
 
 
 
 
7f1b559
 
8e585c6
7f1b559
 
8e585c6
 
 
afdfd46
 
8e585c6
7f1b559
 
8e585c6
7f1b559
 
 
 
8e585c6
7f1b559
8e585c6
 
 
 
afdfd46
 
 
 
8e585c6
 
 
afdfd46
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import streamlit as st
from transformers import pipeline
from PIL import Image

# Load the Visual Question Answering (VQA) model.
# NOTE: the checkpoint is a VQA model, so the pipeline task must be
# "visual-question-answering" — the previous "text-generation" task has no
# image input and would fail when called with `image=` later in the app.
vqa_model = pipeline("visual-question-answering", model="Steven-GU-Yu-Di/Visual-Question-Answering")


# Load the Text-to-Speech (TTS) model
tts = pipeline("text-to-audio", model="Steven-GU-Yu-Di/Text-to-Speech")


# Create a Streamlit app
st.title("Visual Question Answering and Text-to-Speech")

# User inputs (rendered in the main page body, not the sidebar)
uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
question_input = st.text_input("Enter Question")

# Function to perform Visual Question Answering and Text-to-Speech
def perform_vqa_and_tts(image, question):
    """Answer *question* about *image* with the VQA model, then speak the answer.

    Parameters
    ----------
    image : UploadedFile | None
        File-like object from ``st.file_uploader``; opened with PIL.
    question : str
        The user's question; empty string means "no question yet".

    Side effects: renders the image, the answer text, and an audio player
    into the Streamlit page. Shows a warning when either input is missing.
    """
    if image is None or not question:
        # Previously this silently did nothing; tell the user what is missing.
        st.warning("Please upload an image and enter a question.")
        return

    pil_image = Image.open(image)
    st.image(pil_image, caption="Uploaded Image", use_column_width=True)
    st.write("Question:", question)

    # Visual Question Answering. The VQA pipeline takes the image and the
    # question directly; the old code passed an unsupported "context" kwarg
    # and indexed the output as a dict, but VQA pipelines return a list of
    # {"answer", "score"} dicts ranked by score — take the top one.
    vqa_output = vqa_model(image=pil_image, question=question)
    if isinstance(vqa_output, list):
        vqa_output = vqa_output[0]
    answer = vqa_output["answer"]
    st.write("Answer:", answer)

    # Text-to-Speech. A text-to-audio pipeline returns a dict with "audio"
    # (waveform array) and "sampling_rate" for a single input — the old
    # `audio_output[0]['audio']` indexing assumed a list and would fail.
    # Handle both shapes defensively and pass the sample rate so Streamlit
    # can play the raw waveform correctly.
    tts_output = tts(answer)
    if isinstance(tts_output, list):
        tts_output = tts_output[0]
    st.audio(
        tts_output["audio"],
        format="audio/wav",
        sample_rate=tts_output.get("sampling_rate"),
    )

# Run the VQA + TTS flow only on an explicit click of the trigger button.
run_clicked = st.button("Perform VQA and TTS")
if run_clicked:
    perform_vqa_and_tts(uploaded_image, question_input)