File size: 1,478 Bytes
7f1b559
8e585c6
7f1b559
 
 
afdfd46
 
 
 
 
 
7f1b559
 
8e585c6
7f1b559
 
8e585c6
 
 
afdfd46
 
8e585c6
7f1b559
 
8e585c6
7f1b559
 
 
 
8e585c6
7f1b559
8e585c6
 
 
 
afdfd46
 
 
 
8e585c6
 
 
afdfd46
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import streamlit as st
from transformers import pipeline
from PIL import Image

# Load the Visual Question Answering (VQA) model.
# NOTE: the checkpoint is a VQA model, so the pipeline task must be
# "visual-question-answering" — the previous "text-generation" task has no
# image input and would fail when called with `image=` later in the app.
vqa_model = pipeline("visual-question-answering", model="Steven-GU-Yu-Di/Visual-Question-Answering")


# Load the Text-to-Speech (TTS) model
tts = pipeline("text-to-audio", model="Steven-GU-Yu-Di/Text-to-Speech")


# Create a Streamlit app
st.title("Visual Question Answering and Text-to-Speech")

# User inputs (rendered in the main page body, not the sidebar)
uploaded_image = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
question_input = st.text_input("Enter Question")

# Function to perform Visual Question Answering and Text-to-Speech
def perform_vqa_and_tts(image, question):
    """Answer *question* about *image* with the VQA model, then speak the answer.

    Parameters
    ----------
    image : UploadedFile | None
        File-like object from ``st.file_uploader``; opened with PIL.
    question : str
        The user's question; empty string means "no question yet".

    Side effects: renders the image, the answer text, and an audio player
    into the Streamlit page. Shows a warning when either input is missing.
    """
    if image is None or not question:
        # Previously this silently did nothing; tell the user what is missing.
        st.warning("Please upload an image and enter a question.")
        return

    pil_image = Image.open(image)
    st.image(pil_image, caption="Uploaded Image", use_column_width=True)
    st.write("Question:", question)

    # Visual Question Answering. The VQA pipeline takes the image and the
    # question directly; the old code passed an unsupported "context" kwarg
    # and indexed the output as a dict, but VQA pipelines return a list of
    # {"answer", "score"} dicts ranked by score — take the top one.
    vqa_output = vqa_model(image=pil_image, question=question)
    if isinstance(vqa_output, list):
        vqa_output = vqa_output[0]
    answer = vqa_output["answer"]
    st.write("Answer:", answer)

    # Text-to-Speech. A text-to-audio pipeline returns a dict with "audio"
    # (waveform array) and "sampling_rate" for a single input — the old
    # `audio_output[0]['audio']` indexing assumed a list and would fail.
    # Handle both shapes defensively and pass the sample rate so Streamlit
    # can play the raw waveform correctly.
    tts_output = tts(answer)
    if isinstance(tts_output, list):
        tts_output = tts_output[0]
    st.audio(
        tts_output["audio"],
        format="audio/wav",
        sample_rate=tts_output.get("sampling_rate"),
    )

# Run the VQA + TTS flow only on an explicit click of the trigger button.
run_clicked = st.button("Perform VQA and TTS")
if run_clicked:
    perform_vqa_and_tts(uploaded_image, question_input)