File size: 2,019 Bytes
a4d3132
 
 
 
 
c63d328
a4d3132
 
16cda9f
a4d3132
 
 
c63d328
a4d3132
 
1eebd1b
5f39df4
6be9290
04fe24c
 
5f39df4
733173b
c63d328
 
a4d3132
74fc286
c63d328
a4d3132
 
 
 
 
 
c63d328
a4d3132
932211d
 
 
 
40d2871
c63d328
 
 
 
 
 
 
 
e35efdf
 
 
c63d328
 
 
8a1f685
 
c63d328
f966229
86cd1ed
c63d328
d46ae3c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# import part
import hashlib
import os
import tempfile

import streamlit as st
from transformers import pipeline

# function part
# image2text
@st.cache_resource
def _load_image_to_text_model():
    """Build the image-captioning pipeline once and reuse it.

    Streamlit re-executes the whole script on every widget interaction.
    ``st.cache_resource`` keeps the loaded pipeline in memory across those
    reruns, so the model is not re-initialised on every call.
    """
    return pipeline("image-to-text",
                    model="nlpconnect/vit-gpt2-image-captioning")


def img2text(img):
    """Return a caption for ``img``.

    Args:
        img: Image input forwarded unchanged to the "image-to-text"
            pipeline (the script below passes a file path).

    Returns:
        The ``"generated_text"`` entry of the first pipeline result.
    """
    captions = _load_image_to_text_model()(img)
    return captions[0]["generated_text"]

# text2story
@st.cache_resource
def _load_text_generation_model():
    """Build the GPT-2 text-generation pipeline once and reuse it.

    Cached with ``st.cache_resource`` because Streamlit reruns the script
    on every interaction; without the cache the model would be rebuilt on
    each call.
    """
    return pipeline("text-generation",
                    model="openai-community/gpt2")


def text2story(text):
    """Generate a short story seeded by ``text``.

    Args:
        text: Scene description that is appended to a fixed fairy-tale
            opening to form the generation prompt.

    Returns:
        The ``'generated_text'`` entry of the single returned sequence.
    """
    story_text = f"Once upon a time in a land far, far away, {text}"
    generated_story = _load_text_generation_model()(
        story_text,
        max_length=100,
        num_return_sequences=1,
    )
    return generated_story[0]['generated_text']
        
# text2audio
@st.cache_resource
def _load_text_to_speech_model():
    """Build the text-to-speech pipeline once and reuse it.

    Cached with ``st.cache_resource`` so that Streamlit's script reruns do
    not rebuild the model on every call.
    """
    return pipeline("text-to-speech", model="facebook/mms-tts-eng")


def text2audio(story_text):
    """Synthesise speech for ``story_text``.

    Args:
        story_text: Text to be spoken.

    Returns:
        The raw pipeline output. The script below reads its ``'audio'``
        and ``'sampling_rate'`` entries when playing the result.
    """
    return _load_text_to_speech_model()(story_text)

# main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="*")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    bytes_data = uploaded_file.getvalue()
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Streamlit reruns this script on every interaction, including the
    # "Play Audio" click. Results are kept in session state, keyed by a
    # digest of the uploaded bytes, so a rerun for the same image reuses
    # them instead of running all three models again.
    upload_digest = hashlib.sha256(bytes_data).hexdigest()
    result = st.session_state.get("story_result")
    if result is None or result.get("digest") != upload_digest:
        result = {"digest": upload_digest}
        st.session_state["story_result"] = result

    # stage 1
    st.text('Processing img2text...')
    if "scenario" not in result:
        # Hand the image to the captioner through a temporary file rather
        # than a file named after the client-supplied upload name: this
        # avoids writing to an attacker-influenced path and does not leave
        # uploads behind in the working directory.
        suffix = os.path.splitext(uploaded_file.name)[1]
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
            tmp_file.write(bytes_data)
        try:
            result["scenario"] = img2text(tmp_file.name)
        finally:
            os.remove(tmp_file.name)
    st.write(result["scenario"])

    # stage 2
    st.text('Generating a story...')
    if "story" not in result:
        # Use the scenario from img2text
        result["story"] = text2story(result["scenario"])
    st.write(result["story"])

    # stage 3
    st.text('Generating audio data...')
    if "audio" not in result:
        result["audio"] = text2audio(result["story"])

    # The button lives inside the upload branch: outside it, a click with
    # no uploaded image would reference audio data that was never created.
    if st.button("Play Audio"):
        audio_data = result["audio"]
        st.audio(audio_data['audio'],
                 format="audio/wav",
                 start_time=0,
                 sample_rate=audio_data['sampling_rate'])