File size: 3,138 Bytes
82363f6
 
 
7730cd8
2ba3da6
 
82363f6
 
 
 
 
 
 
 
 
 
 
 
 
d4c3cef
 
 
 
 
 
 
 
 
82363f6
 
7730cd8
82363f6
 
753fdd3
db4cdf7
753fdd3
 
 
68c3b1f
753fdd3
7730cd8
753fdd3
7730cd8
753fdd3
 
 
5f79f2b
753fdd3
 
 
 
5f79f2b
 
 
2ba3da6
 
5f79f2b
2ba3da6
 
 
 
 
 
 
 
 
7730cd8
2ba3da6
82363f6
68c3b1f
82363f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7730cd8
 
82363f6
 
7730cd8
 
82363f6
 
 
 
7730cd8
 
 
 
37a58fb
82363f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#Import part
from transformers import pipeline
import streamlit as st
import torch



# Use function for the implementation

# function part
# img2text
def img2text(img):
    """Caption an image with the BLIP image-captioning model.

    Args:
        img: image input accepted by the HF pipeline (here: a file path).

    Returns:
        The generated caption string for the image.
    """
    captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    results = captioner(img)
    return results[0]["generated_text"]

# text2story
def text2story(text):
    """Continue a short caption into a longer story using distilgpt2.

    Args:
        text: seed text (the image caption) for the generator.

    Returns:
        The generated story text (includes the seed prompt).
    """
    story_pipe = pipeline("text-generation", model="distilbert/distilgpt2")
    outputs = story_pipe(
        text,
        min_length=150,          # at least ~150 tokens (requirement: > 100)
        max_length=300,
        num_return_sequences=1,
        no_repeat_ngram_size=2,  # suppress repeated 2-grams
        early_stopping=False,    # do not cut generation short
    )
    return outputs[0]["generated_text"]


# text2audio
def text2audio(story_text):
    """Synthesize speech for the story with the MMS English TTS model.

    Args:
        story_text: the story string to read aloud.

    Returns:
        The pipeline's output dict as-is; the caller reads the
        'audio' and 'sampling_rate' keys from it.
    """
    tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-eng")
    # The pipeline directly returns a dict with the waveform and its
    # sampling rate, so no manual WAV encoding/buffering is needed.
    # (Earlier experiments with suno/bark-small and microsoft/speecht5_tts
    # were removed — recover them from version control if needed.)
    audio_data = tts_pipeline(story_text)
    return audio_data


# program main part
#
# Streamlit flow: upload an image, caption it, expand the caption into a
# story, synthesize the story as audio, and offer a play button.

st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Persist the upload to disk so img2text can read it by file path.
    # NOTE(review): this writes into the working directory using the
    # client-supplied filename — consider a tempfile to avoid collisions
    # and overwrites.
    bytes_data = uploaded_file.getvalue()
    with open(uploaded_file.name, "wb") as file:
        file.write(bytes_data)
    st.image(uploaded_file, caption="Uploaded Image",
             use_column_width=True)

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    scenario = img2text(uploaded_file.name)
    st.write(scenario)

    # Stage 2: Text to Story
    st.text('Generating a story...')
    story = text2story(scenario)
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    audio_data = text2audio(story)

    # Play button — replays the synthesized waveform at its native rate.
    if st.button("Play Audio"):
        st.audio(audio_data['audio'],
                 format="audio/wav",
                 start_time=0,
                 sample_rate=audio_data['sampling_rate'])