import streamlit as st
from transformers import pipeline
from gtts import gTTS
from PIL import Image
import time  # NOTE(review): unused after dead-code removal; kept in case another chunk uses it


@st.cache(allow_output_mutation=True)
def load_model():
    """Load and cache the Hugging Face image-captioning pipeline.

    Cached with ``allow_output_mutation=True`` so the (mutable) pipeline
    object is built once per session instead of on every rerun.
    """
    return pipeline('image-to-text')


def main():
    """Streamlit entry point: caption an image, then speak the caption.

    Flow: take a picture with the webcam OR upload a file -> run the
    image-to-text pipeline -> display the caption -> synthesize it with
    gTTS and play the resulting MP3 in an audio widget.
    """
    caption = load_model()

    st.title("Welcome to image to speech app")
    instructions = """Click an image using inbuilt camera or upload an image file"""
    st.write(instructions)

    img = None
    pictureCam = st.camera_input("Take a picture")
    pictureUpload = st.file_uploader('Upload An Image')

    # Camera input wins when both sources are present.
    if pictureCam:
        st.write('clicked image from webcam')
        st.image(pictureCam)
        img = Image.open(pictureCam)
    elif pictureUpload:
        st.write('uploaded image from device')
        st.image(pictureUpload)
        img = Image.open(pictureUpload)

    if img is not None:
        description = caption(img)
        generated_text = description[0]['generated_text']
        st.write(generated_text)

        # Synthesize the caption to speech and persist it as an MP3.
        generated_audio = gTTS(generated_text)
        generated_audio.save('demo.mp3')

        # BUG FIX: the original used curly "smart" quotes (‘demo.mp3’,
        # ‘rb’, ‘audio/ogg’), which is a SyntaxError; it also leaked the
        # file handle and declared the wrong MIME type (ogg) for an MP3.
        with open('demo.mp3', 'rb') as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/mp3', start_time=0)


if __name__ == '__main__':
    main()