import streamlit as st
from transformers import pipeline
from gtts import gTTS
from PIL import Image
import time  # NOTE(review): unused after dead-code removal; kept in case another chunk uses it


@st.cache(allow_output_mutation=True)
def load_model():
    """Load and cache the Hugging Face image-captioning pipeline.

    Cached with ``allow_output_mutation=True`` so the (mutable) pipeline
    object is built once per session instead of on every rerun.
    """
    return pipeline('image-to-text')


def main():
    """Streamlit entry point: caption an image, then speak the caption.

    Flow: take a picture with the webcam OR upload a file -> run the
    image-to-text pipeline -> display the caption -> synthesize it with
    gTTS and play the resulting MP3 in an audio widget.
    """
    caption = load_model()

    st.title("Welcome to image to speech app")
    instructions = """Click an image using inbuilt camera or upload an image file"""
    st.write(instructions)

    img = None
    pictureCam = st.camera_input("Take a picture")
    pictureUpload = st.file_uploader('Upload An Image')

    # Camera input wins when both sources are present.
    if pictureCam:
        st.write('clicked image from webcam')
        st.image(pictureCam)
        img = Image.open(pictureCam)
    elif pictureUpload:
        st.write('uploaded image from device')
        st.image(pictureUpload)
        img = Image.open(pictureUpload)

    if img is not None:
        description = caption(img)
        generated_text = description[0]['generated_text']
        st.write(generated_text)

        # Synthesize the caption to speech and persist it as an MP3.
        generated_audio = gTTS(generated_text)
        generated_audio.save('demo.mp3')

        # BUG FIX: the original used curly "smart" quotes (‘demo.mp3’,
        # ‘rb’, ‘audio/ogg’), which is a SyntaxError; it also leaked the
        # file handle and declared the wrong MIME type (ogg) for an MP3.
        with open('demo.mp3', 'rb') as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/mp3', start_time=0)


if __name__ == '__main__':
    main()