# ImageToSpeech / app.py
# rishikesh's picture
# Update app.py
# f090191
# raw
# history blame
# 2.03 kB
import streamlit as st
from transformers import pipeline
from gtts import gTTS
from PIL import Image
import time
#from playsound import playsound
#from pydub import AudioSegment
#from preferredsoundplayer import soundplay
#@st.cache()
@st.cache(allow_output_mutation=True)
def load_model():
    """Build and return the default ``image-to-text`` pipeline.

    The function is wrapped in ``st.cache`` so the pipeline object is
    reused instead of being rebuilt each time the script runs.

    NOTE(review): newer Streamlit releases deprecate ``st.cache`` in favour
    of ``st.cache_resource`` -- confirm the deployed version before changing.
    """
    return pipeline('image-to-text')
def main():
    """Render the page: caption an image and play the caption as speech.

    An image is taken from the camera widget or, if no photo was taken,
    from the file uploader. The caption returned by the image-to-text
    pipeline is written to the page, converted to speech with gTTS, saved
    to ``demo.mp3`` and played through an audio widget. Nothing is
    generated until an image has been supplied.
    """
    caption = load_model()
    st.title("Welcome to image to speech app")
    instructions = (
        "Click an image using inbuilt camera\n"
        "or upload an image file"
    )
    st.write(instructions)

    # Both widgets are always rendered; the camera shot wins if both are set.
    camera_picture = st.camera_input("Take a picture")
    uploaded_picture = st.file_uploader('Upload An Image')

    img = None
    if camera_picture:
        st.write('clicked image from webcam')
        st.image(camera_picture)
        img = Image.open(camera_picture)
    elif uploaded_picture:
        st.write('uploaded image from device')
        st.image(uploaded_picture)
        img = Image.open(uploaded_picture)

    if img is None:
        return

    description = caption(img)
    generated_text = description[0]['generated_text']
    st.write(generated_text)

    # NOTE(review): 'demo.mp3' is a fixed path, so concurrent sessions would
    # overwrite each other's audio -- consider an in-memory buffer instead.
    generated_audio = gTTS(generated_text)
    generated_audio.save('demo.mp3')

    # Read the file back inside a context manager so the handle is closed.
    with open('demo.mp3', 'rb') as audio_file:
        audio_bytes = audio_file.read()

    # gTTS produces MP3 data, so declare it as audio/mpeg rather than ogg.
    st.audio(audio_bytes, format='audio/mpeg', start_time=0)
#html_string = """
#<audio controls autoplay>
# <source src="demo.mp3" type="audio/mp3">
#</audio>
#"""
#sound = st.empty()
#sound.markdown(html_string, unsafe_allow_html=True) # will display a st.audio with the sound you specified in the "src" of the html_string and autoplay it
#time.sleep(2) # wait for 2 seconds to finish the playing of the audio
#sound.empty() # optionally delete the element afterwards
#sound = AudioSegment.from_mp3("demo.mp3")
#sound.export("demo.wav", format="wav")
#soundplay("demo.mp3")
#playsound('demo.mp3')
#audio_file = open('demo.wav', 'rb')
#audio_bytes = audio_file.read()
#st.audio(audio_bytes, format='audio/wav')
# Launch the app only when this file is executed as a script.
if __name__ == '__main__':
    main()