# ImageToSpeech / app.py
# rishikesh's picture
# Update app.py
# 2f6f14e
import streamlit as st
from transformers import pipeline
from gtts import gTTS
from PIL import Image
import time
from playsound import playsound
# Pick the caching decorator once. ``st.cache_resource`` is the current API for
# sharing unhashable, process-wide objects such as ML models; the legacy
# ``st.cache(allow_output_mutation=True)`` is kept as a fallback so the app
# still runs on Streamlit versions that predate ``cache_resource``.
_cache_model = (
    st.cache_resource
    if hasattr(st, "cache_resource")
    else st.cache(allow_output_mutation=True)
)


@_cache_model
def load_model():
    """Build the image-captioning pipeline and cache it across reruns.

    The pipeline is created with ``pipeline('image-to-text')`` and no explicit
    model name, so the library's default model for that task is used.

    Returns:
        The ``transformers`` pipeline object; calling it with an image yields
        the caption data consumed by ``main``.
    """
    return pipeline('image-to-text')
def main():
    """Render the page: caption a captured or uploaded image and read it aloud.

    Flow:
        1. Load the (cached) captioning pipeline.
        2. Show a camera widget and a file uploader; the camera picture takes
           precedence when both are provided.
        3. Run the pipeline on the chosen image and display the caption.
        4. Synthesize the caption with gTTS and embed an audio player.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # Local import so this block does not depend on edits to the import header.
    import io

    captioner = load_model()

    st.title('Welcome to image to speech app')
    instructions = """Click an image using inbuilt camera or upload an image file"""
    st.write(instructions)

    # Both widgets are always rendered; each returns a falsy value until the
    # user has supplied a picture through it.
    picture_cam = st.camera_input('Take a picture')
    picture_upload = st.file_uploader('Upload An Image')

    if picture_cam:
        st.write('clicked image from webcam')
        picture = picture_cam
    elif picture_upload:
        st.write('uploaded image from device')
        picture = picture_upload
    else:
        # Nothing to caption yet; wait for the next rerun.
        return

    st.image(picture)
    img = Image.open(picture)

    description = captioner(img)
    generated_text = description[0]['generated_text']
    st.write(generated_text)

    # gTTS cannot synthesize empty text, so stop with a visible notice
    # instead of letting the page crash.
    if not generated_text.strip():
        st.warning('No caption was generated for this image.')
        return

    # Synthesize into memory rather than a fixed 'demo.mp3' on disk: this
    # avoids leaking an unclosed file handle on every rerun and prevents
    # concurrent sessions from overwriting each other's audio file.
    audio_buffer = io.BytesIO()
    gTTS(generated_text).write_to_fp(audio_buffer)

    # gTTS produces MP3 data, so advertise the matching MIME type.
    st.audio(audio_buffer.getvalue(), format='audio/mp3', start_time=0)
# Script entry point: start the app only when this file is executed directly
# (e.g. via ``streamlit run app.py``), not when it is imported.
if __name__ == '__main__':
    main()