# koey811's picture
# Update app.py
# 9988137 verified
import streamlit as st
from PIL import Image
from transformers import pipeline
import io
import base64
# Page-level configuration (page title and wide layout); issued before any
# other Streamlit call in this script.
st.set_page_config(layout="wide", page_title="Image to Speech App")

# Main heading of the app.
st.title("Image to Text to Speech Converter")
# Initialize the pipelines
@st.cache_resource
def load_models():
    """Build the image-captioning and speech-synthesis pipelines.

    Wrapped in ``st.cache_resource`` so the pipelines are shared through
    Streamlit's resource cache instead of being rebuilt by every call.

    Returns:
        A ``(image_to_text, text_to_speech)`` tuple of pipeline objects,
        in that order.
    """
    captioner = pipeline(
        task="image-to-text",
        model="Salesforce/blip-image-captioning-base",
    )
    synthesizer = pipeline(
        task="text-to-speech",
        model="microsoft/speecht5_tts",
    )
    return captioner, synthesizer
# Obtain the two pipelines (load_models is wrapped in st.cache_resource).
image_to_text, text_to_speech = load_models()

# Image upload
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Run the image-to-text pipeline; the text of the first result is used.
    with st.spinner("Extracting text from image..."):
        result = image_to_text(image)
        extracted_text = result[0]["generated_text"]

    st.write("Extracted Text:")
    st.write(extracted_text)

    # Generate speech from text
    if st.button("Convert to Speech"):
        with st.spinner("Generating speech..."):
            # NOTE(review): some transformers releases appear to require
            # speaker embeddings for SpeechT5 (passed through
            # forward_params) -- confirm against the installed version.
            speech = text_to_speech(extracted_text)

        # The pipeline yields raw audio samples plus their sampling rate,
        # not an encoded WAV file. Dumping the sample buffer with bytes()
        # and labelling it "audio/wav" produced an unplayable stream (no
        # RIFF header, sampling rate lost). st.audio takes the samples and
        # the rate directly and handles the encoding itself.
        st.audio(speech["audio"], sample_rate=speech["sampling_rate"])
# Usage instructions, rendered inside an expander labelled "How to use".
st.expander("How to use").write("""
1. Upload an image containing text using the file uploader
2. Wait for the text to be extracted from the image
3. Click 'Convert to Speech' to generate and play the audio
""")