Spaces:
Sleeping
Sleeping
File size: 3,882 Bytes
90bef38 b038974 cd79461 118cd25 7c5a1e4 4e37056 7c5a1e4 b038974 118cd25 cd79461 4e37056 7c5a1e4 4e37056 7c5a1e4 4e37056 7c5a1e4 b038974 7c5a1e4 90bef38 7c5a1e4 90bef38 118cd25 7c5a1e4 b038974 118cd25 cd79461 4e37056 cd79461 b038974 cd79461 1fb1e8e cd79461 b038974 7c5a1e4 90bef38 7c5a1e4 4e37056 90bef38 4e37056 7c5a1e4 4e37056 7c5a1e4 4e37056 7c5a1e4 4e37056 5b9e396 4e37056 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import streamlit as st
from PIL import Image
import os
import tempfile
import sys
# function part
# img2text with a model that doesn't require sentencepiece
def img2text(image_path):
    """Generate a text caption for the image stored at ``image_path``.

    Args:
        image_path: Filesystem path of the image to caption.

    Returns:
        The ``generated_text`` of the first pipeline result, the string
        ``"No text detected"`` when the pipeline returns nothing, or a
        string of the form ``"Error: <message>"`` if anything fails (the
        error is also reported in the Streamlit UI).
    """
    try:
        # Imported lazily so a missing/broken transformers install is reported
        # through the UI by the handler below instead of at script start.
        from transformers import pipeline

        # Use the Salesforce model instead of Donut to avoid sentencepiece issues
        st.info("Using Salesforce/blip-image-captioning-base model for image-to-text")
        # NOTE: the pipeline is constructed on every call.
        image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

        # Image.open is lazy and keeps the underlying file open; the context
        # manager releases the handle as soon as the model has consumed the
        # image, so the caller can delete the file afterwards.
        with Image.open(image_path) as image:
            result = image_to_text_model(image)

        return result[0]["generated_text"] if result else "No text detected"
    except Exception as e:
        st.error(f"Error processing image: {str(e)}")
        return f"Error: {str(e)}"
# text2story
def text2story(text):
    """Wrap ``text`` in a fixed introductory sentence and return it.

    Placeholder implementation: no story is generated yet, the input is
    simply embedded after a constant prefix.
    """
    return f"Here's a story based on the text: {text}"
# text2audio using Google Text-to-Speech
def text2audio(story_text):
    """Synthesize ``story_text`` to an MP3 file using Google Text-to-Speech.

    Args:
        story_text: The text to be spoken (English).

    Returns:
        Path of a newly created temporary ``.mp3`` file, which the caller is
        responsible for deleting, or ``None`` if generation failed (the error
        is also reported in the Streamlit UI).
    """
    temp_audio_path = None
    try:
        from gtts import gTTS

        # Only reserve a unique file name here: delete=False keeps the file
        # after the handle is closed so gTTS can write to the path itself.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
            temp_audio_path = temp_audio.name

        # Initialize gTTS and write the audio to the reserved path
        tts = gTTS(text=story_text, lang='en', slow=False)
        tts.save(temp_audio_path)
        return temp_audio_path
    except Exception as e:
        # Do not leave an orphaned (empty or partial) temp file behind when
        # synthesis fails after the file has been created.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass
        st.error(f"Error generating audio: {str(e)}")
        return None
# main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
st.subheader("Image to Text to Audio Conversion")
uploaded_file = st.file_uploader(
    "Select an Image...",
    type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'],
)

if uploaded_file is not None:
    # Save the upload to a uniquely named temp file. The client-supplied file
    # name is untrusted, so only its extension is reused; this avoids path
    # traversal and collisions between uploads that share a name.
    image_suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as image_file:
        image_file.write(uploaded_file.getvalue())
        image_temp_path = image_file.name

    audio_file_path = None
    try:
        # Display the uploaded image
        st.image(uploaded_file, caption="Uploaded Image",
                 use_column_width=True)

        # Stage 1: Image to Text
        with st.spinner('Processing img2text...'):
            extracted_text = img2text(image_temp_path)
        st.subheader("Extracted Text:")
        st.write(extracted_text)

        # Stage 2: Text to Story
        with st.spinner('Generating a story...'):
            story = text2story(extracted_text)
        st.subheader("Generated Story:")
        st.write(story)

        # Stage 3: Story to Audio data
        with st.spinner('Generating audio data...'):
            audio_file_path = text2audio(story)
    finally:
        # Remove the temporary image even if a stage above raised.
        try:
            os.remove(image_temp_path)
        except OSError:
            pass

    # Load the audio into memory and delete the temp file right away. The
    # script is re-run on every widget interaction, so deferring the cleanup
    # to the button handler would leak one MP3 per run without a click.
    audio_bytes = None
    if audio_file_path and os.path.exists(audio_file_path):
        try:
            with open(audio_file_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
        except OSError:
            # Unreadable audio is handled by the placeholder path below.
            audio_bytes = None
        finally:
            try:
                os.remove(audio_file_path)
            except OSError:
                pass

    # Play button
    if st.button("Play Audio"):
        if audio_bytes is not None:
            # Play the generated audio
            st.audio(audio_bytes, format="audio/mp3")
        else:
            st.warning("Audio generation failed. Playing a placeholder audio.")
            try:
                st.audio("kids_playing_audio.wav")
            except FileNotFoundError:
                st.error("Placeholder audio file not found. Audio playback is unavailable.")