Spaces:
Sleeping
Sleeping
File size: 3,275 Bytes
90bef38 5b9e396 90bef38 b038974 e1ee436 7c5a1e4 b038974 7c5a1e4 b038974 7c5a1e4 90bef38 7c5a1e4 90bef38 7c5a1e4 b038974 7c5a1e4 b038974 7c5a1e4 1fb1e8e 7c5a1e4 b038974 7c5a1e4 90bef38 7c5a1e4 90bef38 7c5a1e4 90bef38 7c5a1e4 5b9e396 7c5a1e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import os
import tempfile

import streamlit as st
from PIL import Image
from transformers import pipeline
# function part
# img2text
@st.cache_resource(show_spinner=False)
def _load_image_to_text_model():
    """Build the Donut image-to-text pipeline once and reuse it across reruns."""
    return pipeline("image-to-text", model="naver-clova-ix/donut-base")


def img2text(image_path):
    """Extract text from the image stored at ``image_path``.

    Args:
        image_path: Location of the image file, passed to ``PIL.Image.open``.

    Returns:
        The ``generated_text`` of the first pipeline result,
        ``"No text detected"`` when the pipeline returns nothing, or
        ``"Error: <message>"`` when any step fails (the failure is also
        reported to the page through ``st.error``).
    """
    try:
        # Loading the model is by far the slowest step; the cached loader
        # avoids repeating it every time Streamlit reruns the script.
        image_to_text_model = _load_image_to_text_model()
        # Release the file handle as soon as inference is done: the caller
        # deletes this file right after, which fails on some platforms while
        # the handle is still open.
        with Image.open(image_path) as image:
            result = image_to_text_model(image)
        # Keep only the text of the first candidate returned by the pipeline.
        text = result[0]["generated_text"] if result else "No text detected"
        return text
    except Exception as e:
        # Best-effort UI boundary: show the problem and hand back a string so
        # the remaining stages of the page can still render.
        st.error(f"Error processing image: {str(e)}")
        return f"Error: {str(e)}"
# text2story
def text2story(text):
    """Wrap the extracted text in a fixed introductory sentence.

    Placeholder for real story generation: the input is only embedded in a
    template, so the "story" is the extracted text with a short prefix.

    Args:
        text: Text extracted from the image.

    Returns:
        The template sentence followed by ``text``.
    """
    return f"Here's a story based on the text: {text}"
# text2audio
@st.cache_resource(show_spinner=False)
def _load_text_to_speech_model():
    """Build the text-to-speech pipeline once and reuse it across reruns."""
    # NOTE(review): this checkpoint is published in ESPnet format; confirm
    # that the transformers "text-to-speech" pipeline can actually load it.
    return pipeline("text-to-speech", model="espnet/kan-bayashi_ljspeech_vits")


def text2audio(story_text):
    """Synthesize speech for ``story_text``.

    Args:
        story_text: The text to read aloud.

    Returns:
        Whatever the text-to-speech pipeline returns (the caller reads its
        ``'audio'`` and ``'sampling_rate'`` entries), or ``None`` when model
        loading or synthesis fails (the failure is also reported to the page
        through ``st.error``).
    """
    try:
        # Loading the model dominates the runtime; the cached loader avoids
        # repeating it every time Streamlit reruns the script.
        tts_model = _load_text_to_speech_model()
        audio_data = tts_model(story_text)
        return audio_data
    except Exception as e:
        # Best-effort UI boundary: report the problem and let the caller fall
        # back to its placeholder audio.
        st.error(f"Error generating audio: {str(e)}")
        return None
# main part
st.set_page_config(page_title="Your Image to Audio Story",
                   page_icon="🦜")
st.header("Turn Your Image to Audio Story")
st.subheader("Using Donut model for text extraction")
uploaded_file = st.file_uploader(
    "Select an Image...",
    type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'],
)

if uploaded_file is not None:
    # Store the upload under a unique temporary path instead of the
    # user-supplied filename: writing to that name in the working directory
    # could overwrite files of the app and would collide when two sessions
    # upload files with the same name.
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_path = temp_file.name

    try:
        # Display the uploaded image
        st.image(uploaded_file, caption="Uploaded Image",
                 use_column_width=True)

        # Stage 1: Image to Text
        with st.spinner('Processing img2text...'):
            extracted_text = img2text(temp_path)
        st.subheader("Extracted Text:")
        st.write(extracted_text)

        # Stage 2: Text to Story
        with st.spinner('Generating a story...'):
            story = text2story(extracted_text)
        st.subheader("Generated Story:")
        st.write(story)

        # Stage 3: Story to Audio data
        with st.spinner('Generating audio data...'):
            audio_data = text2audio(story)
    finally:
        # Always remove the temporary file, even when a stage raises.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    # Play button
    # NOTE: clicking a Streamlit button reruns the script, so the stages above
    # execute again before the audio is rendered.
    if st.button("Play Audio"):
        if audio_data:
            st.audio(audio_data['audio'],
                     format="audio/wav",
                     start_time=0,
                     sample_rate=audio_data['sampling_rate'])
        else:
            st.warning("Audio generation failed. Playing a placeholder audio.")
            try:
                st.audio("kids_playing_audio.wav")
            except FileNotFoundError:
                st.error("Placeholder audio file not found. Audio playback is unavailable.")